**BoW**

In [1]:
# Build the example corpus: three short sentences about Linux.
corpus = [
    "Linux has been around since the mid-1990s.",
    "Linux distributions include the Linux kernel",
    "Linux is one of the most prominent open-source software.",
]
corpus

['Linux has been around since the mid-1990s.',
 'Linux distributions include the Linux kernel',
 'Linux is one of the most prominent open-source software.']

In [2]:
# Apply Bag-of-Words (BoW) to the corpus with CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer() # build the vectorizer object
vectorized_X = vectorizer.fit_transform(corpus).todense()
# todense() turns the fit_transform result into a dense 2-D numpy matrix
# (one row per document, one column per vocabulary term).
vectorized_X

matrix([[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1],
        [0, 0, 0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1]],
       dtype=int64)

In [3]:
# Show the feature names (the vocabulary terms, in column order).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. tolist() keeps the plain-list display.
vectorizer.get_feature_names_out().tolist()



['1990s',
 'around',
 'been',
 'distributions',
 'has',
 'include',
 'is',
 'kernel',
 'linux',
 'mid',
 'most',
 'of',
 'one',
 'open',
 'prominent',
 'since',
 'software',
 'source',
 'the']

In [4]:
# Present the BoW counts as a DataFrame: one row per sentence ("kalimat-N"),
# one column per vocabulary term.
import pandas as pd

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the same terms.
df = pd.DataFrame(vectorized_X,
                  index=[f'kalimat-{i+1}' for i in range(len(corpus))],
                  columns=vectorizer.get_feature_names_out())

In [5]:
# Display the BoW count table (rows: sentences, columns: vocabulary terms).
df

Unnamed: 0,1990s,around,been,distributions,has,include,is,kernel,linux,mid,most,of,one,open,prominent,since,software,source,the
kalimat-1,1,1,1,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1
kalimat-2,0,0,0,1,0,1,0,1,2,0,0,0,0,0,0,0,0,0,1
kalimat-3,0,0,0,0,0,0,1,0,1,0,1,1,1,1,1,0,1,1,1


In [6]:
# Example use of the BoW result: Euclidean distance between every pair of documents.
from itertools import combinations

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# vectorized_X comes from todense() and is a numpy.matrix, which scikit-learn >= 1.2
# rejects with a TypeError. Convert it to a plain ndarray before calling the metric.
bow_array = np.asarray(vectorized_X)

# combinations() yields each unordered pair (i < j) exactly once, so there is no
# self-comparison to skip.
for i, j in combinations(range(len(bow_array)), 2):
    # Slicing with i:i + 1 keeps each row 2-D, as euclidean_distances requires;
    # the result is a 1x1 array, so the printed format is unchanged.
    jarak = euclidean_distances(bow_array[i:i + 1], bow_array[j:j + 1])
    print(f' Jarak dokumen {i+1} dan {j+1} : {jarak}')

 Jarak dokumen 1 dan 2 : [[3.16227766]]
 Jarak dokumen 1 dan 3 : [[3.74165739]]
 Jarak dokumen 2 dan 3 : [[3.46410162]]




**Tf-Idf**

In [7]:
# Example corpus for TF-IDF: five short sentences about a mouse.
corpus = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]
corpus

['the house had a tiny little mouse',
 'the cat saw the mouse',
 'the mouse ran away from the house',
 'the cat finally ate the mouse',
 'the end of the mouse story']

In [8]:
# Compute TF-IDF weights for the corpus with TfidfVectorizer.
# Import the TfidfVectorizer class from scikit-learn.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english') # create the vectorizer; English stop words are dropped
response = vectorizer.fit_transform(corpus)
# Printing the result lists each non-zero entry as "(document, term index)  weight".
print(response)

  (0, 7)	0.2808823162882302
  (0, 6)	0.5894630806320427
  (0, 11)	0.5894630806320427
  (0, 5)	0.47557510189256375
  (1, 9)	0.7297183669435993
  (1, 2)	0.5887321837696324
  (1, 7)	0.3477147117091919
  (2, 1)	0.5894630806320427
  (2, 8)	0.5894630806320427
  (2, 7)	0.2808823162882302
  (2, 5)	0.47557510189256375
  (3, 0)	0.5894630806320427
  (3, 4)	0.5894630806320427
  (3, 2)	0.47557510189256375
  (3, 7)	0.2808823162882302
  (4, 10)	0.6700917930430479
  (4, 3)	0.6700917930430479
  (4, 7)	0.3193023297639811


In [9]:
# Show the feature names (the vocabulary terms, in column order).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. tolist() keeps the plain-list display.
vectorizer.get_feature_names_out().tolist()



['ate',
 'away',
 'cat',
 'end',
 'finally',
 'house',
 'little',
 'mouse',
 'ran',
 'saw',
 'story',
 'tiny']

In [10]:
# Dense 2-D view of the TF-IDF result: todense() returns a numpy matrix with
# one row per document and one column per term.
response.todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.4755751 , 0.58946308, 0.28088232, 0.        , 0.        ,
         0.        , 0.58946308],
        [0.        , 0.        , 0.58873218, 0.        , 0.        ,
         0.        , 0.        , 0.34771471, 0.        , 0.72971837,
         0.        , 0.        ],
        [0.        , 0.58946308, 0.        , 0.        , 0.        ,
         0.4755751 , 0.        , 0.28088232, 0.58946308, 0.        ,
         0.        , 0.        ],
        [0.58946308, 0.        , 0.4755751 , 0.        , 0.58946308,
         0.        , 0.        , 0.28088232, 0.        , 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.67009179, 0.        ,
         0.        , 0.        , 0.31930233, 0.        , 0.        ,
         0.67009179, 0.        ]])

In [11]:
# Final TF-IDF result as a DataFrame: one row per term, one column per document (D1..Dn).
import pandas as pd

# toarray() yields a plain ndarray instead of the numpy.matrix that todense() returns;
# the transpose puts terms on the rows.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the same terms.
df = pd.DataFrame(response.toarray().T,
                  index=vectorizer.get_feature_names_out(),
                  columns=[f'D{i+1}' for i in range(len(corpus))])
df

Unnamed: 0,D1,D2,D3,D4,D5
ate,0.0,0.0,0.0,0.589463,0.0
away,0.0,0.0,0.589463,0.0,0.0
cat,0.0,0.588732,0.0,0.475575,0.0
end,0.0,0.0,0.0,0.0,0.670092
finally,0.0,0.0,0.0,0.589463,0.0
house,0.475575,0.0,0.475575,0.0,0.0
little,0.589463,0.0,0.0,0.0,0.0
mouse,0.280882,0.347715,0.280882,0.280882,0.319302
ran,0.0,0.0,0.589463,0.0,0.0
saw,0.0,0.729718,0.0,0.0,0.0


**Post Test**

**BoW**

In [12]:
# Load the corpus from a CSV file.
file = "corpus1.csv"
data = pd.read_csv(file)

# Take the "corpus" column as the documents to vectorize.
corpus = data["corpus"]

# Preview the first rows to check the file was read as expected.
data.head()

Unnamed: 0,corpus
0,Linux has been around since the mid-1990s.
1,Linux distributions include the Linux kernel
2,Linux is one of the most prominent open-source...


In [13]:
# Apply Bag-of-Words (BoW) to the corpus with CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer() # build the vectorizer object
vectorized_X = vectorizer.fit_transform(corpus).todense()
# todense() turns the fit_transform result into a dense 2-D numpy matrix
# (one row per document, one column per vocabulary term).
vectorized_X

matrix([[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1],
        [0, 0, 0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1]],
       dtype=int64)

In [14]:
# Show the feature names (the vocabulary terms, in column order).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. tolist() keeps the plain-list display.
vectorizer.get_feature_names_out().tolist()



['1990s',
 'around',
 'been',
 'distributions',
 'has',
 'include',
 'is',
 'kernel',
 'linux',
 'mid',
 'most',
 'of',
 'one',
 'open',
 'prominent',
 'since',
 'software',
 'source',
 'the']

In [15]:
# Present the BoW counts as a DataFrame: one row per sentence ("kalimat-N"),
# one column per vocabulary term.
import pandas as pd

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the same terms.
df = pd.DataFrame(vectorized_X,
                  index=[f'kalimat-{i+1}' for i in range(len(corpus))],
                  columns=vectorizer.get_feature_names_out())

In [16]:
# Display the BoW count table (rows: sentences, columns: vocabulary terms).
df

Unnamed: 0,1990s,around,been,distributions,has,include,is,kernel,linux,mid,most,of,one,open,prominent,since,software,source,the
kalimat-1,1,1,1,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1
kalimat-2,0,0,0,1,0,1,0,1,2,0,0,0,0,0,0,0,0,0,1
kalimat-3,0,0,0,0,0,0,1,0,1,0,1,1,1,1,1,0,1,1,1


In [17]:
# Example use of the BoW result: Euclidean distance between every pair of documents.
from itertools import combinations

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# vectorized_X comes from todense() and is a numpy.matrix, which scikit-learn >= 1.2
# rejects with a TypeError. Convert it to a plain ndarray before calling the metric.
bow_array = np.asarray(vectorized_X)

# combinations() yields each unordered pair (i < j) exactly once, so there is no
# self-comparison to skip.
for i, j in combinations(range(len(bow_array)), 2):
    # Slicing with i:i + 1 keeps each row 2-D, as euclidean_distances requires;
    # the result is a 1x1 array, so the printed format is unchanged.
    jarak = euclidean_distances(bow_array[i:i + 1], bow_array[j:j + 1])
    print(f' Jarak dokumen {i+1} dan {j+1} : {jarak}')

 Jarak dokumen 1 dan 2 : [[3.16227766]]
 Jarak dokumen 1 dan 3 : [[3.74165739]]
 Jarak dokumen 2 dan 3 : [[3.46410162]]




**Tf-Idf**

In [18]:
# Load the corpus from a CSV file.
file = "corpus2.csv"
data = pd.read_csv(file)

# Take the "corpus" column as the documents to vectorize.
corpus = data["corpus"]

# Preview the first rows to check the file was read as expected.
data.head()

Unnamed: 0,corpus
0,the house had a tiny little mouse
1,the cat saw the mouse
2,the mouse ran away from the house
3,the cat finally ate the mouse
4,the end of the mouse story


In [19]:
# Compute TF-IDF weights for the corpus with TfidfVectorizer.
# Import the TfidfVectorizer class from scikit-learn.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english') # create the vectorizer; English stop words are dropped
response = vectorizer.fit_transform(corpus)
# Printing the result lists each non-zero entry as "(document, term index)  weight".
print(response)

  (0, 7)	0.2808823162882302
  (0, 6)	0.5894630806320427
  (0, 11)	0.5894630806320427
  (0, 5)	0.47557510189256375
  (1, 9)	0.7297183669435993
  (1, 2)	0.5887321837696324
  (1, 7)	0.3477147117091919
  (2, 1)	0.5894630806320427
  (2, 8)	0.5894630806320427
  (2, 7)	0.2808823162882302
  (2, 5)	0.47557510189256375
  (3, 0)	0.5894630806320427
  (3, 4)	0.5894630806320427
  (3, 2)	0.47557510189256375
  (3, 7)	0.2808823162882302
  (4, 10)	0.6700917930430479
  (4, 3)	0.6700917930430479
  (4, 7)	0.3193023297639811


In [20]:
# Show the feature names (the vocabulary terms, in column order).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. tolist() keeps the plain-list display.
vectorizer.get_feature_names_out().tolist()



['ate',
 'away',
 'cat',
 'end',
 'finally',
 'house',
 'little',
 'mouse',
 'ran',
 'saw',
 'story',
 'tiny']

In [21]:
# Dense 2-D view of the TF-IDF result: todense() returns a numpy matrix with
# one row per document and one column per term.
response.todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.4755751 , 0.58946308, 0.28088232, 0.        , 0.        ,
         0.        , 0.58946308],
        [0.        , 0.        , 0.58873218, 0.        , 0.        ,
         0.        , 0.        , 0.34771471, 0.        , 0.72971837,
         0.        , 0.        ],
        [0.        , 0.58946308, 0.        , 0.        , 0.        ,
         0.4755751 , 0.        , 0.28088232, 0.58946308, 0.        ,
         0.        , 0.        ],
        [0.58946308, 0.        , 0.4755751 , 0.        , 0.58946308,
         0.        , 0.        , 0.28088232, 0.        , 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.67009179, 0.        ,
         0.        , 0.        , 0.31930233, 0.        , 0.        ,
         0.67009179, 0.        ]])

In [22]:
# Final TF-IDF result as a DataFrame: one row per term, one column per document (D1..Dn).
import pandas as pd

# toarray() yields a plain ndarray instead of the numpy.matrix that todense() returns;
# the transpose puts terms on the rows.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the same terms.
df = pd.DataFrame(response.toarray().T,
                  index=vectorizer.get_feature_names_out(),
                  columns=[f'D{i+1}' for i in range(len(corpus))])
df

Unnamed: 0,D1,D2,D3,D4,D5
ate,0.0,0.0,0.0,0.589463,0.0
away,0.0,0.0,0.589463,0.0,0.0
cat,0.0,0.588732,0.0,0.475575,0.0
end,0.0,0.0,0.0,0.0,0.670092
finally,0.0,0.0,0.0,0.589463,0.0
house,0.475575,0.0,0.475575,0.0,0.0
little,0.589463,0.0,0.0,0.0,0.0
mouse,0.280882,0.347715,0.280882,0.280882,0.319302
ran,0.0,0.0,0.589463,0.0,0.0
saw,0.0,0.729718,0.0,0.0,0.0
