## Example (Bag of Words II)

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Build a small toy corpus: one short sentence per document.
text = [
    'The car crashed long ago.',
    'The car has rusted.',
    'A rusted car is unsafe.',
    'Spare car parts are needed urgently.',
]
# One row per document, held in a single 'document' column.
corpus = pd.DataFrame({'document': text})
corpus

Unnamed: 0,document
0,The car crashed long ago.
1,The car has rusted.
2,A rusted car is unsafe.
3,Spare car parts are needed urgently.


In [3]:
# Underlying NumPy array of the frame: 2-D, one row per document and one column.
corpus.values

array([['The car crashed long ago.'],
       ['The car has rusted.'],
       ['A rusted car is unsafe.'],
       ['Spare car parts are needed urgently.']], dtype=object)

In [4]:
# Collapse the (4, 1) array into a 1-D array of document strings.
corpus.values.flatten()

array(['The car crashed long ago.', 'The car has rusted.',
       'A rusted car is unsafe.', 'Spare car parts are needed urgently.'],
      dtype=object)

In [5]:
# Bag-of-words vectorizer: tokenise on words, drop English stop words, and
# count both unigrams (single words) and bigrams (two consecutive words that
# remain after stop-word removal, e.g. 'car rusted').
Vectorizer = CountVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),  # (min_n, max_n): all 1-grams and all 2-grams
)

In [6]:
# Learn the vocabulary from the documents and count every term per document.
documents = corpus.values.flatten()
word_counts = Vectorizer.fit_transform(documents)
# Printing the sparse result lists "(document, feature index)  count" entries.
print(word_counts)

  (0, 1)	1
  (0, 6)	1
  (0, 8)	1
  (0, 0)	1
  (0, 2)	1
  (0, 7)	1
  (0, 9)	1
  (1, 1)	1
  (1, 14)	1
  (1, 4)	1
  (2, 1)	1
  (2, 14)	1
  (2, 18)	1
  (2, 15)	1
  (2, 5)	1
  (3, 1)	1
  (3, 16)	1
  (3, 12)	1
  (3, 10)	1
  (3, 19)	1
  (3, 17)	1
  (3, 3)	1
  (3, 13)	1
  (3, 11)	1


In [7]:
# (number of documents, number of distinct unigram/bigram features)
word_counts.shape

(4, 20)

In [8]:
# Names of the features (unigrams and bigrams) learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on old installations.
# get_feature_names_out() returns a NumPy array; .tolist() keeps `words` a
# plain list of str, exactly as the old method returned.
if hasattr(Vectorizer, 'get_feature_names_out'):
    words = Vectorizer.get_feature_names_out().tolist()
else:
    words = Vectorizer.get_feature_names()
words

['ago',
 'car',
 'car crashed',
 'car parts',
 'car rusted',
 'car unsafe',
 'crashed',
 'crashed long',
 'long',
 'long ago',
 'needed',
 'needed urgently',
 'parts',
 'parts needed',
 'rusted',
 'rusted car',
 'spare',
 'spare car',
 'unsafe',
 'urgently']

In [9]:
# Wrap the feature names in a Series so each name is shown next to its integer position.
# Note: this rebinds `words` from a list to a pandas Series.
words = pd.Series(words)
words

0                 ago
1                 car
2         car crashed
3           car parts
4          car rusted
5          car unsafe
6             crashed
7        crashed long
8                long
9            long ago
10             needed
11    needed urgently
12              parts
13       parts needed
14             rusted
15         rusted car
16              spare
17          spare car
18             unsafe
19           urgently
dtype: object

In [10]:
# Look up the feature name stored under label 5.
words[5]

'car unsafe'

In [11]:
# Total occurrences of every feature across the whole corpus (column sums).
column_totals = word_counts.sum(axis=0)
# Convert the summed result to a plain 1-D NumPy array.
word_counts_total = np.asarray(column_totals).flatten()
word_counts_total

array([1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1],
      dtype=int64)

In [12]:
# Label each total with its feature name, then show the most frequent features first.
# Note: this rebinds `word_counts_total` from a NumPy array to a pandas Series.
word_counts_total = pd.Series(word_counts_total, index = words)
word_counts_total.sort_values(ascending=False)

car                4
rusted             2
urgently           1
long               1
car crashed        1
car parts          1
car rusted         1
car unsafe         1
crashed            1
crashed long       1
long ago           1
unsafe             1
needed             1
needed urgently    1
parts              1
parts needed       1
rusted car         1
spare              1
spare car          1
ago                1
dtype: int64

In [13]:
# Sparse view again for comparison with the dense array shown next:
# each entry reads "(document, feature index)  count".
print(word_counts)

  (0, 1)	1
  (0, 6)	1
  (0, 8)	1
  (0, 0)	1
  (0, 2)	1
  (0, 7)	1
  (0, 9)	1
  (1, 1)	1
  (1, 14)	1
  (1, 4)	1
  (2, 1)	1
  (2, 14)	1
  (2, 18)	1
  (2, 15)	1
  (2, 5)	1
  (3, 1)	1
  (3, 16)	1
  (3, 12)	1
  (3, 10)	1
  (3, 19)	1
  (3, 17)	1
  (3, 3)	1
  (3, 13)	1
  (3, 11)	1


In [14]:
# Dense version of the same counts: one row per document, one column per feature.
word_counts.toarray()

array([[1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1]],
      dtype=int64)

In [15]:
# Document-term matrix as a DataFrame, with the feature names as column labels.
df = pd.DataFrame(word_counts.toarray(), columns = words)
df

Unnamed: 0,ago,car,car crashed,car parts,car rusted,car unsafe,crashed,crashed long,long,long ago,needed,needed urgently,parts,parts needed,rusted,rusted car,spare,spare car,unsafe,urgently
0,1,1,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0
3,0,1,0,1,0,0,0,0,0,0,1,1,1,1,0,0,1,1,0,1


In [16]:
#should match L23's Table
# Turn raw counts into per-document relative frequencies: divide every row
# by its own total so that each row sums to 1.
row_totals = df.sum(axis=1)
df = df.div(row_totals, axis=0)
df.round(2)

Unnamed: 0,ago,car,car crashed,car parts,car rusted,car unsafe,crashed,crashed long,long,long ago,needed,needed urgently,parts,parts needed,rusted,rusted car,spare,spare car,unsafe,urgently
0,0.14,0.14,0.14,0.0,0.0,0.0,0.14,0.14,0.14,0.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.33,0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0
2,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.2,0.0
3,0.0,0.11,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.11,0.11,0.11,0.0,0.0,0.11,0.11,0.0,0.11


In [17]:
# Sanity check: after normalisation every row should sum to 1.
df.sum(axis=1)

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float64

In [18]:
# Pairwise correlation between documents: transpose so that each document
# becomes a column, since corr() compares columns.
df.T.corr().round(2) 
# Documents 1 and 2 have the highest off-diagonal correlation (0.40).
# Every document is perfectly correlated with itself (diagonal = 1.0).

Unnamed: 0,0,1,2,3
0,1.0,-0.01,-0.18,-0.45
1,-0.01,1.0,0.4,-0.1
2,-0.18,0.4,1.0,-0.29
3,-0.45,-0.1,-0.29,1.0


In [19]:
#g
