In [43]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Toy corpus of three short documents. The second document repeats the
# words of the first three times each; the third shares no words with
# the other two.
text = [
    "wookie stormtrooper",
    "wookie wookie wookie stormtrooper stormtrooper stormtrooper",
    "harry potter",
]

## Pairs Solutions

### Part 1

In [45]:
# Learn the unigram vocabulary from the corpus, then encode each document
# as a row of term counts. `.todense()` turns the sparse result into a
# dense matrix so it can be printed and inspected directly.
vect = CountVectorizer().fit(text)
data = vect.transform(text).todense()

In [47]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` keeps the printed vocabulary in plain ['a', 'b', ...] list form.
print(vect.get_feature_names_out().tolist())
# One row per document, one column per vocabulary term (same order as above).
print(data)

['harry', 'potter', 'stormtrooper', 'wookie']
[[0 0 1 1]
 [0 0 3 3]
 [1 1 0 0]]


In [50]:
# Euclidean distance between every pair of document count vectors.
# `data` comes from `.todense()` and is therefore an np.matrix, which recent
# scikit-learn releases refuse to validate (TypeError). np.asarray yields a
# plain ndarray holding the same values and is harmless on older releases.
print(pairwise_distances(np.asarray(data), metric='euclidean'))

[[ 0.          2.82842712  2.        ]
 [ 2.82842712  0.          4.47213595]
 [ 2.          4.47213595  0.        ]]


In [51]:
# Cosine distance (1 - cosine similarity) between every pair of documents,
# rounded to two decimals for display.
# `data` comes from `.todense()` and is therefore an np.matrix, which recent
# scikit-learn releases refuse to validate (TypeError); np.asarray converts it
# to a plain ndarray first.
print(pairwise_distances(np.asarray(data), metric='cosine').round(2))

[[ 0.  0.  1.]
 [ 0.  0.  1.]
 [ 1.  1.  0.]]


In [52]:
# Pearson correlation between documents: with rowvar=True (NumPy's default,
# spelled out here) each row of `data` is treated as one variable.
np.corrcoef(data, rowvar=True)

array([[ 1.,  1., -1.],
       [ 1.,  1., -1.],
       [-1., -1.,  1.]])

In [53]:
# Cosine similarity of the row-centred count vectors.
# `data` comes from `.todense()` and is therefore an np.matrix, which recent
# scikit-learn releases refuse to validate (TypeError). Convert to a plain
# ndarray first; keepdims=True keeps the row means as a column so they
# broadcast across each row exactly as the matrix arithmetic did.
counts = np.asarray(data)
centered = counts - counts.mean(axis=1, keepdims=True)
# pairwise_distances returns cosine *distance*; 1 - distance is the similarity.
print(1 - pairwise_distances(centered, metric='cosine'))

[[ 1.  1. -1.]
 [ 1.  1. -1.]
 [-1. -1.  1.]]


### Part 2

## Pairs Work

In [6]:
# Count both single words and adjacent word pairs (ngram_range=(1, 2)).
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Learn the vocabulary from the corpus and encode the same corpus with it.
x = vectorizer.fit(text).transform(text)

In [8]:
# Printing the sparse result lists only the non-zero entries,
# one per line, as "(row, column)  count".
print(x)

  (0, 3)	1
  (0, 5)	1
  (0, 6)	1
  (1, 3)	3
  (1, 4)	2
  (1, 5)	3
  (1, 6)	1
  (1, 7)	2
  (2, 0)	1
  (2, 1)	1
  (2, 2)	1


In [10]:
# Expand the sparse counts into a dense array:
# one row per document, one column per n-gram feature.
x_back = x.toarray()
print(x_back)

[[0 0 0 1 0 1 1 0]
 [0 0 0 3 2 3 1 2]
 [1 1 1 0 0 0 0 0]]


In [11]:
# Label each count column with the n-gram it represents.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement and its array result can be
# passed straight to `columns=`.
pd.DataFrame(x_back, columns=vectorizer.get_feature_names_out())

Unnamed: 0,harry,harry potter,potter,stormtrooper,stormtrooper stormtrooper,wookie,wookie stormtrooper,wookie wookie
0,0,0,0,1,0,1,1,0
1,0,0,0,3,2,3,1,2
2,1,1,1,0,0,0,0,0


In [14]:
# Single-cell lookup: the count stored at row 0 (first document), column 1.
x_back[0, 1]

0

In [41]:
# Cosine Similarity
def cos_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``v1`` with ``v2`` after ``v2`` has been
    divided by ``np.linalg.norm(v1)`` and ``np.linalg.norm(v2)``.

    Parameters
    ----------
    v1, v2 : array-like exposing ``.T`` (e.g. NumPy arrays)
        The vectors to compare. ``v2`` is transposed before the dot product,
        which is a no-op for 1-D arrays.

    Returns
    -------
    The result of ``np.dot`` on ``v1`` and the rescaled ``v2`` (a scalar for
    1-D inputs). No guard is applied for zero-norm inputs, so the division
    is by zero in that case.
    """
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    # Rescale v2 by both norms first, then take the dot product with v1.
    v2_scaled = v2.T / norm_v1 / norm_v2
    return np.dot(v1, v2_scaled)

In [21]:
# Cosine similarity between the first and second documents' n-gram counts.
cos_sim(x_back[0], x_back[1])

0.7777777777777779

### Demean Vectors

In [30]:
# Centre every document vector on zero by subtracting its own mean count.
# keepdims=True leaves the row means as a column so they broadcast per row.
demeaned = x_back - x_back.mean(axis=1, keepdims=True)
demeaned

array([[-0.375, -0.375, -0.375,  0.625, -0.375,  0.625,  0.625, -0.375],
       [-1.375, -1.375, -1.375,  1.625,  0.625,  1.625, -0.375,  0.625],
       [ 0.625,  0.625,  0.625, -0.375, -0.375, -0.375, -0.375, -0.375]])

### Calculate Cosine Distance (1 - Cosine Similarity)

In [35]:
# 1 - cosine similarity for each pair of demeaned document vectors,
# printed in the order (0, 1), (0, 2), (1, 2).
for i, j in [(0, 1), (0, 2), (1, 2)]:
    print(1 - cos_sim(demeaned[i, :], demeaned[j, :]))

0.39071511575
1.6
1.87419135566


### Pearson Correlation Coefficient

In [36]:
# 2x2 Pearson correlation matrix for documents 0 and 1; the off-diagonal
# entry is the correlation between the two demeaned vectors.
np.corrcoef(demeaned[0], demeaned[1])

array([[ 1.        ,  0.60928488],
       [ 0.60928488,  1.        ]])

In [38]:
# 2x2 Pearson correlation matrix for documents 0 and 2; the off-diagonal
# entry is the correlation between the two demeaned vectors.
np.corrcoef(demeaned[0], demeaned[2])

array([[ 1. , -0.6],
       [-0.6,  1. ]])

In [39]:
# 2x2 Pearson correlation matrix for documents 1 and 2; the off-diagonal
# entry is the correlation between the two demeaned vectors.
np.corrcoef(demeaned[1], demeaned[2])

array([[ 1.        , -0.87419136],
       [-0.87419136,  1.        ]])