In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

Read the results from previous solution

In [2]:
df = pd.read_pickle('../solution-1/step_df.pk')
df.shape

(1328, 3)

In [3]:
df.head()

Unnamed: 0,title,body,bullets
0,"Data Engineer - Columbus, GA 31909","Data Engineer - Columbus, GA 31909\nCelebratin...","(Bachelor’s or Master’s degree in statistics, ..."
1,"Data Analyst - St. Louis, MO","Data Analyst - St. Louis, MO\nDuties\nSummary\...",(Job family (Series)\n1501 General Mathematics...
2,"Data Scientist - Newark, CA","Data Scientist - Newark, CA\nData Scientist\n\...","(Design, develop, document and maintain machin..."
3,Patient Care Assistant / PCA - Med/Surg (Fayet...,Patient Care Assistant / PCA - Med/Surg (Fayet...,(Provides all personal care services in accord...
4,"Scientific Programmer - Berkeley, CA","Scientific Programmer - Berkeley, CA\nCaribou ...","(Demonstrated proficiency with Python, JavaScr..."


In [4]:
print(df.columns)

Index(['title', 'body', 'bullets'], dtype='object')


Create tokenizer

In [5]:
vectorizer = TfidfVectorizer(stop_words='english')

Tokenize bodies

In [6]:
tokens = vectorizer.fit_transform(df['body'])
token_matrix = tokens.toarray()
token_matrix.shape

(1328, 19003)

TFIDF

In [7]:
df2 = pd.DataFrame(token_matrix, columns = vectorizer.get_feature_names())
df2.shape

(1328, 19003)

Cosines

In [8]:
similarities = token_matrix @ token_matrix[0]
print(similarities)

[1.         0.11299447 0.10744324 ... 0.14479813 0.04623131 0.08948497]


In [30]:
most_similar_index = np.argsort(similarities)[-2]
similarity_index = similarities[most_similar_index]
print(df.loc[0])
print()
print(df.loc[most_similar_index])

title                     Data Engineer - Columbus, GA 31909
body       Data Engineer - Columbus, GA 31909\nCelebratin...
bullets    (Bachelor’s or Master’s degree in statistics, ...
Name: 0, dtype: object

title          Lead Data Scientist - San Francisco, CA 94105
body       Lead Data Scientist - San Francisco, CA 94105\...
bullets    (Build, deploy, and monitor predictive models ...
Name: 949, dtype: object


In [10]:
shrunk_matrix = TruncatedSVD(n_components=100).fit_transform(token_matrix)
shrunk_matrix.shape

(1328, 100)

In [11]:
np.linalg.norm(shrunk_matrix[0])

0.5143164895819115

In [12]:
shrunk_matrix = normalize(shrunk_matrix)
np.linalg.norm(shrunk_matrix[0])

0.9999999999999999

In [15]:
similarity_matrix = shrunk_matrix @ shrunk_matrix.T
print(similarity_matrix)

[[1.         0.39520786 0.51268481 ... 0.52962179 0.2077426  0.15667034]
 [0.39520786 1.         0.35075227 ... 0.42295605 0.24586456 0.22662674]
 [0.51268481 0.35075227 1.         ... 0.52613312 0.27394351 0.22472543]
 ...
 [0.52962179 0.42295605 0.52613312 ... 1.         0.38490832 0.30895069]
 [0.2077426  0.24586456 0.27394351 ... 0.38490832 1.         0.13515973]
 [0.15667034 0.22662674 0.22472543 ... 0.30895069 0.13515973 1.        ]]


In [33]:
# np.random.seed(1)
index1 = np.random.randint(len(df))
index2 = np.argsort(similarity_matrix[index1])[-2]
similarity = similarity_matrix[index1][index2]
print(f"The documents at indices {index1} and {index2} share a cosine similarity of {similarity:.2f}")
print()
print(df.loc[index1])
print()
print(df.loc[index2])

The documents at indices 1278 and 298 share a cosine similarity of 0.99

title              TG Shift Supervisor - Blue Lake, CA 95525
body       TG Shift Supervisor - Blue Lake, CA 95525\nBlu...
bullets    (Enforces performance standards, policies and ...
Name: 1278, dtype: object

title                 Data & Applied Scientist - Redmond, WA
body       Data & Applied Scientist - Redmond, WA\nAI for...
bullets    (Completed Master’s Degree in a quantitative f...
Name: 298, dtype: object
