In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial


Read in the job descriptions from Glassdoor and the profile from LinkedIn, dropping rows with missing values from the job descriptions.


In [2]:
# Load the job postings (first CSV column is the index) and drop every row
# that contains a missing value, in a single chained expression.
df = pd.read_csv('../data/dsjobs_training_culled.csv', index_col=0).dropna()

In [3]:
# Load the profile text; the first CSV column is used as the index.
profile_vector = pd.read_csv(
    '../data/profile_vector.csv',
    index_col=0,
)


Create the full corpus (a pandas Series) with the profile as the first observation, followed by the job descriptions.


In [4]:
# Stack the profile text on top of the job descriptions so that position 0
# of the corpus is the profile. `Series.append` was removed in pandas 2.0;
# `pd.concat` is the supported equivalent and keeps the original index labels,
# so always address rows by position (`.iloc`) downstream.
full_df = pd.concat([profile_vector['profile'], df['jobs']])

Fit a TfidfVectorizer on the whole corpus and store the resulting TF-IDF matrix in a dataframe (one row per document).

In [5]:
# Learn the vocabulary/IDF weights and vectorize the corpus in one pass.
# `fit_transform` is equivalent to `fit` followed by `transform` on the same
# data, but avoids tokenizing the corpus twice. `vectorizer` is still fitted
# afterwards and can be reused to transform new text.
vectorizer = TfidfVectorizer(stop_words='english')
transformed_model = vectorizer.fit_transform(full_df)

# Densify the sparse TF-IDF matrix: one row per document, one column per term.
tfidf_df = pd.DataFrame(transformed_model.toarray())

In [7]:
# The profile was placed first in the corpus, so its TF-IDF vector is row 0.
profile = tfidf_df.iloc[0]

Calculate the cosine distance between the LinkedIn profile and every document in the corpus. The profile itself is row 0, so its distance to itself is 0.

In [19]:
# Cosine distance from the profile vector to every row of the TF-IDF frame,
# in row order (entry 0 is the profile compared with itself).
distances = [
    spatial.distance.cosine(profile, row_vector)
    for _, row_vector in tfidf_df.iterrows()
]

Sort the distances from closest to furthest.

In [20]:
# Ascending copy of the distances (smallest distance = most similar first).
sorted_distances = np.sort(np.asarray(distances))

In [24]:
# Preview the six smallest distances.
sorted_distances[:6]

array([0.        , 0.80989248, 0.84734903, 0.8862868 , 0.89611812,
       0.89793871])

Argsort the distances to get the row positions of the documents in order of increasing distance.

In [21]:
# Row positions that would sort `distances` in ascending order.
indices = np.asarray(distances).argsort()

In [23]:
# Preview the row positions of the six closest documents.
indices[:6]

array([  0,  14,  98, 113, 103,  89])

Reorder the full corpus by the sorted indices to get the descriptions in order from most similar to least similar.

In [25]:
# Reorder the corpus by increasing cosine distance (positional lookup, since
# the corpus index labels are not guaranteed to be unique) and renumber the
# rows 0..n-1. `reset_index(drop=True)` replaces the previous hard-coded
# `np.arange(0, 120)`, which raised whenever the corpus size was not 120.
sorted_df = pd.DataFrame({'jobs': full_df.iloc[indices]}).reset_index(drop=True)

Create a column 'labels' of zeros, to be filled in later with either True or False.

In [27]:
# Placeholder label column (all zeros), sized from the frame itself rather
# than a hard-coded 120 so it keeps working when the number of postings changes.
sorted_df['labels'] = np.zeros(len(sorted_df))