In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial


Read in job descriptions from glassdoor and profile from linkedin. Removing missing values.


In [2]:
df = pd.read_csv('../data/dsjobs_training_culled.csv', index_col=0)
df = df.dropna()

In [3]:
profile_vector = pd.read_csv('../data/profile_vector.csv', index_col=0)


Creating a Full dataframe that includes the profile as the first observation.


In [4]:
full_df = profile_vector['profile'].append(df['jobs'])

Fitting TfidfVectorizer on the whole corpus and creating a dataframe with the results.

In [5]:
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(full_df)
transformed_model = vectorizer.transform(full_df)
tfidf_df = pd.DataFrame(transformed_model.toarray())

In [7]:
profile = tfidf_df.iloc[0, :]

Calculating the Cosine distance between the linkedin profile and each job posting.

In [19]:
distances = []
for i in range(len(tfidf_df.index)):
    distances.append(spatial.distance.cosine(profile, tfidf_df.iloc[i,:]))

Sort distances by closest to furthest 

In [20]:
sorted_distances = np.sort(distances)

In [24]:
sorted_distances[:6]

array([0.        , 0.80989248, 0.84734903, 0.8862868 , 0.89611812,
       0.89793871])

Argsort distances to get indices of a sorted list

In [21]:
indices = np.argsort(distances)

In [23]:
indices[:6]

array([  0,  14,  98, 113, 103,  89])

In [64]:
indices_df = pd.DataFrame({'indices': indices})

In [75]:
sorted_distances_df = pd.DataFrame({'distances': sorted_distances})

Sort the full dataframe by the sorted indices to get descriptions in order of most similar to least similar

In [25]:
sorted_df = pd.DataFrame({'jobs': full_df.iloc[indices]}).set_index(np.arange(0,120))

Create column 'Labels' of zeros to be filled in with either a True, or a False

In [27]:
sorted_df['labels'] = np.zeros(120)

Create test dataframe to experiment with user issued labels

In [46]:
sorted_tfidf = pd.DataFrame(tfidf_df.iloc[indices]).set_index(np.arange(0,120))

In [113]:
total_df = pd.concat([indices_df, sorted_df, sorted_distances_df, sorted_tfidf], axis=1)

In [114]:
total_df.iat[0,2] = 10.0

In [462]:
test_df = total_df.copy()

In [463]:
# test_df.iat[1,1] = 1.0

In [464]:
test_df.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4684,4685,4686,4687,4688,4689,4690,4691,4692,4693
0,0,Data Scientist Greater Seattle Area Data Scien...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.117186,0.0,0.0,0.0
1,14,Are you interested in working for one of the m...,0.0,0.809892,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,113,"Lead Data Scientist\n\nSeattle, WA\n\nJob Desc...",0.0,0.886287,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [477]:
class yes():
    def __init__(self, df):
        self.df = df
    
    def prompt_user(self, index):
        print(self.df.iloc[index,1])
        label = input("\nyes/no (quit)")
        if label == 'quit':
            return "Done"
        if label == 'yes':
            self.df.iat[index,2] = 1.0
        if label == 'no':
            self.df.iat[index,2] = -1.0
        return self.df
    
    def improve_yes(self, df):
        yes_df = pd.DataFrame(df[df['labels'] == 1.0])
        yes_remains = pd.DataFrame(df[df['labels'] == 0.0])
        yes_remains = yes_remains.set_index(np.arange(0, yes_remains.shape[0]))
        distances_from_yes = np.zeros((yes_remains.shape[0], yes_df.shape[0]))
        for i in range(yes_df.shape[0]):
            for j in range(yes_remains.shape[0]):
                distances_from_yes[j,i] = (spatial.distance.cosine(yes_df.iloc[0,4:], yes_remains.iloc[j,4:]))
        yes_remains['distances_from_yes'] = np.sum(distances_from_yes, axis=1)
        return yes_remains
    
    def improve_no(self, df):
        no_df = pd.DataFrame(df[df['labels'] == -1.0])
        no_remains = pd.DataFrame(df[df['labels'] == 0.0])
        no_remains = no_remains.set_index(np.arange(0, no_remains.shape[0]))
        distances_from_no = np.zeros((no_remains.shape[0], no_df.shape[0]))
        for i in range(no_df.shape[0]):
            for j in range(no_remains.shape[0]):
                distances_from_no[j,i] = (spatial.distance.cosine(no_df.iloc[0,4:], no_remains.iloc[j,4:]))
        no_remains['distances_from_no'] = np.sum(distances_from_no, axis=1)
        return no_remains
    
    def find_next_best(self, df, index):
        adjusted_df = self.prompt_user(index)
        yes_remains = self.improve_yes(adjusted_df)
        no_remains = self.improve_no(adjusted_df)
        total_distance = yes_remains.distances + yes_remains.distances_from_yes - no_remains.distances_from_no
        remains = yes_remains.copy()
        remains['total_distance'] = total_distance
        next_best_index = remains.total_distance.idxmin()
        return self.find_next_best(next_best_index)
    

In [478]:
test = yes(test_df)

In [492]:
changed_df = test.prompt_user(next_index)

Data Scientist Greater Seattle Area Data Scientist with an economics background from Colorado with experience in machine learning, data analysis, and regression analysis. Proven leader of small groups focused on efficiency and continuous improvement. student galvanize inc sep 2018 – present 3 mos greater seattle area logwork master blue ridge log works aug 2016 – aug 2018 2 yrs 1 mo united states • built custom log furniture for a high-end reclaimed wood furniture company. • managed people within my department in order to achieve efficiency without sacrificing quality. • continuously improved processes to increase productivity. • acquired new skills and techniques in order to increase the quality of the company’s econometrics regression analysis microsoft office industry knowledge data analysis tools & technologies microsoft powerpoint microsoft excel python (programming language) pandas (software) sql postgresql mongodb amazon web services (aws) selenium public speaking numpy scikit-l

KeyboardInterrupt: 

In [480]:
changed_df.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4684,4685,4686,4687,4688,4689,4690,4691,4692,4693
0,0,Data Scientist Greater Seattle Area Data Scien...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.117186,0.0,0.0,0.0
1,14,Are you interested in working for one of the m...,1.0,0.809892,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,113,"Lead Data Scientist\n\nSeattle, WA\n\nJob Desc...",0.0,0.886287,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [481]:
changed_df.shape

(120, 4698)

In [482]:
posi = test.improve_yes(changed_df)

In [483]:
posi.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4685,4686,4687,4688,4689,4690,4691,4692,4693,distances_from_yes
0,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.593506
1,113,"Lead Data Scientist\n\nSeattle, WA\n\nJob Desc...",0.0,0.886287,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.848914
2,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.796519
3,89,Lead Data Scientist\nIf you are a Lead Data Sc...,0.0,0.897939,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.819353
4,25,Blue Nile was created in 1999 by a man in love...,0.0,0.900881,0.037139,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.884882


In [484]:
posi.shape

(118, 4699)

In [485]:
negi = test.improve_no(changed_df)

In [486]:
negi.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4685,4686,4687,4688,4689,4690,4691,4692,4693,distances_from_no
0,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,113,"Lead Data Scientist\n\nSeattle, WA\n\nJob Desc...",0.0,0.886287,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,89,Lead Data Scientist\nIf you are a Lead Data Sc...,0.0,0.897939,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25,Blue Nile was created in 1999 by a man in love...,0.0,0.900881,0.037139,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [487]:
negi.shape

(118, 4699)

In [488]:
total_distance = posi.distances + posi.distances_from_yes - negi.distances_from_no

In [489]:
alter = posi.copy()

In [493]:
alter.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4686,4687,4688,4689,4690,4691,4692,4693,distances_from_yes,total_distance
0,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.593506,1.440855
1,113,"Lead Data Scientist\n\nSeattle, WA\n\nJob Desc...",0.0,0.886287,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.848914,1.7352
2,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.796519,1.692637
3,89,Lead Data Scientist\nIf you are a Lead Data Sc...,0.0,0.897939,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.819353,1.717292
4,25,Blue Nile was created in 1999 by a man in love...,0.0,0.900881,0.037139,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.884882,1.785763


In [490]:
alter['total_distance'] = total_distance

In [491]:
next_index = alter.total_distance.idxmin()

In [494]:
next_index

0

In [None]:
remains = yes_remains.copy()
remains['total_distance'] = total_distance
next_best_index = remains.total_distance.idxmin()
remains.iloc[next_best_index, 1].split('\n')
return self.find_next_best(remains, next_best_index)