In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial


Read in job descriptions from glassdoor and profile from linkedin. Removing missing values.


In [2]:
df = pd.read_csv('../data/dsjobs_training_culled.csv', index_col=0)
df = df.dropna()

In [3]:
profile_vector = pd.read_csv('../data/profile_vector.csv', index_col=0)


Creating a Full dataframe that includes the profile as the first observation.


In [4]:
full_df = profile_vector['profile'].append(df['jobs'])

Fitting TfidfVectorizer on the whole corpus and creating a dataframe with the results.

In [5]:
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(full_df)
transformed_model = vectorizer.transform(full_df)
tfidf_df = pd.DataFrame(transformed_model.toarray())

In [7]:
profile = tfidf_df.iloc[0, :]

Calculating the Cosine distance between the linkedin profile and each job posting.

In [19]:
distances = []
for i in range(len(tfidf_df.index)):
    distances.append(spatial.distance.cosine(profile, tfidf_df.iloc[i,:]))

Sort distances by closest to furthest 

In [20]:
sorted_distances = np.sort(distances)

In [24]:
sorted_distances[:6]

array([0.        , 0.80989248, 0.84734903, 0.8862868 , 0.89611812,
       0.89793871])

Argsort distances to get indices of a sorted list

In [21]:
indices = np.argsort(distances)

In [23]:
indices[:6]

array([  0,  14,  98, 113, 103,  89])

In [64]:
indices_df = pd.DataFrame({'indices': indices})

In [75]:
sorted_distances_df = pd.DataFrame({'distances': sorted_distances})

Sort the full dataframe by the sorted indices to get descriptions in order of most similar to least similar

In [25]:
sorted_df = pd.DataFrame({'jobs': full_df.iloc[indices]}).set_index(np.arange(0,120))

Create column 'Labels' of zeros to be filled in with either a True, or a False

In [27]:
sorted_df['labels'] = np.zeros(120)

Create test dataframe to experiment with user issued labels

In [46]:
sorted_tfidf = pd.DataFrame(tfidf_df.iloc[indices]).set_index(np.arange(0,120))

In [113]:
total_df = pd.concat([indices_df, sorted_df, sorted_distances_df, sorted_tfidf], axis=1)

In [114]:
total_df.iat[0,2] = 10.0

In [433]:
test_df = total_df.copy()

In [434]:
# test_df.iat[1,1] = 1.0

In [435]:
test_df.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4684,4685,4686,4687,4688,4689,4690,4691,4692,4693
0,0,Data Scientist Greater Seattle Area Data Scien...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.117186,0.0,0.0,0.0
1,14,Are you interested in working for one of the m...,0.0,0.809892,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,113,"Lead Data Scientist\n\nSeattle, WA\n\nJob Desc...",0.0,0.886287,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [436]:
class yes():
    def __init__(self, param=None):
        self.param = param
    
    def prompt_user(self, df, index):
        print(df.iloc[index,1])
        label = input("\nyes/no (quit)")
        if label == 'quit':
            return "Done"
        if label == 'yes':
            df.iat[index,2] = 1.0
        if label == 'no':
            df.iat[index,2] = -1.0
        return df
    
    def improve_yes(self, df):
        yes_df = pd.DataFrame(df[df['labels'] == 1.0])
        yes_remains = pd.DataFrame(df[df['labels'] == 0.0])
        yes_remains = yes_remains.set_index(np.arange(0, yes_remains.shape[0]))
        distances_from_yes = np.zeros((yes_remains.shape[0], yes_df.shape[0]))
        for i in range(yes_df.shape[0]):
            for j in range(yes_remains.shape[0]):
                distances_from_yes[j,i] = (spatial.distance.cosine(yes_df.iloc[0,4:], yes_remains.iloc[j,4:]))
        yes_remains['distances_from_yes'] = np.sum(distances_from_yes, axis=1)
        return yes_remains
    
    def improve_no(self, df):
        no_df = pd.DataFrame(df[df['labels'] == -1.0])
        no_remains = pd.DataFrame(df[df['labels'] == 0.0])
        no_remains = no_remains.set_index(np.arange(0, no_remains.shape[0]))
        distances_from_no = np.zeros((no_remains.shape[0], no_df.shape[0]))
        for i in range(no_df.shape[0]):
            for j in range(no_remains.shape[0]):
                distances_from_no[j,i] = (spatial.distance.cosine(no_df.iloc[0,4:], no_remains.iloc[j,4:]))
        no_remains['distances_from_no'] = np.sum(distances_from_no, axis=1)
        return no_remains
    
    def find_next_best(self, df, index):
        adjusted_df = self.prompt_user(df, index)
        yes_remains = self.improve_yes(adjusted_df)
        no_remains = self.improve_no(adjusted_df)
        total_distance = yes_remains.distances + yes_remains.distances_from_yes - no_remains.distances_from_no
        remains = yes_remains.copy()
        remains['total_distance'] = total_distance
        next_best_index = remains.total_distance.idxmin()
        remains.iloc[next_best_index, 1].split('\n')
        return self.find_next_best(remains, next_best_index)
    

In [437]:
test = yes()

In [438]:
changed_df = test.prompt_user(test_df, 1)

Are you interested in working for one of the most exciting products in Microsoft, passionate about exceeding customer expectations and advancing Microsoft's cloud first strategy? Are you interested in a start-up like environment, excited about cloud computing technology and driving growth in one of Microsoft's core businesses? If so, then look no further than the Azure Customer Experience (CXP) Team!

Microsoft Azure provides customers with an on-demand and infinitely scalable infrastructure and platform for customers to build, host, and scale service applications on the Internet through Microsoft’s global data centers. As part of the Azure Engineering organization, Azure CXP is a rapidly growing team committed to driving Azure growth through our relentless pursuit of satisfied Azure customers, by leading world-class customer reliability engagements, engineering modern customer-first experiences for scale, and by driving deep customer insights and empathy into the broader Azure Enginee


yes/no (quit) yes


In [439]:
changed_df.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4684,4685,4686,4687,4688,4689,4690,4691,4692,4693
0,0,Data Scientist Greater Seattle Area Data Scien...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.117186,0.0,0.0,0.0
1,14,Are you interested in working for one of the m...,1.0,0.809892,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,113,"Lead Data Scientist\n\nSeattle, WA\n\nJob Desc...",0.0,0.886287,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [440]:
posi = test.improve_yes(changed_df)

In [441]:
posi.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4685,4686,4687,4688,4689,4690,4691,4692,4693,distances_from_yes
0,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.593506
1,113,"Lead Data Scientist\n\nSeattle, WA\n\nJob Desc...",0.0,0.886287,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.848914
2,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.796519
3,89,Lead Data Scientist\nIf you are a Lead Data Sc...,0.0,0.897939,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.819353
4,25,Blue Nile was created in 1999 by a man in love...,0.0,0.900881,0.037139,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.884882


In [442]:
negi = test.improve_no(changed_df)

In [443]:
negi.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4685,4686,4687,4688,4689,4690,4691,4692,4693,distances_from_no
0,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,113,"Lead Data Scientist\n\nSeattle, WA\n\nJob Desc...",0.0,0.886287,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,89,Lead Data Scientist\nIf you are a Lead Data Sc...,0.0,0.897939,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25,Blue Nile was created in 1999 by a man in love...,0.0,0.900881,0.037139,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [444]:
total_distance = posi.distances + posi.distances_from_yes - negi.distances_from_no

In [445]:
alter = posi.copy()

In [446]:
alter['total_distance'] = total_distance

In [None]:
next_index = 

In [None]:
remains = yes_remains.copy()
remains['total_distance'] = total_distance
next_best_index = remains.total_distance.idxmin()
remains.iloc[next_best_index, 1].split('\n')
return self.find_next_best(remains, next_best_index)

In [355]:
test.find_next_best(test_df, 1)

Are you interested in working for one of the most exciting products in Microsoft, passionate about exceeding customer expectations and advancing Microsoft's cloud first strategy? Are you interested in a start-up like environment, excited about cloud computing technology and driving growth in one of Microsoft's core businesses? If so, then look no further than the Azure Customer Experience (CXP) Team!

Microsoft Azure provides customers with an on-demand and infinitely scalable infrastructure and platform for customers to build, host, and scale service applications on the Internet through Microsoft’s global data centers. As part of the Azure Engineering organization, Azure CXP is a rapidly growing team committed to driving Azure growth through our relentless pursuit of satisfied Azure customers, by leading world-class customer reliability engagements, engineering modern customer-first experiences for scale, and by driving deep customer insights and empathy into the broader Azure Enginee


yes/no (quit) yes


The Microsoft Cloud+AI Design team is looking for a Senior Data Scientist to join our Experience Analytics team ? we work hard, have fun, and value collaboration and individuality in each other. As a team, we?re passionate about maximizing the impact of Data Science work on Design and Product decisions, and we are leading discussions in this area within Microsoft and beyond.

Join us if you want to impact how millions of users use the cloud and all the services it enables. Our customers range from people with highly technical skills to information workers working with cloud enabled devices and services. Our product portfolio includes Dynamics, Azure, Power BI, PowerApps, Flow, and more.

The mission of the Experience Analytics team is to leverage product telemetry to unpack the nuance behind our customers? end-to-end journeys and drive design decisions. As a Data Scientist on the Experience Analytics team, you?ll answer key questions about users, their in-product workflow, and the qual


yes/no (quit) no


Minimum of 5 years of working experience.
Experience in a technical/international organization is an advantage.
You are fluent in both written and spoken English.
You have knowledge of SQL Server databases and SQL.
You master programming in R or Python.
You have broad experience in applied statistics.
Knowledge of C# is an advantage.
Knowledge of Microsoft Azure is an advantage (Azure Machine Learning, Stream Analytics, Azure functions)
You have experience with reporting tools like PowerBI.
IoT knowledge is an advantage.
Experis is an Equal Opportunity Employer (EOE/AA) - provided by Dice

machine learning,Azure,Python,R,Iot,C#,data science



yes/no (quit) quit


TypeError: string indices must be integers

In [324]:
no_df = no_adjusted_df[no_adjusted_df['labels'] == -1.0]
no_remains = pd.DataFrame(no_adjusted_df[no_adjusted_df['labels'] == 0.0])
no_remains = no_remains.set_index(np.arange(0, no_remains.shape[0]))

In [343]:
# no_df.head()

In [344]:
# no_remains.head(2)

In [327]:
distances_from_no = []
for j in range(no_remains.shape[0]):
    distances_from_no.append(spatial.distance.cosine(no_df.iloc[0,4:], no_remains.iloc[j,4:]))

In [328]:
no_remains['distances_from_no'] = distances_from_no

In [329]:
yes_df = yes_adjusted_df[yes_adjusted_df['labels'] == 1.0]
yes_remains = pd.DataFrame(yes_adjusted_df[yes_adjusted_df['labels'] == 0.0])
yes_remains = yes_remains.set_index(np.arange(0, yes_remains.shape[0]))

In [330]:
distances_from_yes = []
for j in range(yes_remains.shape[0]):
    distances_from_yes.append(spatial.distance.cosine(yes_df.iloc[0,4:], yes_remains.iloc[j,4:]))

In [331]:
yes_remains['distances_from_yes'] = distances_from_yes

In [332]:
yes_remains.shape

(117, 4699)

In [333]:
no_remains.shape

(117, 4699)

In [334]:
yes_remains.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4685,4686,4687,4688,4689,4690,4691,4692,4693,distances_from_yes
0,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.593506
1,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.796519
2,89,Lead Data Scientist\nIf you are a Lead Data Sc...,0.0,0.897939,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.819353
3,25,Blue Nile was created in 1999 by a man in love...,0.0,0.900881,0.037139,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.884882
4,116,Back to search results\n\nSearch by Job Title ...,0.0,0.901781,0.016726,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.888248


In [335]:
no_remains.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4685,4686,4687,4688,4689,4690,4691,4692,4693,distances_from_no
0,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.833532
1,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.805844
2,89,Lead Data Scientist\nIf you are a Lead Data Sc...,0.0,0.897939,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208425
3,25,Blue Nile was created in 1999 by a man in love...,0.0,0.900881,0.037139,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.854529
4,116,Back to search results\n\nSearch by Job Title ...,0.0,0.901781,0.016726,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.888094


In [336]:
total_distance = yes_remains.distances + yes_remains.distances_from_yes - no_remains.distances_from_no

In [337]:
remains = yes_remains.copy()

In [339]:
remains['total_distance'] = total_distance

In [342]:
remains.head()

Unnamed: 0,indices,jobs,labels,distances,0,1,2,3,4,5,...,4686,4687,4688,4689,4690,4691,4692,4693,distances_from_yes,total_distance
0,98,The Microsoft Cloud+AI Design team is looking ...,0.0,0.847349,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.593506,0.607323
1,103,Overall Job Purpose:\n\nThis role will be loca...,0.0,0.896118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.796519,0.886793
2,89,Lead Data Scientist\nIf you are a Lead Data Sc...,0.0,0.897939,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.819353,1.508867
3,25,Blue Nile was created in 1999 by a man in love...,0.0,0.900881,0.037139,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.884882,0.931233
4,116,Back to search results\n\nSearch by Job Title ...,0.0,0.901781,0.016726,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.888248,0.901934


In [321]:
yes_adjusted_df = test.prompt_user(test_df, 1)

Are you interested in working for one of the most exciting products in Microsoft, passionate about exceeding customer expectations and advancing Microsoft's cloud first strategy? Are you interested in a start-up like environment, excited about cloud computing technology and driving growth in one of Microsoft's core businesses? If so, then look no further than the Azure Customer Experience (CXP) Team!

Microsoft Azure provides customers with an on-demand and infinitely scalable infrastructure and platform for customers to build, host, and scale service applications on the Internet through Microsoft’s global data centers. As part of the Azure Engineering organization, Azure CXP is a rapidly growing team committed to driving Azure growth through our relentless pursuit of satisfied Azure customers, by leading world-class customer reliability engagements, engineering modern customer-first experiences for scale, and by driving deep customer insights and empathy into the broader Azure Enginee


yes/no (quit) yes


In [322]:
no_adjusted_df = test.prompt_user(test_df, 3)

Lead Data Scientist

Seattle, WA

Job Description

We are looking for a Lead Data Scientist to help research and discover new technology to help us deliver even greater levels of value to our customers. The ideal candidate will bring analytical rigor, deep understanding of statistical methods, and experience with machine learning to help our R&D team continue to succeed.

Responsibilities

• Pull large batches of data from multiple SQL data sources to drive your analysis

• Work with and understand Python to research and perform analysis

• Research and deliver machine learning solutions to solve large data problems

• Work with a team of Software Engineers to deliver maintainable solutions

• Clearly document your delivered research and analysis

• Teach others and learn new techniques for applying statistical analysis and machine learning

• Effectively communicate with business stake holders and your team members when answering questions, participating in discussions, or otherwise h


yes/no (quit) no


In [163]:
# remains = test.improve_yes(adjusted_df)

In [198]:
yes_df = adjusted_df[adjusted_df['labels'] == 1.0]
remains = pd.DataFrame(adjusted_df[adjusted_df['labels'] == 0.0]).set_index(np.arange(0,remains.shape[0]))

In [202]:
test_distances = []
for j in range(remains.shape[0]):
    test_distances.append(spatial.distance.cosine(yes_df.iloc[0,4:], remains.iloc[j,4:]))

In [203]:
remains['distances_from_yes'] = test_distances

In [205]:
remains['summed_distances'] = remains.distances + remains.distances_from_yes

In [250]:
#remains.iloc[remains.summed_distances.idxmin(),1].split('\n')