In [40]:
import pandas as pd
import numpy as np
#Import the literal_eval function from ast
from ast import literal_eval

In [41]:
# Load the job postings CSV (path is relative to the notebook's working directory)
df = pd.read_csv('../jobs.csv')
# Display the column names to see which fields are available
df.columns

Index(['country', 'country_code', 'date_added', 'has_expired', 'job_board',
       'job_description', 'job_title', 'job_type', 'location', 'organization',
       'page_url', 'salary', 'sector', 'uniq_id'],
      dtype='object')

In [42]:
# Remove the columns that the rest of the notebook does not use. A single
# drop(columns=...) call replaces seven separate one-column drops.
unused_columns = ['has_expired', 'page_url', 'country', 'country_code',
                  'date_added', 'job_board', 'salary']
df = df.drop(columns=unused_columns)

# Discard a block of rows (presumably to keep the dense similarity matrix
# computed later at a manageable size -- TODO confirm).
# NOTE(review): the slice end is exclusive, so index[11000:21999] leaves the row
# at position 21999 in place when the file has 22000 rows; that is why the
# recorded shape is 11001 rather than 11000. Confirm whether keeping exactly the
# first 11000 rows was intended.
df = df.drop(df.index[11000:22000 - 1])

# Renumber the rows 0..n-1 so that index labels equal row positions. Without
# this the surviving last row keeps its old label, which is out of range for
# the positional similarity matrix built from this frame.
df = df.reset_index(drop=True)
df.shape

(11001, 7)

In [43]:
# Preview the first 20 rows of the frame
df.head(20)

Unnamed: 0,job_description,job_title,job_type,location,organization,sector,uniq_id
0,TeamSoft is seeing an IT Support Specialist to...,IT Support Technician Job in Madison,Full Time Employee,"Madison, WI 53702",,IT/Software Development,11d599f229a80023d2f40e7c52cd941e
1,The Wisconsin State Journal is seeking a flexi...,Business Reporter/Editor Job in Madison,Full Time,"Madison, WI 53708",Printing and Publishing,,e4cbb126dabf22159aff90223243ff2a
2,Report this job About the Job DePuy Synthes Co...,Johnson & Johnson Family of Companies Job Appl...,"Full Time, Employee",DePuy Synthes Companies is a member of Johnson...,Personal and Household Services,,839106b353877fa3d896ffb9c1fe01c0
3,Why Join Altec? If you’re considering a career...,Engineer - Quality Job in Dixon,Full Time,"Dixon, CA",Altec Industries,Experienced (Non-Manager),58435fcab804439efdcaa7ecca0fd783
4,Position ID# 76162 # Positions 1 State CT C...,Shift Supervisor - Part-Time Job in Camphill,Full Time Employee,"Camphill, PA",Retail,Project/Program Management,64d0272dc8496abfd9523a8df63c184c
5,Job Description Job #: 720298Apex Systems has...,Construction PM - Charlottesville Job in Charl...,Full Time Employee,"Charlottesville, VA",Computer/IT Services,Experienced (Non-Manager),1e2637cb5f7a2c4615a99a26c0566c66
6,Report this job About the Job Based in San Fra...,CyberCoders Job Application for Principal QA E...,"Full Time, Employee",Contact name Tony Zerio,Computer Software,,455802d725fde67293970ab3953b1d39
7,RESPONSIBILITIES:Kforce has a client seeking a...,Mailroom Clerk Job in Austin,Full Time Employee,"Austin, TX 73301",,Experienced (Non-Manager),549a0541e4452ecd155efc032aaa72d7
8,"Part-Time, 4:30 pm - 9:30 pm, Mon - Fri Brookd...",Housekeeper Job in Austin,Part Time Employee,"Austin, TX 78746",Hotels and Lodging Personal and Household Serv...,Customer Support/Client Care,a6a2b5e825b8ce1c3b517adb2497c5ed
9,"Insituform Technologies, LLC, an Aegion compan...",Video Data Management /Transportation Technici...,,"Chesterfield, MO",,,73a9ba2b706e02628fa22ca1357174b1


# Creating the TF-IDF matrix

In [44]:
# Import TfidfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace NaN with an empty string in the text columns
for text_column in ('job_description', 'organization', 'sector'):
    df[text_column] = df[text_column].fillna('')

# Build the vectorizer, discarding common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Fit on the job descriptions and transform them into the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df['job_description'])

# Output the shape of tfidf_matrix: (number of jobs, vocabulary size)
tfidf_matrix.shape

(11001, 65425)

# Computing the cosine similarity score

In [None]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix. TfidfVectorizer L2-normalises each row
# by default, so the plain dot product of two rows is their cosine similarity.
# The result is a dense (n_jobs, n_jobs) array.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse mapping from job title to row position.
# Positions (0..n-1) are stored rather than index labels because cosine_sim is
# addressed by position, and labels stop matching positions once rows have been
# dropped from df.
indices = pd.Series(np.arange(len(df)), index=df['job_title'])

# Keep only the first occurrence of each title. Series.drop_duplicates() would
# compare the *values* (the row numbers, which are already unique) and leave
# repeated titles in place, making indices[title] return a Series.
indices = indices[~indices.index.duplicated(keep='first')]

# Building the recommender function

In [None]:
# Function that takes in job title as input and gives recommendations
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices, top_n=15):
    """Return the titles of the jobs most similar to the job named ``title``.

    Parameters
    ----------
    title : str
        Job title to look up in ``indices``.
    cosine_sim : 2-D array
        Pairwise similarity matrix; row ``i`` holds the scores of job ``i``
        against every job, addressed by row position.
    df : pandas.DataFrame
        Frame whose ``job_title`` column is read by position for the result.
    indices : pandas.Series
        Maps a job title to a row position in ``cosine_sim`` / ``df``.
    top_n : int, default 15
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``job_title`` values of the ``top_n`` highest-scoring other jobs,
        most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Obtain the row position of the job that matches the title
    idx = indices[title]

    # A title that occurs more than once in `indices` yields a Series instead
    # of a scalar; fall back to its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every job position with its similarity to the query job, leaving
    # out the query job itself. Excluding it explicitly (rather than dropping
    # whatever sorts first) stays correct when another posting ties with it or
    # when the query's own score is 0 because its description is empty.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the jobs by similarity, highest first (stable for equal scores)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the positions of the top_n most similar jobs
    job_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the titles of the top_n most similar jobs
    return df['job_title'].iloc[job_indices]

In [None]:
# Get recommendations for an example job title; the title must be present in
# `indices`, otherwise the lookup inside content_recommender fails
content_recommender('Sales Associate Job in Columbus')