In [1]:
import numpy as np
import pandas as pd


In [2]:
job_df = pd.read_csv("Combined_Jobs_Final.csv")

In [3]:
job_df.head(2)

Unnamed: 0,Job.ID,Provider,Status,Slug,Title,Position,Company,City,State.Name,State.Code,...,Industry,Job.Description,Requirements,Salary,Listing.Start,Listing.End,Employment.Type,Education.Required,Created.At,Updated.At
0,111,1,open,palo-alto-ca-tacolicious-server,Server @ Tacolicious,Server,Tacolicious,Palo Alto,California,CA,...,Food and Beverages,Tacolicious' first Palo Alto store just opened...,,8.0,,,Part-Time,,2013-03-12 02:08:28 UTC,2014-08-16 15:35:36 UTC
1,113,1,open,san-francisco-ca-claude-lane-kitchen-staff-chef,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,California,CA,...,Food and Beverages,\r\n\r\nNew French Brasserie in S.F. Financia...,,0.0,,,Part-Time,,2013-04-12 08:36:36 UTC,2014-08-16 15:35:36 UTC


In [4]:
print(len(job_df))
job_df = job_df[['Status', 'Title', 'Position', 'Company', 'Job.Description']]
print(len(job_df))

84090
84090


In [5]:
job_df.shape

(84090, 5)

In [6]:
job_df['Job.Description'][20]

'Hiring Event Details\r\nStore Associate\r\n\r\n$12.00 / Hour\r\nAdditional $1.00 Per Hour For ALL Sunday Shifts!\r\n50 Cent Wage Increases Beginning At 6 Months - Up to $13.50 At 2 Years\r\n\r\nMonday, December 15, 2014\r\n9am - 11am\r\n\r\nALDI\r\n3133 Market Place Dr\r\nOnalaska, WI 54650\r\n\r\n&nbsp;\r\nFor consideration, please apply in person at the hiring event only. Get started now by downloading our Store Employment Application.\r\n\r\nStore Associate - Retail Sales ( Customer Service )\r\n\r\nIf you are a customer service minded individual with a positive and energetic personality and you&rsquo;re interested in working for one of the best-known grocery stores in the nation, join the ALDI family! We are looking for motivated and reliable individuals to serve as a Store Associate. You will serve as the face of ALDI, providing customers with friendly and efficient check-out services. But that&rsquo;s just the beginning. You will also assist the store manager in a variety of rol

In [7]:
job_df.isnull().sum()
job_df.fillna('',inplace=True)
job_df.isnull().sum()

Status             0
Title              0
Position           0
Company            0
Job.Description    0
dtype: int64

In [8]:
job_df = job_df.sample(n=1000,random_state=42)

In [9]:
job_df.shape

(1000, 5)

# cleaning dataset
keeping all letters and digits                          
lover case                             
removing stopwords                            
tokenization                            
stemming                         

In [10]:
from nltk.corpus import stopwords
# import nltk
import re
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
# nltk.download('punkt')
# nltk.download('stopwords')
# print(nltk.data.find('tokenizers/punkt'))
# nltk.data.path.append('/Users/chiraglokhande/nltk_data')

import spacy

nlp = spacy.load("en_core_web_sm")

In [11]:
# def cleaning(txt):
#     txt = re.sub(r'[^a-zA-Z0-9\s]', '', txt)
#     doc = nlp(txt.lower())
#     tokens = [token.text for token in doc]
#     stemming = [ps.stem(w) for w in tokens if w not in stopwords.words('english')]
#     # Return a single string instead of a list
#     return ' '.join(stemming)


def cleaning(txt):
    # Remove special characters, newline characters, and multiple spaces
    txt = re.sub(r'[^a-zA-Z0-9\s]', '', txt)  # Remove special characters
    txt = re.sub(r'[\r\n]+', ' ', txt)  # Replace newline characters with a space
    txt = re.sub(r'\s+', ' ', txt)  # Replace multiple spaces with a single space
    
    doc = nlp(txt.lower())  # Convert to lowercase and process text
    tokens = [token.text for token in doc]  # Tokenize the text
    
    # Apply stemming and remove stopwords
    stemming = [ps.stem(w) for w in tokens if w not in stopwords.words('english')]
    
    # Return a single string instead of a list
    return ' '.join(stemming)
    

In [12]:
cleaning("\n\rhelo the master piece is my loving moving cat @9032#%$")

'  helo master piec love move cat 9032'

In [13]:
job_df['Job.Description'] = job_df['Job.Description'].astype(str).apply(lambda x: cleaning(x))
job_df['Title'] = job_df['Title'].astype(str).apply(lambda x: cleaning(x))
job_df['Position'] = job_df['Position'].astype(str).apply(lambda x: cleaning(x))

In [14]:
job_df['clean_text'] = job_df['Job.Description']+" "+job_df['Title']+job_df['Position']

In [15]:
job_df['clean_text'][64119]

'job summari knowledg univers ku site director site leader inspir children teacher alik learn grow passion educ excel confid teach children adult use nation recogn curriculum framework creat uniqu engag classroom experi commit make site success know meaning relationship children famili team import success fulli engag enthusiast work eager share knowledg other job respons essenti function basic expect site director cours creativ new way meet exceed expect encourag long requir essenti function also met supervis children staff record keep licens record child file lesson plan implement mainten safe welcom classroom environ build relationship commun school recruit new student program applic must strong organiz skill site director knowledg universsite director'

# vectorizatoin

In [16]:
job_df['combined_text'] = job_df['Title'] + " " + job_df['Position'] + " " + job_df['Company'] + "" +job_df['clean_text'] 
 
job_df['combined_text'][64119]

'site director knowledg univers site director Knowledge Universejob summari knowledg univers ku site director site leader inspir children teacher alik learn grow passion educ excel confid teach children adult use nation recogn curriculum framework creat uniqu engag classroom experi commit make site success know meaning relationship children famili team import success fulli engag enthusiast work eager share knowledg other job respons essenti function basic expect site director cours creativ new way meet exceed expect encourag long requir essenti function also met supervis children staff record keep licens record child file lesson plan implement mainten safe welcom classroom environ build relationship commun school recruit new student program applic must strong organiz skill site director knowledg universsite director'

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# tfidf = TfidfVectorizer(stop_words='english')
# matrix = tfidf.fit_transform(job_df['clean_text'])
# similarity = cosine_similarity(matrix)

tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the cleaned text
matrix = tfidf.fit_transform(job_df['combined_text'])

# Compute the cosine similarity matrix
similarity = cosine_similarity(matrix)

In [19]:
similarity

array([[1.        , 0.03750839, 0.01579204, ..., 0.04718412, 0.01708843,
        0.06612123],
       [0.03750839, 1.        , 0.02408708, ..., 0.03279808, 0.00478697,
        0.02723048],
       [0.01579204, 0.02408708, 1.        , ..., 0.04443238, 0.02996735,
        0.04040011],
       ...,
       [0.04718412, 0.03279808, 0.04443238, ..., 1.        , 0.05217026,
        0.09961131],
       [0.01708843, 0.00478697, 0.02996735, ..., 0.05217026, 1.        ,
        0.37446873],
       [0.06612123, 0.02723048, 0.04040011, ..., 0.09961131, 0.37446873,
        1.        ]])

In [20]:
sorted(list(enumerate(similarity[0])), key=lambda x: x[1], reverse=True)[1:20]

[(276, np.float64(0.9484782708382822)),
 (730, np.float64(0.4614697501291734)),
 (81, np.float64(0.4291312793015668)),
 (917, np.float64(0.4291312793015668)),
 (252, np.float64(0.26058581908840395)),
 (629, np.float64(0.23188313365483862)),
 (825, np.float64(0.2072347239225912)),
 (360, np.float64(0.20405678270391356)),
 (128, np.float64(0.19600885831079434)),
 (38, np.float64(0.19323954080632813)),
 (982, np.float64(0.16772644271462298)),
 (114, np.float64(0.16414592987038637)),
 (298, np.float64(0.15197869520452675)),
 (847, np.float64(0.14371085721904345)),
 (245, np.float64(0.13953676434226892)),
 (113, np.float64(0.13463028202009936)),
 (254, np.float64(0.13445659698301712)),
 (59, np.float64(0.1326890672201081)),
 (528, np.float64(0.12496696461467746))]

# Recommendation System

In [21]:
job_df

Unnamed: 0,Status,Title,Position,Company,Job.Description,clean_text,combined_text
64119,open,site director knowledg univers,site director,Knowledge Universe,job summari knowledg univers ku site director ...,job summari knowledg univers ku site director ...,site director knowledg univers site director K...
35827,open,administr assist officeteam,administr assist,OfficeTeam,ref 03110118480classif secretaryadmin asstcomp...,ref 03110118480classif secretaryadmin asstcomp...,administr assist officeteam administr assist O...
72100,open,account manag chi payment system,account manag,CHI Payment Systems,yoursquor energet motiv hardwork look prosper ...,yoursquor energet motiv hardwork look prosper ...,account manag chi payment system account manag...
46355,open,outsid wholesal sale rep parttim river front c...,outsid wholesal sale rep parttim,River Front Chrysler Jeep Dodge,outsid wholesal sale rep job respons sale repr...,outsid wholesal sale rep job respons sale repr...,outsid wholesal sale rep parttim river front c...
34166,open,custom servic rep help peopl hear loss captioncal,custom servic rep help peopl hear loss,CaptionCall,captioncal commun assist imagin life without...,captioncal commun assist imagin life without...,custom servic rep help peopl hear loss caption...
...,...,...,...,...,...,...,...
66282,open,sale repres sale associ entri level vector market,sale repres sale associ entri level,Vector Marketing,nbsp eager learn opportun whether look partt...,nbsp eager learn opportun whether look partt...,sale repres sale associ entri level vector mar...
39515,open,staff account accountemp,staff account,Accountemps,ref id02120107460classificationaccount staffco...,ref id02120107460classificationaccount staffco...,staff account accountemp staff account Account...
69231,open,unarm secur offic us secur associ,unarm secur offic,US Security Associates,direct supervis unarm secur offic respons patr...,direct supervis unarm secur offic respons patr...,unarm secur offic us secur associ unarm secur ...
69618,open,line cook crown plaza independ own oper,line cook,Crowne Plaza- Independently Owned & Operated,summari respons maintain set food product qual...,summari respons maintain set food product qual...,line cook crown plaza independ own oper line c...


In [22]:
print(job_df['Title'].unique())

['site director knowledg univers' 'administr assist officeteam'
 'account manag chi payment system'
 'outsid wholesal sale rep parttim river front chrysler jeep dodg'
 'custom servic rep help peopl hear loss captioncal' 'bookkeep accountemp'
 'nuclear medicin tech prn hcaeast florida'
 'construct project admin need asap officeteam' 'account clerk accountemp'
 'cashier murphi usa'
 'caregiv home health aid cna home instead senior care'
 'hha cna need immedi bayada home health care'
 'oto sale advisor 224 twinsburg club demonstr servic'
 'behavior health nurs bhn mission healthcar servic inc'
 'recent gradsaccount financ accountemp'
 'custom servic repres sale experi prefer officeteam'
 'market sale repres weed man usa franchis'
 'staff account emerg softwar compani accountemp'
 'warehous stocker restaur depot'
 'automot titl support clerk part time jack cooper transport'
 'lpn oakview rehab nurs center'
 'oto culinari ambassador 39 washington club demonstr servic'
 'lm administr officet

In [23]:
def recommend(title):
    idx=job_df[job_df['Title']==title].index[0]
    idx=job_df.index.get_loc(idx)
    distances = sorted(list(enumerate(similarity[idx])),key=lambda x:x[1],reverse=True)[1:21]
     
    print(distances)
    jobs=[]
    for i in distances:
        job_info = {
            'Title': job_df.iloc[i[0]]['Title'],
            'Position': job_df.iloc[i[0]]['Position'],
            'Company': job_df.iloc[i[0]]['Company'],
            'Status': job_df.iloc[i[0]]['Status'],
            'Job Description': job_df.iloc[i[0]]['Job.Description']
        }
        #print(job_info)
        jobs.append(job_info)
    
    return jobs



In [24]:
recommend('account manag chi payment system')

[(383, np.float64(0.9496440937426611)), (872, np.float64(0.5352879938074053)), (501, np.float64(0.23168821254943936)), (415, np.float64(0.20482781816723114)), (874, np.float64(0.19311706608136958)), (359, np.float64(0.1880466780236648)), (594, np.float64(0.18200611088429028)), (821, np.float64(0.18131232089211247)), (764, np.float64(0.16486071558176807)), (712, np.float64(0.13658657095934923)), (533, np.float64(0.1306285106559144)), (448, np.float64(0.12690715373513498)), (424, np.float64(0.12166897081870358)), (29, np.float64(0.12157788337075755)), (173, np.float64(0.12157788337075755)), (175, np.float64(0.12157788337075755)), (277, np.float64(0.12157788337075755)), (314, np.float64(0.12157788337075755)), (328, np.float64(0.12157788337075755)), (414, np.float64(0.12157788337075755))]


[{'Title': 'entri level sale repres chi payment system',
  'Position': 'entri level sale repres',
  'Company': 'CHI Payment Systems',
  'Status': 'open',
  'Job Description': 'yoursquor energet motiv hardwork look prosper occup chi payment system exactli look employ career chi payment system make dream own busi earn sixfigur incom realiti nbsp nbsp chi payment system util independ sale agent across unit state set new exist busi abil accept credit card transactionsnbsp juggernaut merchant servic industri enabl agent offer competit rate fee prospect merchantsnbsp also provid superior level support valu busi set merchant servicesnbspnbsp chi payment system compris season manag team two decad experi merchant servic industrynbsp dedic train support motiv sale agentsnbsp arm agent skill techniqu need domin competitionnbsp also incentiv agent extrem aggress commiss structur includ bonus addit residu payout account set success success independ agent recruit depart open fromnbsp7am 6 pm pacif t

In [25]:
import pickle
pickle.dump(job_df,open('df.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))

In [26]:
import pandas as pd
print(pd.__version__)


2.2.3


In [27]:
job_df


Unnamed: 0,Status,Title,Position,Company,Job.Description,clean_text,combined_text
64119,open,site director knowledg univers,site director,Knowledge Universe,job summari knowledg univers ku site director ...,job summari knowledg univers ku site director ...,site director knowledg univers site director K...
35827,open,administr assist officeteam,administr assist,OfficeTeam,ref 03110118480classif secretaryadmin asstcomp...,ref 03110118480classif secretaryadmin asstcomp...,administr assist officeteam administr assist O...
72100,open,account manag chi payment system,account manag,CHI Payment Systems,yoursquor energet motiv hardwork look prosper ...,yoursquor energet motiv hardwork look prosper ...,account manag chi payment system account manag...
46355,open,outsid wholesal sale rep parttim river front c...,outsid wholesal sale rep parttim,River Front Chrysler Jeep Dodge,outsid wholesal sale rep job respons sale repr...,outsid wholesal sale rep job respons sale repr...,outsid wholesal sale rep parttim river front c...
34166,open,custom servic rep help peopl hear loss captioncal,custom servic rep help peopl hear loss,CaptionCall,captioncal commun assist imagin life without...,captioncal commun assist imagin life without...,custom servic rep help peopl hear loss caption...
...,...,...,...,...,...,...,...
66282,open,sale repres sale associ entri level vector market,sale repres sale associ entri level,Vector Marketing,nbsp eager learn opportun whether look partt...,nbsp eager learn opportun whether look partt...,sale repres sale associ entri level vector mar...
39515,open,staff account accountemp,staff account,Accountemps,ref id02120107460classificationaccount staffco...,ref id02120107460classificationaccount staffco...,staff account accountemp staff account Account...
69231,open,unarm secur offic us secur associ,unarm secur offic,US Security Associates,direct supervis unarm secur offic respons patr...,direct supervis unarm secur offic respons patr...,unarm secur offic us secur associ unarm secur ...
69618,open,line cook crown plaza independ own oper,line cook,Crowne Plaza- Independently Owned & Operated,summari respons maintain set food product qual...,summari respons maintain set food product qual...,line cook crown plaza independ own oper line c...
