In [1]:
import pandas as pd
import numpy as np
import random
import warnings
from clean import clean_txt
from sklearn.feature_extraction.text import TfidfVectorizer

# Install an "ignore" filter so that warnings are not shown for the rest of the session.
# NOTE(review): a blanket filter like this may also hide pandas chained-assignment
# warnings from the column-assignment cells further down — consider restricting it
# to specific warning categories.
warnings.filterwarnings("ignore")

### Job_Views dataset

In [2]:
# Read the job-view log from disk and preview its first two records.
df_job_view = pd.read_csv(
    "./data/Job_Views.csv",
)
df_job_view.head(n=2)

Unnamed: 0,Applicant.ID,Job.ID,Title,Position,Company,City,State.Name,State.Code,Industry,View.Start,View.End,View.Duration,Created.At,Updated.At
0,10000,73666,Cashiers & Valets Needed! @ WallyPark,Cashiers & Valets Needed!,WallyPark,Newark,New Jersey,NJ,,2014-12-12 20:12:35 UTC,2014-12-12 20:31:24 UTC,1129.0,2014-12-12 20:12:35 UTC,2014-12-12 20:12:35 UTC
1,10000,96655,Macy's Seasonal Retail Fragrance Cashier - Ga...,Macy's Seasonal Retail Fragrance Cashier - Ga...,Macy's,Garden City,New York,NY,,2014-12-12 20:08:50 UTC,2014-12-12 20:10:15 UTC,84.0,2014-12-12 20:08:50 UTC,2014-12-12 20:08:50 UTC


In [26]:
# List the 'Position' values of every row logged for applicant 10000.
df_job_view.loc[
    df_job_view['Applicant.ID'] == 10000,
    'Position',
].values.tolist()

['Cashiers & Valets Needed!',
 "Macy's Seasonal Retail Fragrance Cashier  - Garden City, NY - Roosevelt Field"]

In [3]:
# Keep only the applicant key and the three columns used to build the text profile.
# .copy() makes the subset an independent frame, so the column assignments performed
# on it in the next cell write to this frame itself rather than to a slice of the
# frame loaded from disk.
df_job_view = df_job_view[['Applicant.ID', 'Position', 'Company', 'City']].copy()

# Count missing values per remaining column.
df_job_view.isna().sum()

Applicant.ID      0
Position          0
Company         580
City              0
dtype: int64

In [4]:
# Replace missing company names with a blank placeholder.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# df_job_view['Company'] operates on a temporary Series and, depending on the pandas
# version / Copy-on-Write mode, may leave the frame unchanged. A surviving NaN would
# turn the whole concatenated string below into NaN (and then into the text "nan").
df_job_view['Company'] = df_job_view['Company'].fillna(' ')

# Concatenate position, company and city into one string per view, pass it through
# clean_txt, then lower-case the result.
df_job_view['select_pos_com_city'] = (
    df_job_view['Position'] + " " + df_job_view['Company'] + " " + df_job_view['City']
).map(str).apply(clean_txt).str.lower()

# Keep only the applicant key and the cleaned text.
df_job_view = df_job_view[['Applicant.ID', 'select_pos_com_city']]
df_job_view.head()

Unnamed: 0,Applicant.ID,select_pos_com_city
0,10000,cashier valet need wallypark newark
1,10000,macys seasonal retail fragrance cashier garden...
2,10001,part time showroom sales cashier grizzly indus...
3,10002,event specialist part time advantage sales mar...
4,10002,bonefish kitchen staff bonefish grill greenville


Gộp tất cả thông tin log theo từng Applicant.ID

In [5]:
# Collapse the log to one row per applicant: all of an applicant's view texts are
# joined into a single space-separated string.
df_job_view = (
    df_job_view
    .groupby(by='Applicant.ID')['select_pos_com_city']
    .apply(' '.join)
    .reset_index()
)
df_job_view

Unnamed: 0,Applicant.ID,select_pos_com_city
0,42,movie extras actors model want san francisco p...
1,96,kitchen staff izakaya yuzuki san francisco ser...
2,153,valic financial advisor intern roseville aig c...
3,601,retail sales consultant retail bay area associ...
4,1877,sales associate see candy sunnyvale
...,...,...
3443,14546,personal driver chauffeur quest llc los angele...
3444,14547,part time front office professional audtalent ...
3445,14549,louis vuitton sales professional part time blo...
3446,14552,personal assistant administrative assistant qu...


In [6]:
# Per-column count of missing values in the aggregated job-view frame.
df_job_view.isnull().sum()

Applicant.ID           0
select_pos_com_city    0
dtype: int64

### Experience dataset

In [7]:
# Read the applicants' work-experience records and preview the first two rows.
df_experience = pd.read_csv(
    "./data/Experience.csv",
)
df_experience.head(n=2)

Unnamed: 0,Applicant.ID,Position.Name,Employer.Name,City,State.Name,State.Code,Start.Date,End.Date,Job.Description,Salary,Can.Contact.Employer,Created.At,Updated.At
0,10001,Account Manager / Sales Administration / Quali...,Barcode Resourcing,Bellingham,Washington,WA,2012-10-15,,,,,2014-12-12 20:10:02 UTC,2014-12-12 20:10:02 UTC
1,10001,Electronics Technician / Item Master Controller,Ryzex Group,Bellingham,Washington,WA,2001-12-01,2012-04-01,,,,2014-12-12 20:10:02 UTC,2014-12-12 20:10:02 UTC


In [8]:
# Keep only the applicant key and the position title.
# .copy() makes the subset an independent frame, so the column assignment performed
# on it in the next cell writes to this frame itself rather than to a slice of the
# frame loaded from disk.
df_experience = df_experience[['Applicant.ID', 'Position.Name']].copy()

# Count missing values per remaining column.
df_experience.isna().sum()

Applicant.ID       0
Position.Name    950
dtype: int64

In [9]:
# Fill missing position names with a blank placeholder, then clean every title.
# fillna() is chained into the expression and the result assigned back, instead of
# calling fillna(inplace=True) on the selected column: the in-place form operates on a
# temporary Series and, depending on the pandas version / Copy-on-Write mode, may leave
# the frame unchanged — in which case map(str) would turn each NaN into the text "nan".
df_experience['Position.Name'] = (
    df_experience['Position.Name'].fillna(' ').map(str).apply(clean_txt)
)
df_experience.head()

Unnamed: 0,Applicant.ID,Position.Name
0,10001,account manager sales administration quality a...
1,10001,electronics technician item master controller
2,10001,machine operator
3,10003,maintenance technician
4,10003,electrical helper


In [10]:
# NOTE(review): this cell repeats the preceding cell's cleaning step on the same column.
# It looks like a leftover duplicate — confirm clean_txt is idempotent, or drop the cell.
#
# fillna() is chained into the expression and the result assigned back, instead of
# calling fillna(inplace=True) on the selected column: the in-place form operates on a
# temporary Series and, depending on the pandas version / Copy-on-Write mode, may leave
# the frame unchanged.
df_experience['Position.Name'] = (
    df_experience['Position.Name'].fillna(' ').map(str).apply(clean_txt)
)
df_experience.head()

Unnamed: 0,Applicant.ID,Position.Name
0,10001,account manager sales administration quality a...
1,10001,electronics technician item master controller
2,10001,machine operator
3,10003,maintenance technician
4,10003,electrical helper


In [11]:
# Per-column count of missing values in the experience frame after cleaning.
df_experience.isnull().sum()

Applicant.ID     0
Position.Name    0
dtype: int64

Do 1 người có kinh nghiệm ở nhiều vị trí nên ta sẽ gộp tất cả các vị trí của từng người

In [12]:
# One row per applicant: every position title held by the applicant is joined into a
# single space-separated string.
df_experience = (
    df_experience
    .groupby(by='Applicant.ID')['Position.Name']
    .apply(' '.join)
    .reset_index()
)
df_experience

Unnamed: 0,Applicant.ID,Position.Name
0,2,writer uloop blog volunteer
1,3,prep cook server market intern
2,6,project assistant
3,8,deli clerk server cashier food prep order taker
4,11,cashier
...,...,...
3785,14637,
3786,14638,doorman bartender facilities department bartender
3787,14639,founder girl boss consultant tour guide studen...
3788,14642,owner adjunct faculty founder ceo vice preside...


## Position of Interest dataset

In [13]:
# Read the positions each applicant marked as being of interest and display the frame.
df_position = pd.read_csv(
    "./data/Positions_Of_Interest.csv",
)
df_position

Unnamed: 0,Applicant.ID,Position.Of.Interest,Created.At,Updated.At
0,10003,security officer,2014-12-12 21:20:54 UTC,2014-12-12 21:20:54 UTC
1,10007,Server,2014-08-14 15:56:42 UTC,2015-02-26 20:35:12 UTC
2,10007,Bartender,2014-08-14 15:56:44 UTC,2015-02-19 23:21:28 UTC
3,10008,Host,2014-08-14 15:56:42 UTC,2015-02-26 20:35:12 UTC
4,10008,Barista,2014-08-14 15:56:43 UTC,2015-02-18 02:35:06 UTC
...,...,...,...,...
6555,9995,Server,2014-08-14 15:56:42 UTC,2015-02-26 20:35:12 UTC
6556,9996,Security,2014-12-12 19:06:28 UTC,2014-12-12 19:06:28 UTC
6557,9997,Server,2014-08-14 15:56:42 UTC,2015-02-26 20:35:12 UTC
6558,9997,Barista,2014-08-14 15:56:43 UTC,2015-02-18 02:35:06 UTC


In [14]:
# Keep only the applicant key and the position of interest.
# .copy() makes the subset an independent frame, so the column assignment performed
# on it in the next cell writes to this frame itself rather than to a slice of the
# frame loaded from disk.
df_position = df_position[['Applicant.ID', 'Position.Of.Interest']].copy()

# Count missing values per remaining column.
df_position.isna().sum()

Applicant.ID            0
Position.Of.Interest    2
dtype: int64

In [15]:
# Fill missing positions of interest with a blank placeholder, then clean every value.
# fillna() is chained into the expression and the result assigned back, instead of
# calling fillna(inplace=True) on the selected column: the in-place form operates on a
# temporary Series and, depending on the pandas version / Copy-on-Write mode, may leave
# the frame unchanged — in which case map(str) would turn each NaN into the text "nan".
df_position['Position.Of.Interest'] = (
    df_position['Position.Of.Interest'].fillna(' ').map(str).apply(clean_txt)
)
df_position.head(10)

Unnamed: 0,Applicant.ID,Position.Of.Interest
0,10003,security officer
1,10007,server
2,10007,bartender
3,10008,host
4,10008,barista
5,10008,receptionist
6,10008,book keeper
7,10008,customer service rep
8,10012,server
9,10016,server


In [16]:
# Per-column count of missing values in the positions-of-interest frame after cleaning.
df_position.isnull().sum()

Applicant.ID            0
Position.Of.Interest    0
dtype: int64

Do 1 người có thể quan tâm tới nhiều vị trí nên ta sẽ gộp tất cả các vị trí của từng người

In [17]:
# One row per applicant: every position of interest is joined into a single
# space-separated string.
df_position = (
    df_position
    .groupby(by='Applicant.ID')['Position.Of.Interest']
    .apply(' '.join)
    .reset_index()
)
df_position.head()

Unnamed: 0,Applicant.ID,Position.Of.Interest
0,96,server
1,153,server host barista customer service rep sales...
2,256,server host receptionist book keeper customer ...
3,438,server host barista customer service rep
4,568,receptionist book keeper customer service rep


## Tạo user dataset

Merge 3 datasets

In [18]:
# Step 1: outer-join job views with experience on the applicant key, so applicants
# present in only one of the two frames are kept; cells missing on either side are
# filled with a blank string and the rows are ordered by applicant.
df_jobs_exp = (
    df_job_view
    .merge(df_experience, on='Applicant.ID', how='outer')
    .fillna(' ')
    .sort_values(by='Applicant.ID')
)

# Step 2: same treatment when adding the positions of interest.
df_jobs_exp_poi = (
    df_jobs_exp
    .merge(df_position, on='Applicant.ID', how='outer')
    .fillna(' ')
    .sort_values(by='Applicant.ID')
)
df_jobs_exp_poi.head()

Unnamed: 0,Applicant.ID,select_pos_com_city,Position.Name,Position.Of.Interest
0,2,,writer uloop blog volunteer,
1,3,,prep cook server market intern,
2,6,,project assistant,
3,8,,deli clerk server cashier food prep order taker,
4,11,,cashier,


In [19]:
# Build one text document per applicant from the three sources.
# A space separates every part: without one between the job-view text and the
# experience text, the last word of the former fuses with the first word of the latter
# into a single bogus token.
# Presumably clean_txt below normalizes any surplus whitespace left by the blank
# placeholders — TODO confirm.
df_jobs_exp_poi["text"] = (
    df_jobs_exp_poi["select_pos_com_city"]
    + " " + df_jobs_exp_poi["Position.Name"]
    + " " + df_jobs_exp_poi["Position.Of.Interest"]
)

# .copy() yields an independent frame, so the assignment to 'text' below writes to
# final_user_df itself rather than to a slice of df_jobs_exp_poi.
final_user_df = df_jobs_exp_poi[['Applicant.ID', 'text']].copy()
final_user_df['text'] = final_user_df['text'].apply(clean_txt)

# Persist the user corpus, then preview it. head() comes last so that the notebook
# actually renders it (only a cell's final expression is displayed).
final_user_df.to_csv('./clean_data/final_user_df.csv', index=False)
final_user_df.head()

In [27]:
# Load the two job tables used for the recommendation demo: the one stored under
# ./clean_data and the combined job postings under ./data.
final_df_job = pd.read_csv(
    './clean_data/final_df_jobs.csv',
)
combine_df = pd.read_csv(
    './data/Combined_Jobs_Final.csv',
)

In [32]:
# Project helpers used by the recommendation demo below.
from extract_feature import feature_extraction_user
from utils import get_recommendation

# Free-text query describing the user.
text = 'cuisinier'


In [33]:
# Run the project's feature extraction on the single-query list against the combined
# job postings; it returns the pair unpacked into `top` and `scores`.
# NOTE(review): the meaning of type=2 is defined in extract_feature — confirm there.
top, scores = feature_extraction_user(
    [text],
    combine_df,
    type=2,
)

Số document trong Job Corpus là: 84090
Số lượng từ trong Job Corpus là: 51497


In [34]:
# Display the recommendation table produced by the project helper.
# NOTE(review): in the saved output every row has score 1.0 and the jobs look unrelated
# to the query — worth checking how the query is vectorized and scored.
get_recommendation(
    top,
    combine_df,
    final_df_job,
    scores,
)

Unnamed: 0,ApplicantID,JobID,title,Position,Company,City,Job.Description,Employment.Type,score
0,,280421.0,"SLMC- RN, Emerg Dept @ HCA-East Florida","SLMC- RN, Emerg Dept",HCA-East Florida,Port Saint Lucie,"Job DescriptionSLMC- RN, Emerg Dept(Job Number...",Part-Time,1.0
1,,295494.0,Retail Sales Teammate @ Firestone Complete Aut...,Retail Sales Teammate,Firestone Complete Auto Care,Matthews,"Bridgestone Retail Operations, LLC employs ove...",Part-Time,1.0
2,,276659.0,RN- NICU .75fte (60hrs) nights St Joseph Medic...,RN- NICU .75fte (60hrs) nights St Joseph Medic...,Catholic Health Initiatives,Tacoma,JOB SUMMARY \r\n\r\nThis job is responsible fo...,Part-Time,1.0
3,,310413.0,"Registered Nurse, Medical Unit, PRN @ HCA-East...","Registered Nurse, Medical Unit, PRN",HCA-East Florida,Okeechobee,"Job DescriptionRegistered Nurse, Medical Unit,...",Per Diem,1.0
4,,137210.0,Temporary Drivers Needed! @ Volt Workforce Sol...,Temporary Drivers Needed!,Volt Workforce Solutions,Sewickley,PUT YOUR FUTURE\r\nIN DRIVE\r\n\r\nLooking for...,Seasonal/Temp,1.0
5,,267046.0,Licensed Practical Nurses (LPN's) - Now Accept...,Licensed Practical Nurses (LPN's) - Now Accept...,Trilogy Health Services,Howell,Overview\r\n\r\n\r\n\r\n\r\nFounded in Decembe...,Full-Time/Part-Time,1.0
6,,252102.0,IMMEDIATE NEED: Certified Nursing Assistant CN...,IMMEDIATE NEED: Certified Nursing Assistant CN...,CareSouth,Richmond,\r\nIMMEDIATE NEED: Certified Nurse Aides for ...,Part-Time,1.0
7,,265455.0,State Tested Nursing Assistant (STNA) @ Lincol...,State Tested Nursing Assistant (STNA),Lincoln Park Manor,Dayton,\r\n&nbsp;STNA\r\n\r\n\r\n\r\n\r\n\r\n\r\nComp...,Full-Time/Part-Time,1.0
8,,302295.0,Part Time 20 hour Teller - Eastern Hills Trans...,Part Time 20 hour Teller - Eastern Hills Trans...,KeyBank,Williamsville,ABOUT KEY: Cleveland-based KeyCorp is one of t...,Part-Time,1.0
9,,310813.0,Guest Services Coordinator (Seasonal - Full Ti...,Guest Services Coordinator (Seasonal - Full Time),Seaport RV Resort,Old Mystic,Are you a customer service all-star who is loo...,Seasonal/Temp,1.0


In [None]:
# Retail Sales Teammate