In [1]:
import os 
import pandas as pd
from sklearn import preprocessing
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import texthero as hero

In [2]:
project_filepath = 'C:/Users/kaslou/Desktop/job-recommendation-system/'
data_filepath = 'C:/Users/kaslou/Desktop/job-recommendation-system/data'

In [3]:
clean_corpus_filepath = os.path.join(data_filepath, 'clean_corpus.csv')

In [4]:
df = pd.read_csv(clean_corpus_filepath, delimiter=',')

In [5]:
df.head()

Unnamed: 0,job_id,account_id,title,description,requirement_summary,user_keywords,employment_type,function,experience,education,collar_color,soft_skills,technical_skills
0,0,825,project coordin,open health bring togeth deep scientif knowled...,,project manag research public growthopportun m...,Full-time,Project Management,0.2,Bachelor's Degree,White,attent detail commun organ,public heor internet research support folder m...
1,1,1778,suppli chain specialist,droplett revolution drug deliveri build disrup...,,,Full-time,Supply Chain,0.3,Bachelor's Degree,White,interdisciplinari detail orient commun detail ...,needl free platform technolog materi suppli re...
2,2,1099,food beverag director,fotografiskaanyth ordinari museum fotografiska...,averag year creat manag food beverag oper high...,food beverag food beverag manag luxuri hospit ...,Full-time,Management,0.8,Bachelor's Degree,White,,forecast fine dine experi written document mai...
3,3,1207,qualiti assur specialist,acolad intern leader languag content solut ser...,univers degre busi manag administr engin equiv...,,Full-time,Quality Assurance,0.3,Bachelor's Degree,White,problem solv mindset time manag custom orient,iso six sigma black belt process improv busi p...
4,4,577,peopl oper associ sep,peopl oper associ respons provid effect effici...,ideal candid least year exposur support employ...,human resourc hr peopl oper,Full-time,Human Resources,0.2,Bachelor's Degree,White,self starter attent detail organiz approach,hr softwar human hr


In [6]:
encoders_filepath = os.path.join(project_filepath, 'encoders')
if not os.path.exists(encoders_filepath):
    os.mkdir(encoders_filepath)
    
columns_to_enumerate = ['employment_type', 'function', 'education', 'collar_color']
for col in columns_to_enumerate:
    le = preprocessing.LabelEncoder()
    df[col] = le.fit_transform(df[col].values)
    
    encoder_filename = 'encode_' + col + '.pickle'
    encoder_filename = os.path.join(encoders_filepath, encoder_filename)
    
    with open(encoder_filename, 'wb') as fout:
        pickle.dump(le, fout, pickle.HIGHEST_PROTOCOL)
    

In [7]:
df.head()

Unnamed: 0,job_id,account_id,title,description,requirement_summary,user_keywords,employment_type,function,experience,education,collar_color,soft_skills,technical_skills
0,0,825,project coordin,open health bring togeth deep scientif knowled...,,project manag research public growthopportun m...,1,26,0.2,0,1,attent detail commun organ,public heor internet research support folder m...
1,1,1778,suppli chain specialist,droplett revolution drug deliveri build disrup...,,,1,34,0.3,0,1,interdisciplinari detail orient commun detail ...,needl free platform technolog materi suppli re...
2,2,1099,food beverag director,fotografiskaanyth ordinari museum fotografiska...,averag year creat manag food beverag oper high...,food beverag food beverag manag luxuri hospit ...,1,20,0.8,0,1,,forecast fine dine experi written document mai...
3,3,1207,qualiti assur specialist,acolad intern leader languag content solut ser...,univers degre busi manag administr engin equiv...,,1,29,0.3,0,1,problem solv mindset time manag custom orient,iso six sigma black belt process improv busi p...
4,4,577,peopl oper associ sep,peopl oper associ respons provid effect effici...,ideal candid least year exposur support employ...,human resourc hr peopl oper,1,17,0.2,0,1,self starter attent detail organiz approach,hr softwar human hr


In [8]:
columns_to_count_frequencies = ['title', 'description', 'requirement_summary', 'user_keywords', 'soft_skills', 'technical_skills']

In [9]:
for col in columns_to_count_frequencies: # Convert NaN (floats) to a preserved keyword (string) to avoid future errors
    df[[col]] = df[[col]].astype(str)

In [10]:
vectorizers_filepath = os.path.join(project_filepath, 'vectorizers')
if not os.path.exists(vectorizers_filepath):
    os.mkdir(vectorizers_filepath)

merged_df = df.copy()
for col in columns_to_count_frequencies:
    df[col] = df[col].fillna('')
    
    if col in ['title', 'user_keywords']:
        max_df = 0.99
        tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1), max_df=max_df)
    elif col in ['description', 'requirement_summary', 'soft_skills', 'technical_skills']:
        max_df = 0.95
        min_df = 0.05
        tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1), max_df=max_df, min_df=min_df)
    
    tfidf_matrix = tfidf.fit_transform(df[col])
    print(tfidf_matrix.shape)
    
    vectorizer_filename = 'vectorize_' + col + '.pickle'
    vectorizer_filename = os.path.join(vectorizers_filepath, vectorizer_filename)
        
    with open(vectorizer_filename, 'wb') as fout:
        pickle.dump(tfidf, fout, pickle.HIGHEST_PROTOCOL)

    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names())
    merged_df = pd.concat([merged_df, tfidf_df], axis=1)

(22553, 4006)




(22553, 602)




(22553, 285)




(22553, 5750)




(22553, 30)




(22553, 46)




In [11]:
merged_df.head()

Unnamed: 0,job_id,account_id,title,description,requirement_summary,user_keywords,employment_type,function,experience,education,...,servic,social,softwar,suit,support,technolog,test,tool,web,word
0,0,825,project coordin,open health bring togeth deep scientif knowled...,,project manag research public growthopportun m...,1,26,0.2,0,...,0.0,0.217613,0.0,0.0,0.216736,0.0,0.0,0.0,0.0,0.192619
1,1,1778,suppli chain specialist,droplett revolution drug deliveri build disrup...,,,1,34,0.3,0,...,0.0,0.0,0.0,0.0,0.0,0.234396,0.0,0.0,0.0,0.0
2,2,1099,food beverag director,fotografiskaanyth ordinari museum fotografiska...,averag year creat manag food beverag oper high...,food beverag food beverag manag luxuri hospit ...,1,20,0.8,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1207,qualiti assur specialist,acolad intern leader languag content solut ser...,univers degre busi manag administr engin equiv...,,1,29,0.3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,577,peopl oper associ sep,peopl oper associ respons provid effect effici...,ideal candid least year exposur support employ...,human resourc hr peopl oper,1,17,0.2,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
vectorized_df = merged_df.copy()

for col in columns_to_count_frequencies:
    vectorized_df.drop(col, axis=1, inplace=True)

In [13]:
vectorized_df.head()

Unnamed: 0,job_id,account_id,employment_type,function,experience,education,collar_color,aapi,aav,aba,...,servic,social,softwar,suit,support,technolog,test,tool,web,word
0,0,825,1,26,0.2,0,1,0.0,0.0,0.0,...,0.0,0.217613,0.0,0.0,0.216736,0.0,0.0,0.0,0.0,0.192619
1,1,1778,1,34,0.3,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.234396,0.0,0.0,0.0,0.0
2,2,1099,1,20,0.8,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1207,1,29,0.3,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,577,1,17,0.2,0,1,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
vectorized_corpus_filepath = os.path.join(data_filepath, 'vectorized_corpus.csv')
vectorized_df.to_csv(vectorized_corpus_filepath, sep=',', index=False)