In [19]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, balanced_accuracy_score, jaccard_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score

#ML Model
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

#joblib
import joblib

In [20]:
df=pd.read_csv('./dataset/job/job_skills_clean.csv')
df.head()
df.shape

(1235, 12)

In [21]:
# Combining two requirement
df['Qualifications'] = df['Minimum_Qualifications_clean'] + ' ' + df['Preferred_Qualifications_clean']

In [22]:
# Removing duplicate words
def remove_dups(words):
    word_token = word_tokenize(words)
    ordered_token = set()
    result=[]
    for word in word_token:
        if word not in ordered_token:
            ordered_token.add(word)
            result.append(word)
    return ' '.join(result)

In [23]:
#apply stemming
def stemming(words):
    ps = PorterStemmer()
    ordered_token = set()
    result=[]
    for word in word_tokenize(words):
        w = ps.stem(word)
        if w not in ordered_token:
            ordered_token.add(w)
            result.append(w)
    return ' '.join(result)

In [24]:
df['Qualifications_unique'] = df['Qualifications'].apply(remove_dups)

In [25]:
df['Qualifications_stem'] = df['Qualifications'].apply(stemming)

### The Best (default) Model is LinearSVC with "Qualifications" Features

In [26]:
cols=['Minimum_Qualifications', 'Preferred_Qualifications', 'Minimum_Qualifications_clean', 'Preferred_Qualifications_clean', 'Qualifications', 'Qualifications_unique', 'Qualifications_stem']
target=['Category']
scoring=['balanced_accuracy', 'f1_macro', 'jaccard_macro']
classifier = {
    'lsvc': LinearSVC(),
    'dt': DecisionTreeClassifier(),
    'rf': RandomForestClassifier(),
    'svc': SVC()
}


In [27]:
c ={i:[0]*len(list(classifier.keys())) for i in cols}
for y in scoring:
    vars()[f'df_{y}'] = pd.DataFrame(c,index=list(classifier.keys()))
    for j in cols:
        for i in classifier:
            pipe = make_pipeline(
                CountVectorizer(),
                TfidfTransformer(),
                classifier[i]
            )
            score = cross_val_score(pipe, df[j], df[target], scoring=y)
            vars()[f'df_{y}'].loc[i,j] = np.mean(score)


In [28]:
df_jaccard_macro.T

Unnamed: 0,lsvc,dt,rf,svc
Minimum_Qualifications,0.591487,0.464033,0.552621,0.515147
Preferred_Qualifications,0.637566,0.473251,0.609061,0.539063
Minimum_Qualifications_clean,0.59167,0.479003,0.546703,0.522432
Preferred_Qualifications_clean,0.64014,0.480406,0.63176,0.545649
Qualifications,0.6888,0.515899,0.636795,0.575789
Qualifications_unique,0.690955,0.496689,0.619425,0.562697
Qualifications_stem,0.687512,0.50949,0.615283,0.561151


In [29]:
df_balanced_accuracy.T

Unnamed: 0,lsvc,dt,rf,svc
Minimum_Qualifications,0.700645,0.6104,0.650852,0.58673
Preferred_Qualifications,0.736206,0.61787,0.682969,0.598041
Minimum_Qualifications_clean,0.706976,0.606171,0.656604,0.593596
Preferred_Qualifications_clean,0.739399,0.619842,0.709295,0.604274
Qualifications,0.780606,0.659446,0.726785,0.637359
Qualifications_unique,0.77979,0.624856,0.702159,0.620711
Qualifications_stem,0.773403,0.652312,0.711962,0.621308


In [30]:
df_f1_macro.T

Unnamed: 0,lsvc,dt,rf,svc
Minimum_Qualifications,0.69464,0.572428,0.649672,0.613846
Preferred_Qualifications,0.738339,0.582662,0.719869,0.63053
Minimum_Qualifications_clean,0.69546,0.59429,0.65208,0.62056
Preferred_Qualifications_clean,0.740084,0.579159,0.71539,0.636532
Qualifications,0.779049,0.64229,0.740633,0.669879
Qualifications_unique,0.780376,0.615939,0.717621,0.650701
Qualifications_stem,0.77707,0.636936,0.715753,0.65214


In [31]:
model = LinearSVC()
print(model)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)


In [32]:
param={
    'lsvc__penalty': ['l1', 'l2'],
    'lsvc__loss': ['hinge', 'squared_hinge'],
    'lsvc__dual': [True, False],
    'lsvc__multi_class': ['ovr', 'crammer_singer'],
    'lsvc__fit_intercept': [True, False],
    'lsvc__max_iter': [10, 100, 1000]
}

pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lsvc', LinearSVC())
])
clf = GridSearchCV(pipe, param)
clf.fit(df['Qualifications'], df['Category'])

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cv',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        pre

In [33]:
clf.best_params_

{'lsvc__dual': True,
 'lsvc__fit_intercept': True,
 'lsvc__loss': 'squared_hinge',
 'lsvc__max_iter': 10,
 'lsvc__multi_class': 'ovr',
 'lsvc__penalty': 'l2'}

In [34]:
pipe=Pipeline(steps=[
    ('cv', CountVectorizer(ngram_range=(1,3))),
    ('tfidf', TfidfTransformer()),
    ('lsvc', LinearSVC(loss='squared_hinge', penalty='l2', max_iter=10))
])
score = cross_val_score(pipe, df['Qualifications'], df['Category'], scoring='balanced_accuracy')
print(np.mean(score))

0.782083902592722


In [35]:
pipe.fit(df['Qualifications'], df['Category'])

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('lsvc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                      

In [36]:
filename = 'linearsvc_model.sav'
joblib.dump(pipe, filename)

['linearsvc_model.sav']

In [40]:
df.drop(columns='Unnamed: 0', inplace=True)
df.to_csv('./dataset/job/job_skills_clean_final.csv')