In [21]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, balanced_accuracy_score, jaccard_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score

#ML Model
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

#joblib
import joblib

In [22]:
df=pd.read_csv('./dataset/job/job_skills_clean.csv')
df.head()
df.shape

(1235, 12)

In [23]:
# Combining two requirement
df['Qualifications'] = df['Minimum_Qualifications_clean'] + ' ' + df['Preferred_Qualifications_clean']

In [24]:
# Removing duplicate words
def remove_dups(words):
    word_token = word_tokenize(words)
    ordered_token = set()
    result=[]
    for word in word_token:
        if word not in ordered_token:
            ordered_token.add(word)
            result.append(word)
    return ' '.join(result)

In [25]:
#apply stemming
def stemming(words):
    ps = PorterStemmer()
    ordered_token = set()
    result=[]
    for word in word_tokenize(words):
        w = ps.stem(word)
        if w not in ordered_token:
            ordered_token.add(w)
            result.append(w)
    return ' '.join(result)

In [26]:
df['Qualifications_unique'] = df['Qualifications'].apply(remove_dups)

In [27]:
df['Qualifications_stem'] = df['Qualifications'].apply(stemming)

### The Best (default) Model is LinearSVC with "Qualifications" Features

In [28]:
cols=['Minimum_Qualifications', 'Preferred_Qualifications', 'Minimum_Qualifications_clean', 'Preferred_Qualifications_clean', 'Qualifications', 'Qualifications_unique', 'Qualifications_stem']
target=['Category']
scoring=['balanced_accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted']
classifier = {
    'lsvc': LinearSVC(),
    'dt': DecisionTreeClassifier(),
    'rf': RandomForestClassifier(),
    'svc': SVC()
}


In [29]:
c ={i:[0]*len(list(classifier.keys())) for i in cols}
for y in scoring:
    vars()[f'df_{y}'] = pd.DataFrame(c,index=list(classifier.keys()))
    for j in cols:
        for i in classifier:
            pipe = make_pipeline(
                CountVectorizer(),
                TfidfTransformer(),
                classifier[i]
            )
            score = cross_val_score(pipe, df[j], df[target], scoring=y)
            vars()[f'df_{y}'].loc[i,j] = np.mean(score)


In [30]:
df_balanced_accuracy.T

Unnamed: 0,lsvc,dt,rf,svc
Minimum_Qualifications,0.700645,0.606385,0.652859,0.58673
Preferred_Qualifications,0.736206,0.613842,0.716719,0.598041
Minimum_Qualifications_clean,0.706976,0.609064,0.650512,0.593596
Preferred_Qualifications_clean,0.739399,0.613464,0.697115,0.604274
Qualifications,0.780606,0.658097,0.716395,0.637359
Qualifications_unique,0.77979,0.630645,0.707376,0.620711
Qualifications_stem,0.773403,0.662875,0.701607,0.621308


In [31]:
df_f1_weighted.T

Unnamed: 0,lsvc,dt,rf,svc
Minimum_Qualifications,0.733265,0.646111,0.700265,0.701774
Preferred_Qualifications,0.770261,0.64618,0.744536,0.707549
Minimum_Qualifications_clean,0.737593,0.656153,0.703717,0.707346
Preferred_Qualifications_clean,0.774892,0.664496,0.750376,0.715818
Qualifications,0.803285,0.70154,0.767164,0.739845
Qualifications_unique,0.800915,0.684753,0.758026,0.732824
Qualifications_stem,0.800251,0.696207,0.761186,0.731


In [32]:
df_precision_weighted.T

Unnamed: 0,lsvc,dt,rf,svc
Minimum_Qualifications,0.747057,0.664231,0.738489,0.751674
Preferred_Qualifications,0.78192,0.652179,0.766373,0.76997
Minimum_Qualifications_clean,0.748346,0.674125,0.726261,0.75357
Preferred_Qualifications_clean,0.786476,0.681033,0.771439,0.773954
Qualifications,0.810507,0.715087,0.791318,0.785775
Qualifications_unique,0.810795,0.711702,0.789479,0.780909
Qualifications_stem,0.812735,0.704713,0.781229,0.785616


In [33]:
df_recall_weighted.T

Unnamed: 0,lsvc,dt,rf,svc
Minimum_Qualifications,0.74251,0.651012,0.715789,0.718219
Preferred_Qualifications,0.778947,0.652632,0.751417,0.722267
Minimum_Qualifications_clean,0.747368,0.670445,0.710931,0.723077
Preferred_Qualifications_clean,0.782996,0.667206,0.766802,0.730364
Qualifications,0.811336,0.703644,0.785425,0.751417
Qualifications_unique,0.808907,0.694737,0.777328,0.748178
Qualifications_stem,0.808097,0.679352,0.77004,0.746559


In [34]:
model = LinearSVC()
print(model)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)


In [35]:
param={
    'lsvc__penalty': ['l1', 'l2'],
    'lsvc__loss': ['hinge', 'squared_hinge'],
    'lsvc__dual': [True, False],
    'lsvc__multi_class': ['ovr', 'crammer_singer'],
    'lsvc__fit_intercept': [True, False],
    'lsvc__max_iter': [10, 100, 1000]
}

pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lsvc', LinearSVC())
])
clf = GridSearchCV(pipe, param)
clf.fit(df['Qualifications'], df['Category'])

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cv',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        pre

In [36]:
clf.best_params_

{'lsvc__dual': True,
 'lsvc__fit_intercept': True,
 'lsvc__loss': 'squared_hinge',
 'lsvc__max_iter': 100,
 'lsvc__multi_class': 'ovr',
 'lsvc__penalty': 'l2'}

In [37]:
pipe=Pipeline(steps=[
    ('cv', CountVectorizer(ngram_range=(1,3))),
    ('tfidf', TfidfTransformer()),
    ('lsvc', LinearSVC(loss='squared_hinge', penalty='l2', max_iter=10))
])
score = cross_val_score(pipe, df['Qualifications'], df['Category'], scoring='balanced_accuracy')
print(np.mean(score))

0.7812856386231026


In [38]:
pipe.fit(df['Qualifications'], df['Category'])

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('lsvc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                      

In [39]:
filename = 'linearsvc_model.sav'
joblib.dump(pipe, filename)

['linearsvc_model.sav']

In [40]:
df.drop(columns='Unnamed: 0', inplace=True)
df.to_csv('./dataset/job/job_skills_clean_final.csv')