###### Packages

In [127]:
import pandas as pd


from nltk import word_tokenize
from nltk.corpus import stopwords
import nltk
import re
import string
stop_words = stopwords.words('english')
from collections import Counter

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


from sklearn.ensemble import VotingClassifier

In [13]:
data = pd.read_csv('Data.csv')

In [14]:
data.head()

Unnamed: 0,text,title
0,honda performance development inc a leader in ...,0
1,data lake architect w saas remote months contr...,1
2,senior software engineer qrypt is a post quant...,0
3,as a staff software engineer backend you will ...,0
4,we are looking for problem solvers with master...,1


###### About Dataset
| Columns | Descrtiption                |
| :---: |    :----             |
| text | Job's Description    |                      
| title | Job Title      |                      


###### About title 
| title | Value                 |
| :---: |    :----             |
| 0 | Software Engineer Jobs    |                      
| 1 | Data Scientist Jobs       |                      


###### Preprocessing Functions

In [None]:
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'[0-9]+', '', text)
    return text


# Remove stopwords
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

# Lemmatization
def lemma(text):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)
    lemmatizer=nltk.stem.WordNetLemmatizer()
    lem = " ".join(lemmatizer.lemmatize(token) for token in tokens)
    return lem

# Combining text
def combine_text(list_of_text):
    combined_text = ' '.join(str(list_of_text))
    return combined_text

# remove duplicates 
def setify(lis): 
    unique_list = []
    for x in lis:
    # check if exists in unique_list or not
        if x not in unique_list:
            unique_list.append(x)
    return unique_list

# text preprocessing function
def text_preprocessing(text):

    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    
    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    remove_stopwords = [w for w in tokenized_text if w not in stopwords.words('english')]
    
    listt = setify(remove_stopwords)

    combined_text = ' '.join(listt)
    lem = lemma(combined_text)

        
    return lem

###### Data split , feature and target variable

In [59]:
x = data['text']
y = data['title']

In [60]:
x.head()

0    honda performance development inc a leader in ...
1    data lake architect w saas remote months contr...
2    senior software engineer qrypt is a post quant...
3    as a staff software engineer backend you will ...
4    we are looking for problem solvers with master...
Name: text, dtype: object

In [61]:
y.head()

0    0
1    1
2    0
3    0
4    1
Name: title, dtype: int64

Check text preprocessing function

In [64]:
text_preprocessing(x[0])

'honda performance development inc leader motorsports racing seeking highly organized skilled software engineer santa clarita california north los angeles county location year b degree electrical engineering emphasis control system preferred experience use modelling specifically matlab simulink stateflow responsibility include pu ice es bm vehicle dynamic powertrain design implement robust model based algorithm clarity according given specification work stakeholder develop variety format unambiguously delineate required functionality including scope measurement calibration diagnostics well fault tolerance mechanism integrate prototype proposed solution larger system negotiating adaptation embedded target conformance established structural pattern necessary test alongside functional implementation provide greatest coverage possible reference final specification create high level written language description component congruent remediate defect deficiency existing documenting root cause 

In [65]:
x_pre = [text_preprocessing(i) for i in x]

In [66]:
x_pre[0]

'honda performance development inc leader motorsports racing seeking highly organized skilled software engineer santa clarita california north los angeles county location year b degree electrical engineering emphasis control system preferred experience use modelling specifically matlab simulink stateflow responsibility include pu ice es bm vehicle dynamic powertrain design implement robust model based algorithm clarity according given specification work stakeholder develop variety format unambiguously delineate required functionality including scope measurement calibration diagnostics well fault tolerance mechanism integrate prototype proposed solution larger system negotiating adaptation embedded target conformance established structural pattern necessary test alongside functional implementation provide greatest coverage possible reference final specification create high level written language description component congruent remediate defect deficiency existing documenting root cause 

In [67]:
x_pre[1]

'data lake architect w saas remote month contract hire need tech product exp within cloud environment year experience analytics architecture platform development developing solution key management platform tool including azure databricks apache spark snowflake machine learning stream etc transformation pipeline like r python c demonstrated ability lead influence team direct reporting relationship strong track record successfully architecting high growth technology product relevant big artificial intelligence enterprise system integration hybrid multi designing software service posse verbal written communication skill help interaction variety stakeholder ranging developer executive developed good facilitation negotiation presentation preferred pharmaceutical biotechnology life science healthcare industry ci cd devops jenkins education required bachelor degree technical discipline related business m computer science engineering matlen silver matter let driven delivered complex talent for

In [68]:
##Creating a backup, since the pre-processing step take a lot of time to compute
X_pre = pd.DataFrame(x_pre)
X_pre.to_csv('Preprocessed_features.csv',index=False)

###### Train-Test Split

In [69]:
print(len(data)*0.3)

704.4


In [114]:
X_train, X_test, y_train, y_test = train_test_split(X_pre, y, test_size=0.3, random_state=10)

In [115]:
print("X_train shape :",X_train.shape)
print("y_train shape :",y_train.shape)

print("X_test shape  :",X_test.shape)
print("y_test shape  :",y_test.shape)

X_train shape : (1643, 1)
y_train shape : (1643,)
X_test shape  : (705, 1)
y_test shape  : (705,)


###### Convert text to vector form

In [116]:
count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(X_train.stack())
test_vectors = count_vectorizer.transform(X_test.stack())

In [117]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(X_train.stack())
test_tfidf = tfidf.transform(X_test.stack())

###### Classification using Machine Learning models 

Logistic Regression, just comaparing count vectorizer and Tfidf

In [165]:
clf = LogisticRegression(C=1.0,random_state = 10)
log_cls =clf.fit(train_vectors, y_train)
y_pred = log_cls.predict(train_vectors)
print("Train Accuracy :",round(accuracy_score(y_train, y_pred),3))

Train Accuracy : 1.0


In [166]:
clf = LogisticRegression(C=1.0,random_state = 10)
log_cls =clf.fit(train_tfidf, y_train)
y_pred = log_cls.predict(train_tfidf)
print("Train Accuracy :",round(accuracy_score(y_train, y_pred),3))

Train Accuracy : 0.989


In [123]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC  
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV

In [124]:
#train classifier
model1 = SGDClassifier()
model2 = RandomForestClassifier()
model3 = XGBClassifier()
model4 = MLPClassifier() 
model5 = SVC()
model6 = LogisticRegression()

In [128]:
predictors=[('SGD',model1),('RandomForest',model2),('XGBoost',model3),('MLP',model4),('SVM',model5),('LogisticRegression',model6)]

VT=VotingClassifier(predictors)

In [139]:
#=======================================================================================
## SGD
#build the parameter grid

SGD_grid = [{'loss':['hinge', 'log'],
             'alpha':[1,0.01,0.1],
             'learning_rate' : ['optimal','adaptive']
            }]


#build a grid search to find the best parameters
gridsearchSGD = GridSearchCV(model1, SGD_grid, cv=3)

#run the grid search
gridsearchSGD.fit(train_vectors,y_train)
print("GridSearch  : SGD Done !")

#=======================================================================================
## Random Forest
#build the parameter grid

RF_grid = [{'n_estimators' : [100,200,300],
            'criterion' : ['gini', 'entropy'],
            'random_state' : [12, 26, 44]
           }]

#build a grid search to find the best parameters
gridsearchRF  = GridSearchCV(model2, RF_grid, cv=3)

#run the grid search
gridsearchRF.fit(train_vectors,y_train)
print("GridSearch  : RF Done !")
#=======================================================================================
## XGBoost
#build the parameter grid
XGB_grid = [{'booster' : ['gbtree', 'gblinear'],
            'max_depth' : [10,22],
             'n_estimators' : [100,200]
            }]


#build a grid search to find the best parameters
gridsearchXGB  = GridSearchCV(model3, XGB_grid, cv=3)

#run the grid search
gridsearchXGB.fit(train_vectors,y_train)
print("GridSearch  : XGB Done !")
#=======================================================================================
## Multi-Layer Perceptron
#build the parameter grid
MLP_grid = [{'hidden_layer_sizes' : [(100,), (50,40,30,20,20,10)],
            'solver' : ['adam' , 'sgd'],
             'activation' : ['relu','logistic'],
             'max_iter' : [100,300]
            }]

#build a grid search to find the best parameters
gridsearchMLP  = GridSearchCV(model4, MLP_grid, cv=3)

#run the grid search
gridsearchMLP.fit(train_vectors,y_train)
print("GridSearch  : MLP Done !")
#=======================================================================================
## SVM
#build the parameter grid
SVM_grid = [{'kernel' : ['rbf', 'sigmoid'],
            'gamma' : ['scale','auto'],
             'random_state' : [10,20,15]
            }]

#build a grid search to find the best parameters
gridsearchSVM  = GridSearchCV(model5,SVM_grid, cv=3)

#run the grid search
gridsearchSVM.fit(train_vectors,y_train)
print("GridSearch  : SVM Done !")




18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\skaks\miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\skaks\miniconda3\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 883, in fit
    return self._fit(
  File "C:\Users\skaks\miniconda3\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 649, in _fit
    self._validate_params()
  File "C:\Users\skaks\miniconda3\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 149, in _validate_params
   

GridSearch  : SGD Done !
GridSearch  : RF Done !
















































Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth" } might not be used.

  Th



Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






GridSearch  : XGB Done !




GridSearch  : MLP Done !
GridSearch  : SVM Done !


ValueError: Invalid parameter gamma for estimator LogisticRegression(). Check the list of available parameters with `estimator.get_params().keys()`.

Note : We mistakenly used the parameters of SVM on Logistic regression, So we have executed the Gridsearch for logistic regression below seperately

In [140]:
#=======================================================================================
## Logistic Regression
#build the parameter grid
LOG_grid = [{'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'max_iter' : [100,200]
            }]

#build a grid search to find the best parameters
gridsearchLOG  = GridSearchCV(model6, LOG_grid, cv=3)

#run the grid search
gridsearchLOG.fit(train_vectors,y_train)

print("GridSearch  : LogisticRegression Done !")
#=======================================================================================



GridSearch  : LogisticRegression Done !




##### Best parameters of GridSearch for each model

In [143]:
def show_best(gridsearchModel):
    for param_name in gridsearchModel.best_params_:
        print(param_name, gridsearchModel.best_params_[param_name])

In [144]:
show_best(gridsearchSGD)

alpha 0.01
learning_rate optimal
loss log


In [145]:
show_best(gridsearchRF)

criterion entropy
n_estimators 300
random_state 12


In [146]:
show_best(gridsearchXGB)

booster gbtree
max_depth 22
n_estimators 100


In [147]:
show_best(gridsearchMLP)

activation relu
hidden_layer_sizes (100,)
max_iter 300
solver sgd


In [148]:
show_best(gridsearchSVM)

gamma scale
kernel rbf
random_state 10


In [149]:
show_best(gridsearchLOG)

max_iter 100
solver saga


In [150]:
### Using the best parameter for each model
model1 = SGDClassifier(alpha = 0.01,learning_rate = 'optimal', loss ='log')
model2 = RandomForestClassifier(criterion = 'entropy',n_estimators =300, random_state =12 )
model3 = XGBClassifier(booster ='gbtree',max_depth =22,n_estimators =100  )
model4 = MLPClassifier(activation ='relu', hidden_layer_sizes =(100,), max_iter =300, solver ='sgd'  ) 
model5 = SVC(gamma ='scale',kernel ='rbf',random_state =10 )
model6 = LogisticRegression(max_iter =100, solver ='saga')

predictors=[('SGD',model1),('RandomForest',model2),('XGBoost',model3),('MLP',model4),('SVM',model5),('LogisticRegression',model6)]

VT=VotingClassifier(predictors)

###### Using Voting Classifier

Using TF-IDF Vectors 

In [152]:
VT.fit(train_tfidf,y_train)

#use the VT classifier to predict
predicted=VT.predict(test_tfidf)

#print the accuracy
print (accuracy_score(predicted,y_test))



0.9361702127659575


Using Count Vectors

In [153]:
VT.fit(train_vectors,y_train)

#use the VT classifier to predict
predicted=VT.predict(test_vectors)

#print the accuracy
print (accuracy_score(predicted,y_test))







0.9432624113475178


###### Checking test accuracy on each individual classifiers

In [156]:
## SGD Classifier
model = model1

model.fit(train_vectors,y_train)
predicted=model.predict(test_vectors)

print("SGD Classifier accuracy :", round(accuracy_score(predicted,y_test),3))

SGD Classifier accuracy : 0.948


In [158]:
## Random Forest Classifier
model = model2

model.fit(train_vectors,y_train)
predicted=model.predict(test_vectors)

print("Random Forest Classifier accuracy :", round(accuracy_score(predicted,y_test),3))

Random Forest Classifier accuracy : 0.936


In [159]:
## XGB Classifier
model = model3

model.fit(train_vectors,y_train)
predicted=model.predict(test_vectors)

print("XGB Classifier accuracy :", round(accuracy_score(predicted,y_test),3))



XGB Classifier accuracy : 0.938


In [160]:
## MLP Classifier
model = model4

model.fit(train_vectors,y_train)
predicted=model.predict(test_vectors)

print("MLP Classifier accuracy :", round(accuracy_score(predicted,y_test),3))

MLP Classifier accuracy : 0.946


In [161]:
## MLP Classifier wil more hidden layers
model = MLPClassifier(activation ='relu', hidden_layer_sizes =(50,40,30,20,20,10), max_iter =300, solver ='sgd' )


model.fit(train_vectors,y_train)
predicted=model.predict(test_vectors)

print("MLP Classifier accuracy :", round(accuracy_score(predicted,y_test),3))

MLP Classifier accuracy : 0.948


In [162]:
## SVM Classifier
model = model5

model.fit(train_vectors,y_train)
predicted=model.predict(test_vectors)

print("SVM Classifier accuracy :", round(accuracy_score(predicted,y_test),3))

SVM Classifier accuracy : 0.936


In [164]:
## Logistic Regression Classifier
model = model6

model.fit(train_vectors,y_train)
predicted=model.predict(test_vectors)

print("Logistic Regression Classifier accuracy :", round(accuracy_score(predicted,y_test),3))

Logistic Regression Classifier accuracy : 0.945




#### Result

<li>The voting classifier model gave an accuracy of  <b>94.32%</b> accuracy on test data
<li>The SGD classifier model gave an accuracy of  <b>94.80%</b> accuracy on test data
<li>The Random Forest classifier model gave an accuracy of  <b>93.60%</b> accuracy on test data
<li>The XGB classifier model gave an accuracy of  <b>93.8%</b> accuracy on test data
<li>The MLP Classifier model gave an accuracy of  <b>94.6%</b> accuracy on test data
<li>The MLP Classifier (with more hidden layers) model gave an accuracy of  <b>94.8%</b> accuracy on test data
<li>The SVM Classifier model gave an accuracy of  <b>93.6%</b> accuracy on test data
<li>The Logistic Regression Classifier model gave an accuracy of  <b>94.5%</b> accuracy on test data
 

<li>The best classifier models are <u><b>MLP Classifier (with more hidden layers)</b></u> and <u><b>SGD classifier</b></u> that gave an accuracy of <b>94.5%</b> on test data