In [2]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import regex as re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

sns.set_style('whitegrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [3]:
df1 = pd.read_pickle('./sg_my_indeed_data_salaried_jobs')
df1.reset_index(drop=True, inplace=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3553 entries, 0 to 3552
Data columns (total 5 columns):
job_category        3553 non-null object
job_title           3553 non-null object
company_name        3553 non-null object
summary             3553 non-null object
salary_high_tier    3553 non-null int64
dtypes: int64(1), object(4)
memory usage: 138.9+ KB


In [81]:
data_jobs = df1[df1.job_title.str.contains('data')]
non_data_jobs = df1[~df1.job_title.str.contains('data')]
data_jobs.shape

(675, 5)

In [82]:
ana_jobs = non_data_jobs[non_data_jobs.job_title.str.contains('analyst')]
non_ana_jobs = non_data_jobs[~non_data_jobs.job_title.str.contains('analyst')]
ana_jobs.shape

(857, 5)

In [83]:
eng_jobs = non_ana_jobs[(non_ana_jobs.job_title.str.contains('engineer')) | (non_ana_jobs.job_title.str.contains('database'))]
non_eng_jobs = non_ana_jobs[(~non_ana_jobs.job_title.str.contains('engineer')) & (~non_ana_jobs.job_title.str.contains('database'))]
eng_jobs.shape

(709, 5)

In [84]:
man_jobs = non_eng_jobs[non_eng_jobs.job_title.str.contains('manager')]
non_man_jobs = non_eng_jobs[~non_eng_jobs.job_title.str.contains('manager')]
man_jobs.shape

(435, 5)

In [89]:
df = pd.DataFrame()
data_jobs.job_title = data_jobs.job_title.map(lambda x: 'data_jobs')
ana_jobs.job_title = ana_jobs.job_title.map(lambda x: 'analyst_jobs')
eng_jobs.job_title = eng_jobs.job_title.map(lambda x: 'engineer_jobs')
man_jobs.job_title = man_jobs.job_title.map(lambda x: 'manager_jobs')
non_man_jobs.job_title = non_man_jobs.job_title.map(lambda x: 'other_jobs')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [90]:
df['job_titles'] = pd.concat([data_jobs.job_title,ana_jobs.job_title,
                        eng_jobs.job_title,man_jobs.job_title,
                        non_man_jobs.job_title], ignore_index=True)

In [93]:
final_df = pd.concat([df1[['company_name','summary','salary_high_tier']], df.job_titles], axis=1)

In [95]:
# convert job_title into numbers
final_df.job_titles = final_df.job_titles.map(lambda x: 1 if x == 'data_jobs' else
                                             2 if x == 'analyst_jobs' else
                                             3 if x == 'engineer_jobs' else
                                             4 if x == 'manager_jobs' else 5)

In [100]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3553 entries, 0 to 3552
Data columns (total 4 columns):
company_name        3553 non-null object
summary             3553 non-null object
salary_high_tier    3553 non-null int64
job_titles          3553 non-null int64
dtypes: int64(2), object(2)
memory usage: 111.1+ KB


In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [97]:
# Get TFIDF for company name
job_company_tvec = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_company_tvec.fit(final_df.company_name)
job_company_tvec_df = pd.DataFrame(job_company_tvec.transform(final_df.company_name).todense(),
                       columns=['company_[' + f + ']' for f in job_company_tvec.get_feature_names()])

In [105]:
# Get TFIDF for job summary
job_summary_tvec = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_summary_tvec.fit(final_df.summary)
job_summary_tvec_df = pd.DataFrame(job_summary_tvec.transform(final_df.summary).todense(),
                       columns=['summary_[' + f + ']' for f in job_summary_tvec.get_feature_names()])

In [106]:
X = pd.concat([final_df[['salary_high_tier']], job_company_tvec_df, job_summary_tvec_df], axis=1)
y = final_df['job_titles'].values.ravel()

In [107]:
# Get training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [108]:
# Standardize predictors
X_train_ss = StandardScaler().fit_transform(X_train)
X_test_ss = StandardScaler().fit_transform(X_test)

In [109]:
X_train_ss = pd.DataFrame(X_train_ss, columns=X_train.columns)
X_test_ss = pd.DataFrame(X_test_ss, columns=X_train.columns)

In [110]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C

parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=10, verbose=0)
lr_grid_search.fit(X_train_ss, y_train)
print ("Best parameters set:")
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))

GRID SEARCH:
Best parameters set:
	C: 0.0095454845666183372
	penalty: 'l2'
	solver: 'liblinear'


In [111]:
print ("Logistic Regression with best parameter:")
clf = LogisticRegression(**lr_best_parameters)
clf.fit(X_train_ss, y_train)
lr_gs_pred = clf.predict(X_test_ss)
print(metrics.classification_report(y_test, lr_gs_pred, labels=[1,2,3,4,5], target_names=['data jobs','analyst jobs','engineer jobs','manager jobs','other jobs']))

Logistic Regression with best parameter:
               precision    recall  f1-score   support

    data jobs       0.38      0.31      0.34       229
 analyst jobs       0.41      0.67      0.51       251
engineer jobs       0.46      0.65      0.54       195
 manager jobs       0.40      0.06      0.11       126
   other jobs       0.47      0.32      0.38       265

  avg / total       0.43      0.43      0.40      1066



In [112]:
from sklearn.tree import DecisionTreeClassifier

In [113]:
# gridsearch params
dtc_params = {
    'max_depth':[None,1,2,3,4],
    'max_features':[None,'log2','sqrt',2,3,4,5],
    'min_samples_split':[2,3,4,5,10,15,20,25,30,40,50]
}

# set the gridsearch
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_params, cv=5, verbose=1)

In [114]:
# use the gridsearch C model to fit the data
dtc_gs.fit(X_train_ss, y_train)

Fitting 5 folds for each of 385 candidates, totalling 1925 fits


[Parallel(n_jobs=1)]: Done 1925 out of 1925 | elapsed:   10.0s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 1, 2, 3, 4], 'max_features': [None, 'log2', 'sqrt', 2, 3, 4, 5], 'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [115]:
# Best Estimator
dtc_best = dtc_gs.best_estimator_
print(dtc_gs.best_params_)
print(dtc_gs.best_score_)

{'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 30}
0.416566143949


In [119]:
pred = dtc_best.predict(X_test_ss)
print(metrics.classification_report(y_test, pred, labels=[1,2,3,4,5], target_names=['data jobs','analyst jobs','engineer jobs','manager jobs','other jobs']))

               precision    recall  f1-score   support

    data jobs       0.36      0.30      0.33       229
 analyst jobs       0.37      0.56      0.44       251
engineer jobs       0.47      0.58      0.52       195
 manager jobs       0.20      0.17      0.18       126
   other jobs       0.44      0.25      0.32       265

  avg / total       0.38      0.38      0.37      1066



In [116]:
fi = pd.DataFrame({
        'feature':X_train_ss.columns,
        'importance':dtc_best.feature_importances_
    })

fi.sort_values('importance', ascending=False, inplace=True)
fi.head(10)

Unnamed: 0,feature,importance
28,summary_[analytics],0.143267
3,company_[bhd],0.0666
31,summary_[business],0.0584
27,summary_[analyst],0.049351
34,summary_[database],0.042419
36,summary_[engineer],0.04013
40,summary_[manager],0.035201
26,summary_[analysis],0.034207
13,company_[pte],0.034001
37,summary_[experience],0.032947


In [131]:
coef = lr_grid_search.best_estimator_.coef_

lr_coef_data = pd.DataFrame({'coef':coef[0],
                    'mag':np.abs(coef[0]),
                    'pred':X_test.columns})

lr_coef_analyst = pd.DataFrame({'coef':coef[1],
                    'mag':np.abs(coef[1]),
                    'pred':X_test.columns})

lr_coef_engineer = pd.DataFrame({'coef':coef[2],
                    'mag':np.abs(coef[2]),
                    'pred':X_test.columns})

lr_coef_manager = pd.DataFrame({'coef':coef[3],
                    'mag':np.abs(coef[3]),
                    'pred':X_test.columns})

lr_coef_other = pd.DataFrame({'coef':coef[4],
                    'mag':np.abs(coef[4]),
                    'pred':X_test.columns})

lr_coef_data.sort_values('mag', ascending=False, inplace=True)
lr_coef_analyst.sort_values('mag', ascending=False, inplace=True)
lr_coef_engineer.sort_values('mag', ascending=False, inplace=True)
lr_coef_manager.sort_values('mag', ascending=False, inplace=True)
lr_coef_other.sort_values('mag', ascending=False, inplace=True)

In [133]:
# Top predictors for data jobs
lr_coef_data.head(10)

Unnamed: 0,coef,mag,pred
28,-0.25778,0.25778,summary_[analytics]
36,-0.246339,0.246339,summary_[engineer]
34,-0.241629,0.241629,summary_[database]
40,-0.219162,0.219162,summary_[manager]
27,0.187889,0.187889,summary_[analyst]
31,-0.12953,0.12953,summary_[business]
9,-0.128303,0.128303,company_[malaysia]
35,-0.127638,0.127638,summary_[design]
18,0.103262,0.103262,company_[singapore]
14,-0.101301,0.101301,company_[sdn]


In [134]:
# Top predictors for analyst jobs
lr_coef_analyst.head(10)

Unnamed: 0,coef,mag,pred
28,-0.292979,0.292979,summary_[analytics]
34,0.237951,0.237951,summary_[database]
13,0.213333,0.213333,company_[pte]
27,-0.199669,0.199669,summary_[analyst]
36,0.198366,0.198366,summary_[engineer]
40,-0.198204,0.198204,summary_[manager]
9,-0.169721,0.169721,company_[malaysia]
31,-0.165022,0.165022,summary_[business]
17,0.148978,0.148978,company_[sg]
0,0.133071,0.133071,salary_high_tier


In [135]:
# Top predictors for engineer jobs
lr_coef_engineer.head(10)

Unnamed: 0,coef,mag,pred
28,0.541766,0.541766,summary_[analytics]
40,0.335892,0.335892,summary_[manager]
31,0.301222,0.301222,summary_[business]
36,-0.188644,0.188644,summary_[engineer]
34,-0.15094,0.15094,summary_[database]
33,0.144927,0.144927,summary_[data analytics]
37,0.11043,0.11043,summary_[experience]
26,0.093732,0.093732,summary_[analysis]
39,0.078805,0.078805,summary_[management]
3,-0.067987,0.067987,company_[bhd]


In [136]:
# Top predictors for manager jobs
lr_coef_manager.head(10)

Unnamed: 0,coef,mag,pred
28,-0.224331,0.224331,summary_[analytics]
13,-0.206087,0.206087,company_[pte]
31,-0.160932,0.160932,summary_[business]
34,-0.137449,0.137449,summary_[database]
27,0.133304,0.133304,summary_[analyst]
18,-0.123215,0.123215,company_[singapore]
36,-0.12255,0.12255,summary_[engineer]
0,0.102019,0.102019,salary_high_tier
40,-0.099448,0.099448,summary_[manager]
8,-0.09018,0.09018,company_[group]


In [137]:
# Top predictors for other jobs
lr_coef_other.head(10)

Unnamed: 0,coef,mag,pred
0,-0.252439,0.252439,salary_high_tier
36,0.201933,0.201933,summary_[engineer]
9,0.168598,0.168598,company_[malaysia]
3,0.161841,0.161841,company_[bhd]
18,-0.150087,0.150087,company_[singapore]
27,-0.138157,0.138157,summary_[analyst]
34,0.135403,0.135403,summary_[database]
31,0.131487,0.131487,summary_[business]
15,0.129281,0.129281,company_[sdn bhd]
14,0.129281,0.129281,company_[sdn]
