> 
# Job titles to search for:
* data scientist
* data analyst
* data architect
* data engineer
* statistician
* database administrator
* business analyst
* data analytics manager
>
# Countries to search in:
* Singapore


In [None]:
# Each page has 15 job descriptions, 5 advertised jobs
# url = 'https://www.indeed.com.sg/jobs?q=data+scientist&l=Singapore&start='
# Assumption is that all job information gathered are data-related

In [3]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import regex as re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

sns.set_style('whitegrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# WEB SCRAPING MODULE

In [81]:
# Initialize search parameters and dataframe
# 'my',
country_set = ['sg','my']
search_string = ['data scientist', 'data analyst', 'data architect', 'data engineer',
       'statistician', 'database administrator', 'business analyst', 'data analytics manager']
columns = ["job_category","job_title", "company_name", "location", "summary", "salary"]

In [82]:
# Initialize container to store all job postings
jobs_list = []

# Iterate through search parameters and store relevant data in respective columns in dataframe
for country in country_set:
    for query in search_string:
        
        url = 'https://www.indeed.com.' + country + '/jobs?q=' + '+'.join([word for word in query.split()]) + '&start='
        time.sleep(1)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'lxml')
        jobs_count = soup.find_all(name='div', attrs={'id':'searchCount'})[0].get_text()
        
        # Get maximum number of jobs to iterate over all pages
        max_jobs = int(re.sub('[^0-9a-zA-Z]+', '', jobs_count.split()[-1]))
        
        for start_number in range(0,max_jobs,10):
            time.sleep(1)
            url_page = url + str(start_number)
            page = requests.get(url_page)
            soup = BeautifulSoup(page.text, 'lxml')
            
            # Get all advertised job descriptions
            regex = re.compile('.*row.*')
            jobs = soup.find_all(name='div', attrs={'class':regex})
            
            # Get job title from job description
            for job in jobs:

                job_title = job.find(name='a', attrs={'data-tn-element':'jobTitle'})
                company = job.find(name='span', attrs={'class':'company'})
                location = job.find(name='span', attrs={'class':'location'})
                summary = job.find(name='span', attrs={'class':'summary'})
                salary = job.find(name='span', attrs={'class':'no-wrap'})

                # Put default for missing variables
                if job_title != None:
                    job_title_result = job_title.get_text()
                    job_title_result = job_title_result.replace('\n','')
                    job_title_result = job_title_result.strip()
                else:
                    job_title_result = np.nan

                if company != None:
                    company_result = company.get_text()
                    company_result = company_result.replace('\n','')
                    company_result = company_result.strip()
                else:
                    company_result = np.nan

                if location != None:
                    location_result = location.get_text()
                    location_result = location_result.replace('\n','')
                    location_result = location_result.strip()
                else:
                    location_result = np.nan

                if summary != None:
                    summary_result = summary.get_text()
                    summary_result = summary_result.replace('\n','')
                    summary_result = summary_result.strip()
                else:
                    summary_result = np.nan

                if salary != None:

                    salary_result = salary.get_text()
                    salary_result = salary_result.replace('\n','')
                    salary_result = salary_result.strip()
                else:
                    salary_result = np.nan

                # Append to list
                job_category = '_'.join([word for word in query.split()])
                jobs_list.append([job_category,job_title_result, company_result, location_result, summary_result, salary_result])

# Convert jobs list to dataframe
df = pd.DataFrame(jobs_list, columns = columns)
# drop all duplicated job postings based on summary
df.drop_duplicates(subset=['summary'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7388 entries, 0 to 7387
Data columns (total 6 columns):
job_category    7388 non-null object
job_title       7388 non-null object
company_name    6245 non-null object
location        7388 non-null object
summary         6647 non-null object
salary          332 non-null object
dtypes: object(6)
memory usage: 346.4+ KB


# SAVE SCRAPED JOBS IN PICKLE

In [84]:
# Save dataframe in pickle to not waste time scrapping again
df.drop_duplicates(subset=['summary'], inplace=True)
df.to_pickle('sg_my_indeed_data_related_jobs')

# READ FILE & CLEAN DATA

In [4]:
df1 = pd.read_pickle('./sg_my_indeed_data_related_jobs')
df1.reset_index(drop=True, inplace=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5815 entries, 0 to 5814
Data columns (total 6 columns):
job_category    5815 non-null object
job_title       5815 non-null object
company_name    4945 non-null object
location        5815 non-null object
summary         5814 non-null object
salary          284 non-null object
dtypes: object(6)
memory usage: 272.7+ KB


In [5]:
# Drop location column, frequency too concentrated on a two values
df1.drop(labels=['location'], axis=1, inplace=True)

In [6]:
# Drop all null values except for salary
df1.dropna(subset=['company_name','summary'], inplace=True)

In [7]:
# convert all to small letters if string
df1 = df1.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)
# remove all non-alphabets
df1.job_title = df1.job_title.map(lambda x: re.sub(r'[^A-Za-z\s]','',x).strip())
df1.summary = df1.summary.map(lambda x: re.sub(r'[^A-Za-z\s]','',x).strip())
# remove business licence numbers
df1.company_name = df1.company_name.map(lambda x: x[:x.index(', ea licence')] if x.find(', ea licence') != -1 else x)

In [8]:
# filter jobs by data related terms
data_terms = ['data','analytics','intelligence','analysis','statistics','machine learning']
df1_summary_null = df1.summary.map(lambda x: x if any(x.find(t)>=0 for t in data_terms) else np.nan).isnull()
df1_job_title_null = df1.job_title.map(lambda x: x if any(x.find(t)>=0 for t in data_terms) else np.nan).isnull()
df1_mod = df1[(~df1_summary_null) | (~df1_job_title_null)]

In [9]:
df1_mod[~df1_mod.salary.isnull()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206 entries, 5 to 5806
Data columns (total 5 columns):
job_category    206 non-null object
job_title       206 non-null object
company_name    206 non-null object
summary         206 non-null object
salary          206 non-null object
dtypes: object(5)
memory usage: 9.7+ KB


In [10]:
# Extract those without salary data
df1_unsalaried = df1_mod[df1_mod.salary.isnull()]

In [11]:
# Extract those with salary data
df1_salaried = df1_mod[~df1_mod.salary.isnull()]
df1_salaried.reset_index(drop=True, inplace=True)

In [12]:
# Convert all salary into yearly format
salary_type = df1_salaried.salary.map(lambda x: 'rm' if x.find('rm') >= 0 else 'sgd')
salary_range = df1_salaried.salary.map(lambda x: re.sub('[^0-9\s]', '', ' '.join(re.findall(r'\d+(?:[\d,.]*\d)', x))))
salary_period = df1_salaried.salary.map(lambda x: x[x.find('month'):] if x.find('month') >= 0 else x[x.find('hour'):] if x.find('hour') >= 0 else 'year')

In [13]:
# Get live exchange rate
import requests as req

In [14]:
url = 'https://api.fixer.io/latest?base=SGD&symbols=MYR'
req_obj = req.get(url)
sgdmy_rate = req_obj.json()

In [15]:
# 1 SGD worth how much MYR
exch_rate = sgdmy_rate['rates']['MYR']

In [16]:
temp_sal = []

for i in range(0, len(salary_period)):
    if salary_period.loc[i] == 'month':
        if salary_type.loc[i] == 'rm':
            sal = int(salary_range[i].split()[0])/exch_rate
            temp_sal.append(sal*12)
        else:
            sal = int(salary_range[i].split()[0])
            temp_sal.append(sal*12)
    elif salary_period.loc[i] == 'hour':
        if salary_type.loc[i] == 'rm':
            sal = int(salary_range[i].split()[0])/exch_rate
            temp_sal.append(sal*2080)
        else:
            sal = int(salary_range[i].split()[0])
            temp_sal.append(sal*2080)
    else:
        if salary_type.loc[i] == 'rm':
            temp_sal.append(int(salary_range[i].split()[0])/exch_rate)
        else:
            temp_sal.append(int(salary_range[i].split()[0]))

salary_annual = pd.DataFrame(temp_sal, columns=['salary_annual'])

In [17]:
salary_annual = salary_annual.astype('int64')
# Classifiy Salary into high, low tier
salary_high_tier = salary_annual.applymap(lambda x: 1 if x > int(salary_annual.median()) else 0)

In [18]:
df1_salaried.drop(labels=['salary'], axis=1, inplace=True)
df1_salaried_mod = pd.concat([df1_salaried, salary_high_tier], axis=1)
df1_salaried_mod.rename(index=str, columns={'salary_annual': 'salary_high_tier'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
# We will use those jobs with description to predict those without
df1_salaried_mod.info()

<class 'pandas.core.frame.DataFrame'>
Index: 206 entries, 0 to 205
Data columns (total 5 columns):
job_category        206 non-null object
job_title           206 non-null object
company_name        206 non-null object
summary             206 non-null object
salary_high_tier    206 non-null int64
dtypes: int64(1), object(4)
memory usage: 9.7+ KB


# BAG OF WORDS MODELLING

In [20]:
# Question 1:

# Get TFIDF of job title, company, location, summary
# Use data with salary to predict those without
# Find out features with highest significant in distinguishing high vs low salary jobs
# Then collect TFIDF again for whole dataset and do second round of modelling
# Check to see whether top features are the same with round 1
# Features that appear highly significant in both rounds are the factors that are best at distinguishing
# high vs low salary

# For my study, i will generate features from my dataset using TFIDF

# Will use log reg and decision tree to predict, unless results really bad

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [22]:
# Get TFIDF for job summary
job_summary_tvec = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_summary_tvec.fit(df1_salaried_mod.summary)
job_summary_tvec_df = pd.DataFrame(job_summary_tvec.transform(df1_salaried_mod.summary).todense(),
                       columns=['summary_[' + f + ']' for f in job_summary_tvec.get_feature_names()])

In [23]:
# Get TFIDF for job title
job_title_tvec = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_title_tvec.fit(df1_salaried_mod.job_title)
job_title_tvec_df = pd.DataFrame(job_title_tvec.transform(df1_salaried_mod.summary).todense(),
                       columns=['title_[' + f + ']' for f in job_title_tvec.get_feature_names()])

In [24]:
# Get TFIDF for company name
job_company_tvec = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_company_tvec.fit(df1_salaried_mod.company_name)
job_company_tvec_df = pd.DataFrame(job_company_tvec.transform(df1_salaried_mod.company_name).todense(),
                       columns=['company_[' + f + ']' for f in job_company_tvec.get_feature_names()])

In [25]:
y_with_sal = df1_salaried_mod.salary_high_tier
X_with_sal = pd.concat([job_summary_tvec_df,job_title_tvec_df,job_company_tvec_df], axis=1)

In [26]:
# Get training and testing set
X_train, X_test, y_train, y_test = train_test_split(X_with_sal, y_with_sal, test_size=0.3, random_state=42)

In [27]:
# Standardize predictors
X_train_ss = StandardScaler().fit_transform(X_train)
X_test_ss = StandardScaler().fit_transform(X_test)

In [30]:
X_train_ss = pd.DataFrame(X_train_ss, columns=X_train.columns)
X_test_ss = pd.DataFrame(X_test_ss, columns=X_train.columns)

In [31]:
# Fit with plain logistic regression
lr = LogisticRegression()
lr.fit(X_train_ss, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
pred = lr.predict(X_test_ss)
score = metrics.f1_score(y_test, pred)
print(classification_report(y_test, pred))
print('f1-score:', score)

             precision    recall  f1-score   support

          0       0.75      0.60      0.67        35
          1       0.59      0.74      0.66        27

avg / total       0.68      0.66      0.66        62

f1-score: 0.655737704918


In [33]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C

parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=10, verbose=0)
lr_grid_search.fit(X_train_ss, y_train)
print ("Best parameters set:")
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))

GRID SEARCH:
Best parameters set:
	C: 1.0000000000000001e-05
	penalty: 'l2'
	solver: 'liblinear'


In [34]:
print ("Logistic Regression with best parameter:")
clf = LogisticRegression(**lr_best_parameters)
clf.fit(X_train_ss, y_train)
lr_gs_pred = clf.predict(X_test_ss)
print(metrics.classification_report(y_test, lr_gs_pred, labels=[1,0], target_names=['high salary','low salary']))

Logistic Regression with best parameter:
             precision    recall  f1-score   support

high salary       0.62      0.85      0.72        27
 low salary       0.84      0.60      0.70        35

avg / total       0.74      0.71      0.71        62



In [35]:
from sklearn.tree import DecisionTreeClassifier

In [36]:
# gridsearch params
dtc_params = {
    'max_depth':[None,1,2,3,4],
    'max_features':[None,'log2','sqrt',2,3,4,5],
    'min_samples_split':[2,3,4,5,10,15,20,25,30,40,50]
}

# set the gridsearch
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_params, cv=5, verbose=1)

In [37]:
# use the gridsearch C model to fit the data
dtc_gs.fit(X_train_ss, y_train)

Fitting 5 folds for each of 385 candidates, totalling 1925 fits


[Parallel(n_jobs=1)]: Done 1925 out of 1925 | elapsed:    4.1s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 1, 2, 3, 4], 'max_features': [None, 'log2', 'sqrt', 2, 3, 4, 5], 'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [38]:
# Best Estimator
dtc_best = dtc_gs.best_estimator_
print(dtc_gs.best_params_)
print(dtc_gs.best_score_)

{'max_depth': 4, 'max_features': 'log2', 'min_samples_split': 15}
0.784722222222


In [39]:
fi = pd.DataFrame({
        'feature':X_train_ss.columns,
        'importance':dtc_best.feature_importances_
    })

fi.sort_values('importance', ascending=False, inplace=True)
fi.head(10)

Unnamed: 0,feature,importance
0,summary_[analysis],0.430211
63,company_[pte],0.176147
53,company_[ask],0.164466
70,company_[services],0.123917
66,company_[resources sdn],0.105258
47,title_[support],0.0
52,company_[asia],0.0
51,company_[agensi pekerjaan],0.0
50,company_[agensi],0.0
49,title_[technical],0.0


In [40]:
coef = lr_grid_search.best_estimator_.coef_
lr_coef = pd.DataFrame({'coef':coef.ravel(),
                    'mag':np.abs(coef.ravel()),
                    'pred':X_test_ss.columns})

lr_coef.sort_values('mag', ascending=False, inplace=True)
lr_coef.head(10)

Unnamed: 0,coef,mag,pred
68,-0.00029,0.00029,company_[sdn]
55,-0.00029,0.00029,company_[bhd]
69,-0.00029,0.00029,company_[sdn bhd]
63,0.000278,0.000278,company_[pte]
0,-0.000187,0.000187,summary_[analysis]
51,-0.000158,0.000158,company_[agensi pekerjaan]
62,-0.000158,0.000158,company_[pekerjaan]
50,-0.000158,0.000158,company_[agensi]
4,-0.000152,0.000152,summary_[data analysis]
71,0.000147,0.000147,company_[singapore]


In [41]:
pred = dtc_best.predict(X_test_ss)
print(classification_report(y_test, pred, labels=[1,0], target_names=['high salary','low salary']))

             precision    recall  f1-score   support

high salary       0.48      0.93      0.63        27
 low salary       0.80      0.23      0.36        35

avg / total       0.66      0.53      0.48        62



In [42]:
# Decision Tree has a better score than log reg
# Will take Decision Tree as model to evaluate factors that impact salary

In [43]:
# Use decision tree to predict the rest of the dataset
df1_unsalaried.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3347 entries, 0 to 5814
Data columns (total 5 columns):
job_category    3347 non-null object
job_title       3347 non-null object
company_name    3347 non-null object
summary         3347 non-null object
salary          0 non-null object
dtypes: object(5)
memory usage: 156.9+ KB


In [44]:
# Get TFIDF for job title
job_title_tvec_unsal = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_title_tvec_unsal.fit(df1_unsalaried.job_title)
job_title_tvec_unsal_df = pd.DataFrame(job_title_tvec_unsal.transform(df1_unsalaried.job_title).todense(),
                       columns=['title_[' + f + ']' for f in job_title_tvec_unsal.get_feature_names()])

In [45]:
# Get TFIDF for job summary
job_summary_tvec_unsal = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_summary_tvec_unsal.fit(df1_unsalaried.summary)
job_summary_tvec_unsal_df = pd.DataFrame(job_summary_tvec_unsal.transform(df1_unsalaried.summary).todense(),
                       columns=['summary_[' + f + ']' for f in job_summary_tvec_unsal.get_feature_names()])

In [46]:
# Get TFIDF for company name
job_company_tvec_unsal = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_company_tvec_unsal.fit(df1_unsalaried.company_name)
job_company_tvec_unsal_df = pd.DataFrame(job_company_tvec_unsal.transform(df1_unsalaried.company_name).todense(),
                       columns=['company_[' + f + ']' for f in job_company_tvec_unsal.get_feature_names()])

In [47]:
X_without_sal = pd.concat([job_summary_tvec_unsal_df,job_title_tvec_unsal_df,job_company_tvec_unsal_df], axis=1)

In [48]:
# Standardize predictors
X_without_sal_ss = StandardScaler().fit_transform(X_without_sal)

In [49]:
X_without_sal_ss = pd.DataFrame(X_without_sal_ss, columns=X_without_sal.columns)

In [50]:
pred = dtc_best.predict(X_without_sal_ss)
df1_unsalaried.salary = pred
df1_unsalaried.rename(index=str, columns={"salary": "salary_high_tier"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [51]:
# Merge predicted with original
final_df = pd.concat([df1_unsalaried, df1_salaried_mod], axis=0, ignore_index=True)

In [52]:
# Get TFIDF for job title
job_title_tvec_final = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_title_tvec_final.fit(final_df.job_title)
job_title_tvec_final_df = pd.DataFrame(job_title_tvec_final.transform(final_df.job_title).todense(),
                       columns=['title_[' + f + ']' for f in job_title_tvec_final.get_feature_names()])

In [53]:
# Get TFIDF for job summary
job_summary_tvec_final = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_summary_tvec_final.fit(final_df.summary)
job_summary_tvec_final_df = pd.DataFrame(job_summary_tvec_final.transform(final_df.summary).todense(),
                       columns=['summary_[' + f + ']' for f in job_summary_tvec_final.get_feature_names()])

In [54]:
# Get TFIDF for company name
job_company_tvec_final = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2, max_df=0.5, max_features=25)
job_company_tvec_final.fit(final_df.company_name)
job_company_tvec_final_df = pd.DataFrame(job_company_tvec_final.transform(final_df.company_name).todense(),
                       columns=['company_[' + f + ']' for f in job_company_tvec_final.get_feature_names()])

In [55]:
X = pd.concat([job_title_tvec_final_df, job_summary_tvec_final_df, job_company_tvec_final_df], axis=1)
y = final_df.salary_high_tier

In [56]:
# Standardize predictors
Xs = StandardScaler().fit_transform(X)
Xs = pd.DataFrame(Xs, columns=X.columns)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.33, random_state=42)

In [58]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C

parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=10, verbose=0)
lr_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))

GRID SEARCH:
Best parameters set:
	C: 0.061359072734131694
	penalty: 'l1'
	solver: 'liblinear'


In [59]:
print("Logistic Regression with best param:")
clf = LogisticRegression(**lr_best_parameters)
clf.fit(X_train, y_train)
lr_gs_pred = clf.predict(X_test)
print(metrics.classification_report(y_test, lr_gs_pred, labels=[1,0], target_names=['high salary','low salary']))

Logistic Regression with best param:
             precision    recall  f1-score   support

high salary       0.97      0.99      0.98      1002
 low salary       0.92      0.85      0.88       171

avg / total       0.97      0.97      0.97      1173



In [60]:
# Gridsearch params for decision tree classifier
dtc_params = {
    'max_depth':[None,1,2,3,4],
    'max_features':[None,'log2','sqrt',2,3,4,5],
    'min_samples_split':[2,3,4,5,10,15,20,25,30,40,50]
}

# set the gridsearch
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_params, cv=5, verbose=1)

In [61]:
# use the gridsearch C model to fit the data
dtc_gs.fit(X_train, y_train)

Fitting 5 folds for each of 385 candidates, totalling 1925 fits


[Parallel(n_jobs=1)]: Done 1925 out of 1925 | elapsed:    9.0s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 1, 2, 3, 4], 'max_features': [None, 'log2', 'sqrt', 2, 3, 4, 5], 'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [62]:
# Best Estimator
dtc_best = dtc_gs.best_estimator_
print(dtc_gs.best_params_)
print(dtc_gs.best_score_)

{'max_depth': 3, 'max_features': None, 'min_samples_split': 10}
0.963025210084


In [63]:
pred = dtc_best.predict(X_test)
print(classification_report(y_test, pred, labels=[1,0], target_names=['high salary','low salary']))

             precision    recall  f1-score   support

high salary       0.97      0.99      0.98      1002
 low salary       0.96      0.84      0.90       171

avg / total       0.97      0.97      0.97      1173



In [64]:
fi = pd.DataFrame({
        'feature':X_train.columns,
        'importance':dtc_best.feature_importances_
    })

fi.sort_values('importance', ascending=False, inplace=True)
fi.head(10)

Unnamed: 0,feature,importance
25,summary_[analysis],0.765275
66,company_[sg],0.106094
53,company_[citi],0.105534
63,company_[sdn],0.019874
37,summary_[knowledge],0.003223
48,summary_[working],0.0
52,company_[bhd],0.0
51,company_[bank],0.0
50,company_[asia],0.0
49,summary_[years],0.0


In [65]:
lr_coef = pd.DataFrame({'coef':clf.coef_.ravel(),
                    'mag':np.abs(clf.coef_.ravel()),
                    'pred':X_test.columns})

lr_coef.sort_values('mag', ascending=False, inplace=True)
lr_coef.head(10)

Unnamed: 0,coef,mag,pred
25,-1.871381,1.871381,summary_[analysis]
53,-0.754058,0.754058,company_[citi]
66,-0.712999,0.712999,company_[sg]
64,-0.339217,0.339217,company_[sdn bhd]
30,0.12386,0.12386,summary_[business]
42,-0.089128,0.089128,summary_[responsible]
28,0.071538,0.071538,summary_[big]
13,0.059183,0.059183,title_[manager]
31,-0.044161,0.044161,summary_[data analysis]
62,0.017321,0.017321,company_[pte]


In [66]:
# Top Factors that affect high salary vs low salary are mostly the same
# Decision Tree is the better model

In [68]:
# pickle final_df for question 2
final_df.to_pickle('sg_my_indeed_data_salaried_jobs')