# Training ML Models for DS Salary Prediction 
by Dev Patel

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('DS_salary_data.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,age,python_yn,R_yn,spark,aws,excel,job_simp,seniority,desc_len,num_comp
0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,...,47,1,0,0,0,1,data scientist,na,2536,0
1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,...,36,1,0,0,0,0,data scientist,na,4783,0
2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,...,10,1,0,1,0,1,data scientist,na,3461,0
3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,...,55,1,0,0,0,0,data scientist,na,3883,3
4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,...,22,1,0,0,0,1,data scientist,na,2728,3


In [3]:
# Choosing columns relevant for model building 
# and dropping the rest of the columns

df.drop(['Job Title', 'Salary Estimate', 'Job Description', 'Company Name', 'Location', 'Headquarters', 'Founded', 
        'Competitors', 'min_salary', 'max_salary', 'company_txt', 'R_yn'], axis=1, inplace=True)

In [4]:
# Columns I will be using in model building
df.columns

Index(['Rating', 'Size', 'Type of ownership', 'Industry', 'Sector', 'Revenue',
       'hourly', 'employer_provided', 'avg_salary', 'job_state', 'same_state',
       'age', 'python_yn', 'spark', 'aws', 'excel', 'job_simp', 'seniority',
       'desc_len', 'num_comp'],
      dtype='object')

In [5]:
# Creating dummy variables for the categorical columns
df = pd.get_dummies(df)
df

Unnamed: 0,Rating,hourly,employer_provided,avg_salary,same_state,age,python_yn,spark,aws,excel,...,job_simp_analyst,job_simp_data engineer,job_simp_data scientist,job_simp_director,job_simp_manager,job_simp_mle,job_simp_na,seniority_jr,seniority_na,seniority_senior
0,3.8,0,0,72.0,0,47,1,0,0,1,...,0,0,1,0,0,0,0,0,1,0
1,3.4,0,0,87.5,0,36,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,4.8,0,0,85.0,1,10,1,1,0,1,...,0,0,1,0,0,0,0,0,1,0
3,3.8,0,0,76.5,1,55,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,2.9,0,0,114.5,1,22,1,0,0,1,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,3.9,0,0,84.5,0,190,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
738,4.4,0,0,102.5,0,14,1,1,1,0,...,0,1,0,0,0,0,0,0,0,1
739,2.6,0,0,73.5,1,36,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
740,3.2,0,0,127.5,0,-1,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0


In [6]:
# Train test split
from sklearn.model_selection import train_test_split
X = df.drop('avg_salary', axis=1)
y = df['avg_salary'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

**Multiple Linear Regression**

In [7]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [8]:
lr.fit(X_train, y_train)

LinearRegression()

In [9]:
from sklearn.model_selection import cross_val_predict
np.mean(cross_val_predict(lr,X_train,y_train, cv= 3))

100.76816537713535

In [25]:
mean_absolute_error(y_test, lr.predict(X_test))

16339438.562695945

In [10]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(lr,X_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3))

-21.45711869281797

**Random Forest**

In [11]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100)

In [12]:
rf.fit(X_train, y_train)

RandomForestRegressor()

In [13]:
from sklearn.model_selection import cross_val_predict
np.mean(cross_val_predict(rf,X_train,y_train, cv= 3))

101.67400179856114

In [14]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(rf,X_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3))

-14.990785914947205

In [15]:
from sklearn.model_selection import GridSearchCV
param = {'n_estimators':[150,200,250,300], 
         'criterion':('mse','mae'), 
         'max_features':('auto','sqrt','log2')}
gs_rf = GridSearchCV(rf, param, scoring='neg_mean_absolute_error', cv=3)
gs_rf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'criterion': ('mse', 'mae'),
                         'max_features': ('auto', 'sqrt', 'log2'),
                         'n_estimators': [150, 200, 250, 300]},
             scoring='neg_mean_absolute_error')

In [16]:
print('Best Estimator: ', gs_rf.best_estimator_)
print('Best Score: ', gs_rf.best_score_)

Best Estimator:  RandomForestRegressor(max_features='sqrt', n_estimators=150)
Best Score:  -14.718439342568374


**Support Vector Regressor**

In [17]:
from sklearn.svm import SVR
svr = SVR()

In [18]:
svr.fit(X_train, y_train)

SVR()

In [19]:
from sklearn.model_selection import cross_val_predict
np.mean(cross_val_predict(svr,X_train,y_train, cv= 3))

96.8669541644736

In [20]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(svr,X_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3))

-30.07067209803617

In [21]:
from sklearn.model_selection import GridSearchCV
param = {'C': [0.1,1, 10, 100], 
         'gamma': [1,0.1,0.01,0.001], 
         'kernel': ['rbf', 'sigmoid']} 
gs_svr = GridSearchCV(svr, param, scoring='neg_mean_absolute_error', cv=3)
gs_svr.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=SVR(),
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf', 'sigmoid']},
             scoring='neg_mean_absolute_error')

In [22]:
print('Best Estimator: ', gs_svr.best_estimator_)
print('Best Score: ', gs_svr.best_score_)

Best Estimator:  SVR(C=100, gamma=0.1)
Best Score:  -16.223511582199233


**Prediction:**

In [35]:
lr_pred = lr.predict(X_test)
rf_pred = gs_rf.best_estimator_.predict(X_test)
svr_pred = gs_svr.best_estimator_.predict(X_test)

In [36]:
from sklearn.metrics import mean_absolute_error
print('MAE for Multiple Linear Regression Model:', mean_absolute_error(y_test, lr_pred))
print('MAE for Random Forest Model:', mean_absolute_error(y_test, rf_pred))
print('MAE for Support Vector Regressor Model):', mean_absolute_error(y_test, svr_pred))

MAE for Multiple Linear Regression Model: 16339438.562695945
MAE for Random Forest Model: 13.472580645161289
MAE for Support Vector Regressor Model): 13.824538123216078


In [38]:
# Since MAE for the Linear Regression model is way too high
print('Avg MAE for RF and SVR Model):', mean_absolute_error(y_test, (rf_pred+svr_pred)/2))

Avg MAE for RF and SVR Model): 13.113145880990427
