<font size="5">In this part, I will be predicting the Data Analyst Average Salary</font>

Importing Necessary Libaries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error  

Putting the csv data into a pandas DataFrame and checking the columns

In [2]:
predict = pd.read_csv('Cleaned_Data.csv', index_col=0)

In [3]:
predict.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Job_Seniority',
       'Avg_Salary', 'Job_State', 'HQ_Same_State', 'Age', 'Python', 'Tableau',
       'Spark', 'PowerBI', 'Excel', 'SQL'],
      dtype='object')

Converting categorical variables from the data frame into dummy/indicator variables

In [4]:
model_data = predict[['Rating', 'Size', 'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Job_Seniority',
       'Avg_Salary', 'Job_State', 'HQ_Same_State', 'Age', 'Python', 'Tableau',
       'Spark', 'PowerBI', 'Excel', 'SQL']]

In [5]:
dummy = pd.get_dummies(model_data)
dummy.head()

Unnamed: 0,Rating,Avg_Salary,Age,Python,Tableau,Spark,PowerBI,Excel,SQL,Size_1 to 50 employees,...,HQ_Same_State_NC,HQ_Same_State_NJ,HQ_Same_State_NY,HQ_Same_State_OH,HQ_Same_State_PA,HQ_Same_State_TX,HQ_Same_State_UT,HQ_Same_State_VA,HQ_Same_State_WA,HQ_Same_State_na
0,3.2,51.5,59.0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,3.8,51.5,127.0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,0
2,3.4,51.5,17.0,1,1,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,0
3,4.1,51.5,18.0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,3.9,51.5,11.0,1,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,0


Splitting the data into Features and Label

In [6]:
X = dummy.drop('Avg_Salary', axis=1)

In [7]:
y = dummy['Avg_Salary']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)

Create a list for the Mean Absolute Error, Mean Squared Error, and Root Mean Squared Error for each Machine Learning models

In [23]:
MAE = []
MSE = []
RMSE = []

Creating Machine Learning models

Gradient Boost Regression

In [24]:
gbr = GradientBoostingRegressor()
param_gbr = {'loss':('ls','lad'), 'n_estimators':np.arange(100,701,100), 'max_features':('auto','sqrt','log2')}

grid_gbr = GridSearchCV(estimator=gbr, param_grid=param_gbr, cv=10)

grid_gbr.fit(X_train, y_train)

print("Best Parameter: ", grid_gbr.best_params_)

Best Parameter:  {'loss': 'ls', 'max_features': 'auto', 'n_estimators': 100}


In [25]:
gbr_pred = grid_gbr.best_estimator_.predict(X_test)

MAE.append(mean_absolute_error(y_test, gbr_pred))
MSE.append(mean_squared_error(y_test, gbr_pred))
RMSE.append(np.sqrt(mean_squared_error(y_test, gbr_pred)))

Random Forest Regression

In [26]:
rfr = RandomForestRegressor()
param_rfr = {'n_estimators':np.arange(100,701,100), 'max_features':('auto','sqrt','log2')}

grid_rfr = GridSearchCV(estimator=rfr, param_grid=param_rfr, cv=10)

grid_rfr.fit(X_train, y_train)

print("Best Parameter: ", grid_rfr.best_params_)

Best Parameter:  {'max_features': 'sqrt', 'n_estimators': 500}


In [27]:
rfr_pred = grid_rfr.best_estimator_.predict(X_test)

MAE.append(mean_absolute_error(y_test, rfr_pred))
MSE.append(mean_squared_error(y_test, rfr_pred))
RMSE.append(np.sqrt(mean_squared_error(y_test, rfr_pred)))

Decision Tree Regression

In [28]:
dtr = DecisionTreeRegressor()

cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_validate(dtr, X, y, cv=cv,scoring=('neg_mean_absolute_error', 'neg_mean_squared_error'))

MAE.append(np.mean(np.absolute(scores['test_neg_mean_absolute_error'])))
MSE.append(np.mean(np.absolute(scores['test_neg_mean_squared_error'])))
RMSE.append(np.sqrt(np.mean(np.absolute(scores['test_neg_mean_squared_error']))))

Create a data frame containing the performance of each Machine Learning models

In [29]:
evaluation = pd.DataFrame({'MAE': MAE,
                           'MSE': MSE,
                           'RMSE': RMSE},
                           index=['Gradient Boost', 'Random Forest', 'Decision Tree'])

In [30]:
evaluation

Unnamed: 0,MAE,MSE,RMSE
Gradient Boost,15.747927,416.520403,20.408832
Random Forest,16.199642,445.512179,21.107159
Decision Tree,19.367477,732.444143,27.063705


As you can see it looks like Gradient Boost is the best performing algorithm for predicting the Average Salary