In [1]:
#Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

In [2]:
#Import dataset
dataset_v1 = pd.read_csv('Glassdoor Gender Pay Gap.csv')

#Locating indexes
X = dataset_v1.iloc[:, :-2].values

#The depend variable is salary
y = dataset_v1.iloc[:, -2].values


In [3]:
#Count unique values prior to encoding
#Categorical independent variables will need to be encoded: JobTitle, Gender, Education, Dept
dataset_v1.nunique()

JobTitle      10
Gender         2
Age           48
PerfEval       5
Education      4
Dept           5
Seniority      5
BasePay      992
Bonus        941
dtype: int64

In [4]:
dataset_v1.head()

Unnamed: 0,JobTitle,Gender,Age,PerfEval,Education,Dept,Seniority,BasePay,Bonus
0,Graphic Designer,Female,18,5,College,Operations,2,42363,9938
1,Software Engineer,Male,21,5,College,Management,5,108476,11128
2,Warehouse Associate,Female,19,4,PhD,Administration,5,90208,9268
3,Software Engineer,Male,20,5,Masters,Sales,4,108080,10154
4,Graphic Designer,Male,26,5,Masters,Engineering,5,99464,9319


In [5]:
#Encoding the categorical variables - JobTitle - 1st Column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[0])], remainder='passthrough')
X = ct.fit_transform(X)


In [6]:
#Checking encoding: JobTitle
print(X[:] [0])

[0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 'Female' 18 5 'College'
 'Operations' 2]


In [7]:
#Encoding the categorical variables - Gender - 11th column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
#Gender is performed on the wrong column here - we have to check how many job titles there are....
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[10])], remainder='passthrough')
X = ct.fit_transform(X)

In [8]:
#Checking
print(X[:] [0])


[1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 18 5 'College'
 'Operations' 2]


In [9]:
#Encoding the categorical variables - Education - 15th Column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
#Gender is performed on the wrong column here - we have to check how many job titles there are....
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[14])], remainder='passthrough')
X = ct.fit_transform(X)

In [10]:
#Checking
print(X[:] [0])

[1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 18 5
 'Operations' 2]


In [11]:
#Encoding the categorical variables - Department - 19th Column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
#Gender is performed on the wrong column here - we have to check how many job titles there are....
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[18])], remainder='passthrough')
X = ct.fit_transform(X)

In [12]:
#Checking
#Column Labels:
#Department: 1:5
#Education: 6:9
#Gender: 10:11
#JobTitle: 12:21
#Age: 22
#PerfEval: 23
#Seniority: 24

print(X[:] [0])

[0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
 0.0 0.0 0.0 18 5 2]


In [13]:
#Splitting data into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [19]:
#Training the model
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

In [20]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 85253.   80789. ]
 [ 80494.   57500. ]
 [116751.  130251. ]
 [135781.  126269. ]
 [ 63695.   57672. ]
 [128737.5 106045. ]
 [ 74938.   88566. ]
 [ 84725.   61164. ]
 [ 75397.   84930. ]
 [111041.  102745. ]
 [ 82790.   78132. ]
 [111896.  127013. ]
 [ 34208.   38613. ]
 [112392.  109345. ]
 [148178.  121506. ]
 [ 78623.   71105. ]
 [ 62600.   69928. ]
 [103133.  110049. ]
 [114479.   95584. ]
 [ 96061.  103384. ]
 [100183.  103007. ]
 [ 62753.   91566. ]
 [100433.   78750. ]
 [101173.   93306. ]
 [101249.  109645. ]
 [103156.  114621. ]
 [ 83031.   81503. ]
 [ 92067.  109369. ]
 [141005.  124847. ]
 [ 76654.  108476. ]
 [121589.   83308. ]
 [ 91049.   96584. ]
 [109623.   89225. ]
 [114693.  104290. ]
 [ 69734.   62377. ]
 [121151.  100819. ]
 [ 85948.   87314. ]
 [133910.  150467. ]
 [ 93855.   86375. ]
 [ 51906.   56318. ]
 [ 79208.   65711. ]
 [ 95798.  108977. ]
 [108778.  129620. ]
 [ 66611.  119522. ]
 [ 64468.   67749. ]
 [105601.   99134. ]
 [ 55885.   39741. ]
 [ 79707.   9

In [21]:
#Evaluating Model Performance
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)
#Based on this result, we see that regression tree is not a reliable model for salary prediction

0.5587732044168616

In [22]:
#Make a prediction for Software Engineer, Female, 50, 5, Masters, Operations, 3
print(regressor.predict([[1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 50, 5, 3]]))

[106279.]
