In [1]:
#Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

In [2]:
#Import dataset
dataset_v1 = pd.read_csv('Glassdoor Gender Pay Gap.csv')

#Locating indexes
X = dataset_v1.iloc[:, :-2].values

#The depend variable is salary
y = dataset_v1.iloc[:, -2].values


In [3]:
#Count unique values prior to encoding
#Categorical independent variables will need to be encoded: JobTitle, Gender, Education, Dept
dataset_v1.nunique()

JobTitle      10
Gender         2
Age           48
PerfEval       5
Education      4
Dept           5
Seniority      5
BasePay      992
Bonus        941
dtype: int64

In [4]:
dataset_v1.head()

Unnamed: 0,JobTitle,Gender,Age,PerfEval,Education,Dept,Seniority,BasePay,Bonus
0,Graphic Designer,Female,18,5,College,Operations,2,42363,9938
1,Software Engineer,Male,21,5,College,Management,5,108476,11128
2,Warehouse Associate,Female,19,4,PhD,Administration,5,90208,9268
3,Software Engineer,Male,20,5,Masters,Sales,4,108080,10154
4,Graphic Designer,Male,26,5,Masters,Engineering,5,99464,9319


In [5]:
#Encoding the categorical variables - JobTitle - 1st Column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[0])], remainder='passthrough')
X = ct.fit_transform(X)


In [6]:
#Checking encoding: JobTitle
print(X[:] [0])

[0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 'Female' 18 5 'College'
 'Operations' 2]


In [7]:
#Encoding the categorical variables - Gender - 11th column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
#Gender is performed on the wrong column here - we have to check how many job titles there are....
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[10])], remainder='passthrough')
X = ct.fit_transform(X)

In [8]:
#Checking
print(X[:] [0])


[1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 18 5 'College'
 'Operations' 2]


In [9]:
#Encoding the categorical variables - Education - 15th Column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
#Gender is performed on the wrong column here - we have to check how many job titles there are....
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[14])], remainder='passthrough')
X = ct.fit_transform(X)

In [10]:
#Checking
print(X[:] [0])

[1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 18 5
 'Operations' 2]


In [11]:
#Encoding the categorical variables - Department - 19th Column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
#Gender is performed on the wrong column here - we have to check how many job titles there are....
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[18])], remainder='passthrough')
X = ct.fit_transform(X)

In [12]:
#Checking
#Column Labels:
#Department: 1:5
#Education: 6:9
#Gender: 10:11
#JobTitle: 12:21
#Age: 22
#PerfEval: 23
#Seniority: 24

print(X[:] [0])

[0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
 0.0 0.0 0.0 18 5 2]


In [13]:
#Splitting data into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [14]:
#Training the model
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [15]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 79157.62  80789.  ]
 [ 75132.1   57500.  ]
 [116444.52 130251.  ]
 [117586.76 126269.  ]
 [ 53322.11  57672.  ]
 [118957.64 106045.  ]
 [ 87703.54  88566.  ]
 [ 74749.39  61164.  ]
 [ 79137.51  84930.  ]
 [102277.42 102745.  ]
 [ 95235.55  78132.  ]
 [130094.67 127013.  ]
 [ 38264.51  38613.  ]
 [104253.26 109345.  ]
 [127336.71 121506.  ]
 [ 79413.46  71105.  ]
 [ 80244.24  69928.  ]
 [112618.78 110049.  ]
 [ 94664.77  95584.  ]
 [ 93508.63 103384.  ]
 [109885.63 103007.  ]
 [ 85154.1   91566.  ]
 [ 84548.44  78750.  ]
 [101334.54  93306.  ]
 [108667.84 109645.  ]
 [ 91744.4  114621.  ]
 [ 90874.91  81503.  ]
 [103071.57 109369.  ]
 [118887.44 124847.  ]
 [103464.38 108476.  ]
 [103730.51  83308.  ]
 [ 91689.66  96584.  ]
 [ 92416.16  89225.  ]
 [ 96247.05 104290.  ]
 [ 69366.39  62377.  ]
 [100371.46 100819.  ]
 [ 97020.54  87314.  ]
 [149094.44 150467.  ]
 [ 81249.07  86375.  ]
 [ 66527.46  56318.  ]
 [ 66102.48  65711.  ]
 [106016.28 108977.  ]
 [111561.11 129620.  ]
 [103317.38

In [16]:
#Evaluating Model Performance
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8061085018482304

In [17]:
#Print equation
print(f"intercept: {regressor.intercept_}")
print(f"coefficients: {regressor.coef_}")

intercept: 24636.28465713936
coefficients: [ -2826.16   1033.81    757.81  -3223.42   4257.96  -2958.11  -3161.71
   2808.1    3311.72   -195.62    195.62  -2126.91  -5555.71   2153.83
  -5435.25  -3306.97  28760.19 -19810.01  -2102.24  11200.97  -3777.89
    997.02     62.54   9676.35]


In [18]:
#Make a prediction for Software Engineer, Female, 50, 5, Masters, Operations, 3
print(regressor.predict([[1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 50, 5, 3]]))

[98059.51]


In [19]:
#The R2 Score indicates that the regression equation accounts for 81% of the variation in salary, however, VIF should be analyzed to determine if multicollinearity is present in independent variables.