## Estimate the total compensation to be provided to an employee

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket ignore also hides pandas / scikit-learn diagnostics
# (e.g. chained-assignment or deprecation warnings); consider narrowing it.
warnings.filterwarnings(action='ignore')

In [2]:
# Read the training set from the working directory and preview its first five rows
data = pd.read_csv(filepath_or_buffer="train_set.csv")
data.head(n=5)

Unnamed: 0,Year,OGC,OG,DC,Dept,UC,Union,JF,Job,EI,Salaries,Overtime,H/D,YT,Total_Compensation
0,2015,4,Community Health,DPH,Public Health,250,"SEIU - Health Workers, Local 1021",Med Therapy & Auxiliary,Morgue Attendant,6725,12196,0,0.0,Calendar,16158
1,2013,4,Community Health,DPH,Public Health,39,"Stationary Engineers, Local 39",Journeyman Trade,Stationary Engineer,25058,74639,2820,12703.31,Fiscal,115784
2,2015,6,General Administration & Finance,ASR,Assessor/Recorder,21,"Prof & Tech Engineers - Miscellaneous, Local 21",Appraisal & Taxation,Senior Real Property Appraiser,46108,100554,0,12424.5,Calendar,144708
3,2016,1,Public Protection,POL,Police,911,Police Officers' Association,Police Services,Sergeant 3,33369,140164,52754,13043.87,Fiscal,242323
4,2013,2,"Public Works, Transportation & Commerce",HHP,PUC Hetch Hetchy,21,"Prof & Tech Engineers - Miscellaneous, Local 21",Information Systems,IS Engineer-Journey,28684,58813,0,7655.28,Calendar,82106


In [3]:
# Report the dimensions of the frame as (rows, columns)
print("SHAPE: ", data.shape)

# Visual separator between the two reports
print('*******************************')

# One boolean per column: True when that column contains at least one missing value
print(data.isna().any())

SHAPE:  (287836, 15)
*******************************
Year                  False
OGC                   False
OG                    False
DC                    False
Dept                  False
UC                    False
Union                  True
JF                     True
Job                   False
EI                    False
Salaries              False
Overtime              False
H/D                   False
YT                    False
Total_Compensation    False
dtype: bool


In [4]:
# Keep only the numeric pay components together with the prediction target.
# Note: `features` includes the target column 'Total_Compensation'.
features = ['Salaries', 'Overtime', 'H/D','Total_Compensation']
# Take an explicit copy of the column subset. A later cell assigns scaled values
# back into `data`; without the copy pandas treats that as writing to a slice of
# the original frame and raises SettingWithCopyWarning (currently hidden by the
# global warnings filter).
data = data[features].copy()
data

Unnamed: 0,Salaries,Overtime,H/D,Total_Compensation
0,12196,0,0.00,16158
1,74639,2820,12703.31,115784
2,100554,0,12424.50,144708
3,140164,52754,13043.87,242323
4,58813,0,7655.28,82106
...,...,...,...,...
287831,78980,30115,13068.80,147079
287832,30704,0,5465.57,40174
287833,104451,0,13054.94,164669
287834,14425,0,4051.90,19594


In [5]:
data.describe()

Unnamed: 0,Salaries,Overtime,H/D,Total_Compensation
count,287836.0,287836.0,287836.0,287836.0
mean,63262.713139,4401.037115,8932.876472,97990.329882
std,44638.657748,11079.137749,4894.072024,67750.020573
min,-68771.0,-12308.0,-2940.47,-74082.0
25%,23406.0,0.0,4358.3475,35977.0
50%,62504.5,0.0,11982.035,98033.0
75%,93000.25,2738.0,12801.79,142138.25
max,515101.0,227313.0,21872.8,653498.0


In [6]:
# Build the scaler used for feature scaling. The arguments below are the
# library defaults written out explicitly: centre each column on its median
# and scale it by the 25th-75th percentile range.
from sklearn.preprocessing import RobustScaler
rs = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0))
rs

RobustScaler()

In [7]:
# Columns to rescale; the list contains the three inputs and the target.
col = ["Salaries", "Overtime", "H/D", "Total_Compensation"]
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split performed in a later cell, and it also rescales the target column.
# Test-set statistics therefore influence the transform, and the model's
# predictions are in scaled units rather than in currency.
scaled_values = rs.fit_transform(data[col])
data[col] = scaled_values
data.head()

Unnamed: 0,Salaries,Overtime,H/D,Total_Compensation
0,-0.722883,0.0,-1.419094,-0.771232
1,0.174361,1.029949,0.085424,0.167208
2,0.546733,0.0,0.052403,0.439661
3,1.11589,19.267348,0.125759,1.359159
4,-0.053043,0.0,-0.51244,-0.150026


In [8]:
# Separate the target from the predictors, then hold out 20% of the rows for testing
from sklearn.model_selection import train_test_split
y = data['Total_Compensation']
X = data.drop(columns='Total_Compensation')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [9]:
# Instantiate an ordinary least-squares regressor (fitting happens in the next cell).
# fit_intercept=True is the library default, written out for clarity.
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept=True)
lr

LinearRegression()

In [10]:
# Fit the regressor on the training portion of the data
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [11]:
# Predict the target for the held-out test rows
y_pred = lr.predict(X=X_test)


In [12]:
# Evaluate the fit with the coefficient of determination (R^2) on the test set.
# R^2 is a goodness-of-fit measure for regression, not a classification accuracy.
from sklearn.metrics import r2_score
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("r2_score:", r2)

r2_score: 0.9871713964122532
