In [44]:
import numpy as np
import pandas as pd

In [45]:
data = pd.read_csv('train.csv')

In [46]:
data

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.20
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52
...,...,...,...,...,...,...,...,...,...
22745,fffe31003500370039003100,2008-12-30,Female,Service,No,1.0,3.0,,0.41
22746,fffe33003000350031003800,2008-01-19,Female,Product,Yes,3.0,6.0,6.7,0.59
22747,fffe390032003000,2008-11-05,Male,Service,Yes,3.0,7.0,,0.72
22748,fffe33003300320036003900,2008-01-10,Female,Service,No,2.0,5.0,5.9,0.52


In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           22750 non-null  object 
 1   Date of Joining       22750 non-null  object 
 2   Gender                22750 non-null  object 
 3   Company Type          22750 non-null  object 
 4   WFH Setup Available   22750 non-null  object 
 5   Designation           22750 non-null  float64
 6   Resource Allocation   21369 non-null  float64
 7   Mental Fatigue Score  20633 non-null  float64
 8   Burn Rate             21626 non-null  float64
dtypes: float64(4), object(5)
memory usage: 1.6+ MB


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_inputs(df):
    df = df.copy()
    
    df = df.drop('Employee ID', axis=1)
    
    missing_target_rows = df.loc[df['Burn Rate'].isna(), :].index
    df = df.drop(missing_target_rows, axis=0).reset_index(drop=True)
    
    for column in ['Resource Allocation', 'Mental Fatigue Score']:
        df[column] = df[column].fillna(df[column].mean())
    
    df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])
    df['Join Month'] = df['Date of Joining'].apply(lambda x: x.month)
    df['Join Day'] = df['Date of Joining'].apply(lambda x: x.day)
    df = df.drop('Date of Joining', axis=1)
    
    df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1})
    df['Company Type'] = df['Company Type'].replace({'Product': 0, 'Service': 1})
    df['WFH Setup Available'] = df['WFH Setup Available'].replace({'No': 0, 'Yes': 1})
    
    y = df['Burn Rate']
    X = df.drop('Burn Rate', axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [49]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)   

In [50]:
X_train

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Join Month,Join Day
8275,-0.954022,-1.379211,-1.087295,0.725025,0.768001,0.475128,0.433442,-0.649693
21284,1.048194,0.725052,-1.087295,1.604608,1.270205,1.131455,1.596251,-0.536187
16802,1.048194,0.725052,-1.087295,-0.154557,0.768001,0.420434,1.305549,0.371860
3271,1.048194,-1.379211,-1.087295,1.604608,2.274612,1.733089,0.142739,1.620424
5302,-0.954022,-1.379211,-1.087295,-0.154557,-0.236406,0.475128,0.724144,-0.422682
...,...,...,...,...,...,...,...,...
10955,-0.954022,0.725052,-1.087295,-0.154557,0.768001,0.803292,-1.020070,-1.444234
17289,-0.954022,0.725052,0.919713,0.725025,-0.236406,-0.509363,-0.147963,0.712377
5192,-0.954022,0.725052,0.919713,0.725025,0.265797,-1.165690,1.014847,0.031342
12172,1.048194,-1.379211,0.919713,-1.913723,-1.743017,-1.220384,0.433442,-1.671246


In [51]:
y_train

8275     0.61
21284    0.81
16802    0.62
3271     0.73
5302     0.43
         ... 
10955    0.58
17289    0.39
5192     0.24
12172    0.18
235      0.00
Name: Burn Rate, Length: 15138, dtype: float64

In [52]:
from sklearn.metrics import r2_score

In [53]:
from sklearn.ensemble import GradientBoostingRegressor

gb_model=GradientBoostingRegressor()
gb_model.fit(X_train,y_train)

train_pred_gb=gb_model.predict(X_train)
train_r2=r2_score(y_train,train_pred_gb)
test_pred_gb=gb_model.predict(X_test)
test_r2=r2_score(y_test,test_pred_gb)

print("Accuracy score of train data: ",str(round(100*train_r2,4))," %")
print("Accuracy score of test data: ",str(round(100*test_r2,4))," %")

Accuracy score of train data:  90.6068  %
Accuracy score of test data:  90.2569  %


In [54]:
from sklearn.ensemble import ExtraTreesRegressor

et_model=ExtraTreesRegressor()
et_model.fit(X_train,y_train)

train_pred_et=et_model.predict(X_train)
train_r2=r2_score(y_train,train_pred_et)
test_pred_et=et_model.predict(X_test)
test_r2=r2_score(y_test,test_pred_et)

print("Accuracy score of train data: ",str(round(100*train_r2,4))," %")
print("Accuracy score of test data: ",str(round(100*test_r2,4))," %")

Accuracy score of train data:  99.945  %
Accuracy score of test data:  88.4191  %


In [55]:
from sklearn.ensemble import RandomForestRegressor

rd_model=RandomForestRegressor()
rd_model.fit(X_train,y_train)

train_pred_rd=rd_model.predict(X_train)
train_r2=r2_score(y_train,train_pred_rd)
test_pred_rd=rd_model.predict(X_test)
test_r2=r2_score(y_test,test_pred_rd)

print("Accuracy score of train data: ",str(round(100*train_r2,4))," %")
print("Accuracy score of test data: ",str(round(100*test_r2,4))," %")

Accuracy score of train data:  98.5495  %
Accuracy score of test data:  89.7407  %
