In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np

In [2]:
train_df = pd.read_csv("data/train_new.csv")

# Drop the null rows
train_df = train_df.dropna()
train_df = train_df.drop(columns = ['Date of Joining'])

train_df.count()

Gender                  18590
Company Type            18590
WFH Setup Available     18590
Designation             18590
Resource Allocation     18590
Mental Fatigue Score    18590
Burn Rate               18590
dtype: int64

In [3]:
train_df

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,1,0,0,2.0,3.0,3.8,0.16
1,0,0,1,1.0,2.0,5.0,0.36
3,0,0,1,1.0,1.0,2.6,0.20
4,1,0,0,3.0,7.0,6.9,0.52
5,0,1,1,2.0,4.0,3.6,0.29
...,...,...,...,...,...,...,...
22743,1,1,1,1.0,3.0,6.0,0.48
22744,0,1,0,3.0,7.0,6.2,0.54
22746,1,1,1,3.0,6.0,6.7,0.59
22748,1,0,0,2.0,5.0,5.9,0.52


In [4]:
selected_features = train_df.drop(columns = ['Burn Rate'])

selected_features

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score
0,1,0,0,2.0,3.0,3.8
1,0,0,1,1.0,2.0,5.0
3,0,0,1,1.0,1.0,2.6
4,1,0,0,3.0,7.0,6.9
5,0,1,1,2.0,4.0,3.6
...,...,...,...,...,...,...
22743,1,1,1,1.0,3.0,6.0
22744,0,1,0,3.0,7.0,6.2
22746,1,1,1,3.0,6.0,6.7
22748,1,0,0,2.0,5.0,5.9


In [5]:
#split train test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_features, train_df['Burn Rate'], random_state=1, stratify = train_df['Burn Rate'])

In [6]:
y_test

10810    0.28
726      0.27
19977    0.54
6272     0.43
19653    0.39
         ... 
10706    0.39
226      0.37
16714    0.13
3100     0.77
10587    0.44
Name: Burn Rate, Length: 4648, dtype: float64

In [7]:
#Scale data
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)
X_scale_train = X_scaler.transform(X_train)
X_scale_test = X_scaler.transform(X_test)


In [8]:
#Train the model 
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn import utils 

lin_regr_model = LinearRegression()
lin_regr_model


LinearRegression()

In [9]:
#fit model

lin_regr_model.fit(X_train, y_train)

LinearRegression()

In [10]:
print(f"Training Data Score: {lin_regr_model.score(X_train, y_train)}")
print(f"Testing Data Score: {lin_regr_model.score(X_test, y_test)}")

Training Data Score: 0.9212250346164508
Testing Data Score: 0.9189457447498501


In [11]:
predictions = lin_regr_model.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0.26291879 0.25952856 0.56807454 0.35155855 0.3564176  0.00536597
 0.64550972 0.31858356 0.42900025 0.26280427]
First 10 Actual labels: [0.28, 0.27, 0.54, 0.43, 0.39, 0.03, 0.64, 0.36, 0.35, 0.27]


In [12]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0.262919,0.28
1,0.259529,0.27
2,0.568075,0.54
3,0.351559,0.43
4,0.356418,0.39
...,...,...
4643,0.432285,0.39
4644,0.391905,0.37
4645,0.191535,0.13
4646,0.769862,0.77


In [13]:
#SVR

from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipe = Pipeline([('scaler', StandardScaler()), ('svr', SVR())])
# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set
pipe.fit(X_train, y_train)
# Pipeline(steps=[('scaler', StandardScaler()), ('svr', SVR())])
# pipe.score(X_test, y_test)
pipe.predict(X_test)

array([0.20964583, 0.17565904, 0.5503784 , ..., 0.21998923, 0.79421038,
       0.47853786])

In [14]:
#Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

train_pred_rf = rf_model.score(X_train, y_train)

print (f'Score {train_pred_rf}')


Score 0.9521855987247506


In [17]:
# Save the model

import pickle
pickle.dump(lin_regr_model, open('linear_regression_model.pkl','wb'))


In [20]:
# Load the model
loaded_lin_model = pickle.load(open('linear_regression_model.pkl','rb'))


In [21]:
#evaluating loaded model 

score = loaded_lin_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))



Test score: 91.89 %


In [22]:
# New data to predict
pr = pd.read_csv('data/test_new.csv')

# apply the whole pipeline to data
pred = pipe.predict(X_test)
print (pred)

[0.20964583 0.17565904 0.5503784  ... 0.21998923 0.79421038 0.47853786]


In [23]:
#testing random input 

new_data = [1, 0, 1, 3, 10, 8.2]

pred = pipe.predict([new_data])
print(pred)

[0.78340068]


In [24]:
#Order of Columns
# Gender: 0 = Male, 1 = Female 
# Company type: 0 = Service, 1 = Product 
# WFH Setup: 0 = No, 1 = Yes 
# Designation = position of emplyee in workplace (range 0-5) high number is high designation
# Resource Allocation = number of working hours (range 1-10) 
# Menatal Fatigue = level of fatigue mentally the employee is facing. (range 0.0-10.0)

new_data = [0, 0, 1, 3, 10, 8.2]

pred = pipe.predict([new_data])
print(f'Burn Rate: {pred}')

Burn Rate: [0.79300679]


In [25]:
new_data = [1, 1, 1, 1, 3, 2.6]

pred = pipe.predict([new_data])
print(f'Burn Rate: {pred}')


Burn Rate: [0.15673477]


In [26]:
new_data = [0, 1, 0, 8, 5, 6.7]

pred = pipe.predict([new_data])
print(f'Burn Rate: {pred}')

Burn Rate: [0.61577179]


In [27]:
new_data = [1, 1, 0, 1, 1, 3.5]

pred = pipe.predict([new_data])
print(f'Burn Rate: {pred}')

Burn Rate: [0.21995525]


In [30]:
print('Calculate your Burn Rate. Enter your info below')
print('* Your Burn Rate will be on a scale between 0-1')

user_input = []

while True:
    
    gender = input('Enter your gender. 0 = male, 1 = female')
    user_input.append(gender)
    
    company_type = input('Enter your company type. 0 = service, 1 = product')
    user_input.append(company_type)
    
    wfh = input('Do you have a "Work From Home" setup?. 0 = No, 1 = yes')
    user_input.append(wfh)
    
    designation = input('Rank your position in your workplace. range 0-5. Low - High, whole numbers only')
    user_input.append(designation)
    
    resource_allocation = input('How many hours do you work in a day? range 0-10, whole numbers only')
    user_input.append(resource_allocation)
    
    mental_fatigue = input('Rank your mental fatigue. 0.0 - 10.0')
    user_input.append(mental_fatigue)
    
    break
    
print(f'Your inputs: {user_input}')

pred = pipe.predict([user_input])
print(f'Your Burn Rate is: {pred}')




Calculate your Burn Rate. Enter your info below
* Your Burn Rate will be on a scale between 0-1
Enter your gender. 0 = male, 1 = female1
Enter your company type. 0 = service, 1 = product1
Do you have a "Work From Home" setup?. 0 = No, 1 = yes0
Rank your position in your workplace. range 0-5. Low - High, whole numbers only4
How many hours do you work in a day? range 0-10, whole numbers only4
Rank your mental fatigue. 0.0 - 10.07
Your inputs: ['1', '1', '0', '4', '4', '7']
Your Burn Rate is: [0.56770334]


In [31]:
pred = pipe.predict([user_input])
print(f'Your Burn Rate is: {pred}')

Your Burn Rate is: [0.56770334]
