# **1. Install Dependencies**

In [43]:
import numpy as np
import pandas as pd

# for visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# for ignore the Warnings
import warnings
warnings.filterwarnings("ignore")

# **2. Import the Datasets**

In [44]:
df = pd.read_csv('Employee_attrition_dataset.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


# **5. Data Preprocessing**

In [45]:
df.drop(columns=['Over18'], inplace=True)

In [46]:
df.duplicated().sum()

0

In [47]:
X = df.drop('PercentSalaryHike',axis=1)
y = df.PercentSalaryHike

In [48]:
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import SelectKBest,chi2

from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

from imblearn.over_sampling import SMOTE

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [50]:
pd.set_option('display.max_columns',None)

In [51]:
# column transformer for encoding categorical columns

encoder = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(), ['Attrition','BusinessTravel', 'Department', 'EducationField', 'JobRole', 'OverTime']),  # Ordinal encoding
        ('ohe', OneHotEncoder(drop='first', sparse_output=False), ['Gender', 'MaritalStatus']),  # One-hot encoding with drop_first=True for 'Gender'
    ],
    remainder='passthrough'  # Keep the other columns unchanged
)

# setting to get a pandas df
encoder.set_output(transform='pandas')

In [52]:
pipe = Pipeline([
    ('preprocessing', encoder),  # Assuming `encoder` is your previously defined encoder
    ('scaling', MinMaxScaler()),  # Scaling step
    ('feature_selection', SelectKBest(score_func=chi2, k=15)),  # Feature selection step
])

# Fit the pipeline on the training data
pipe.fit(X_train, y_train)


In [53]:
import pickle

# Assuming `pipe` is your fitted pipeline
# Save the pipeline to a file
with open('preprocessing_pipeline_appraisal.pkl', 'wb') as f:
    pickle.dump(pipe, f)


In [54]:
X_train_transformed = pd.DataFrame(pipe.transform(X_train))
X_test_transformed = pd.DataFrame(pipe.transform(X_test))

# **6. Model Building**

# 6.1 Random Forest

In [55]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [56]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train_transformed , y_train)
y_pred = rf_regressor.predict(X_test_transformed)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('Mean Squared Error:', mse)
print('Mean Absolute Error (MAE):', mae)

Mean Squared Error: 5.901104761904762
Mean Absolute Error (MAE): 2.0027755102040814


In [57]:
Pkl_Filename = "appraisal_prediction_model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(rf_regressor, file)

# 6.2 Polynomial Regression

In [58]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [59]:
degree = 2

# Transform the features to polynomial features
poly_features = PolynomialFeatures(degree=degree)
X_train_poly = poly_features.fit_transform(X_train_transformed)
X_test_poly = poly_features.transform(X_test_transformed)

In [60]:
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Make predictions
y_train_pred = model.predict(X_train_poly)
y_test_pred = model.predict(X_test_poly)


In [61]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('Mean Squared Error:', mse)
print('Mean Absolute Error (MAE):', mae)

Mean Squared Error: 5.901104761904762
Mean Absolute Error (MAE): 2.0027755102040814


In [62]:
import pickle