In [99]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder

pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [100]:
# Importing the Data into pandas DataFrame

raw_Data = pd.read_csv('/Users/pritigirishvankudre/DayX_Lockdown Practice/Emp_Attr_Pred/train.csv')
raw_Data.head(15)

Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating
0,2016-01-01,1,28,Male,C23,Master,57387,2015-12-24,,1,1,2381060,2
1,2016-02-01,1,28,Male,C23,Master,57387,2015-12-24,,1,1,-665480,2
2,2016-03-01,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2
3,2017-11-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1
4,2017-12-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1
5,2016-12-01,4,43,Male,C13,Master,65603,2016-12-07,,2,2,0,1
6,2017-01-01,4,43,Male,C13,Master,65603,2016-12-07,,2,2,0,1
7,2017-02-01,4,43,Male,C13,Master,65603,2016-12-07,,2,2,0,1
8,2017-03-01,4,43,Male,C13,Master,65603,2016-12-07,,2,2,350000,1
9,2017-04-01,4,43,Male,C13,Master,65603,2016-12-07,2017-04-27,2,2,0,1


In [101]:
# Extracting Data Information

raw_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19104 entries, 0 to 19103
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MMM-YY                19104 non-null  object
 1   Emp_ID                19104 non-null  int64 
 2   Age                   19104 non-null  int64 
 3   Gender                19104 non-null  object
 4   City                  19104 non-null  object
 5   Education_Level       19104 non-null  object
 6   Salary                19104 non-null  int64 
 7   Dateofjoining         19104 non-null  object
 8   LastWorkingDate       1616 non-null   object
 9   Joining Designation   19104 non-null  int64 
 10  Designation           19104 non-null  int64 
 11  Total Business Value  19104 non-null  int64 
 12  Quarterly Rating      19104 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 1.9+ MB


In [102]:
# Describing the Data

raw_Data.describe()

Unnamed: 0,Emp_ID,Age,Salary,Joining Designation,Designation,Total Business Value,Quarterly Rating
count,19104.0,19104.0,19104.0,19104.0,19104.0,19104.0,19104.0
mean,1415.591133,34.650283,65652.025126,1.690536,2.25267,571662.1,2.008899
std,810.705321,6.264471,30914.515344,0.836984,1.026512,1128312.0,1.009832
min,1.0,21.0,10747.0,1.0,1.0,-6000000.0,1.0
25%,710.0,30.0,42383.0,1.0,1.0,0.0,1.0
50%,1417.0,34.0,60087.0,1.0,2.0,250000.0,2.0
75%,2137.0,39.0,83969.0,2.0,3.0,699700.0,3.0
max,2788.0,58.0,188418.0,5.0,5.0,33747720.0,4.0


In [103]:
# To find and remove duplicate rows if any

duplicate = raw_Data[raw_Data.duplicated()]
print (duplicate)

Empty DataFrame
Columns: [MMM-YY, Emp_ID, Age, Gender, City, Education_Level, Salary, Dateofjoining, LastWorkingDate, Joining Designation, Designation, Total Business Value, Quarterly Rating]
Index: []


In [104]:
# Mehtod to understand Null values distribution

col_null = pd.DataFrame(raw_Data.isnull().sum()).reset_index()
col_null.columns = ['DataColumns','NullCount']
col_null['NullCount_Pct'] = round((col_null['NullCount']/raw_Data.shape[0])*100,2)
display (col_null)

Unnamed: 0,DataColumns,NullCount,NullCount_Pct
0,MMM-YY,0,0.0
1,Emp_ID,0,0.0
2,Age,0,0.0
3,Gender,0,0.0
4,City,0,0.0
5,Education_Level,0,0.0
6,Salary,0,0.0
7,Dateofjoining,0,0.0
8,LastWorkingDate,17488,91.54
9,Joining Designation,0,0.0


In [105]:
# Method to understand Unique values distribution

col_uniq = pd.DataFrame(raw_Data.nunique()).reset_index()
col_uniq.columns = ['DataColumns','UniqCount']
col_uniq_cnt = pd.DataFrame(raw_Data.count(axis=0)).reset_index()
col_uniq_cnt.columns = ['DataColumns','UniqCount']
col_uniq['UniqCount_Pct'] = round((col_uniq['UniqCount']/col_uniq_cnt['UniqCount'])*100,2)
display (col_uniq)

Unnamed: 0,DataColumns,UniqCount,UniqCount_Pct
0,MMM-YY,24,0.13
1,Emp_ID,2381,12.46
2,Age,36,0.19
3,Gender,2,0.01
4,City,29,0.15
5,Education_Level,3,0.02
6,Salary,2383,12.47
7,Dateofjoining,869,4.55
8,LastWorkingDate,493,30.51
9,Joining Designation,5,0.03


Here in reality, information for 2381 unique employees is present over the years of 2016 and 2017. So we may try clubbing those details and then represent it as a unique data row.

The Target is to classify if the employee will resign or not. So we can convert LastWorkingDate column into 1 and 0 basis presence/absence of date value.

In [106]:
#Creating a copy of original dataset for data cleaning and preprocessing and transformation.

raw_Data_1 = raw_Data.copy()
raw_Data_1.drop(['MMM-YY'], axis=1, inplace=True)
raw_Data_1.head()

Unnamed: 0,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating
0,1,28,Male,C23,Master,57387,2015-12-24,,1,1,2381060,2
1,1,28,Male,C23,Master,57387,2015-12-24,,1,1,-665480,2
2,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2
3,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1
4,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1


In [107]:
# Converting data column to datetime

raw_Data_1['Dateofjoining'] = pd.to_datetime(raw_Data_1['Dateofjoining'])
raw_Data_1['LastWorkingDate'] = pd.to_datetime(raw_Data_1['LastWorkingDate'])

In [108]:
# Assigning College as 0, Bachelor as 1 and Master as 2 for Education_Level column

end = OrdinalEncoder(categories=[['College', 'Bachelor', 'Master']],dtype=int)
raw_Data_1['Edu_Lvl_Encd'] = end.fit_transform(raw_Data_1[['Education_Level']])
raw_Data_1.drop(['Education_Level'], axis=1, inplace=True)

In [109]:
raw_Data_grpby = raw_Data_1[['Emp_ID','Gender','City','Edu_Lvl_Encd','Joining Designation']]
raw_Data_grpby.head()

Unnamed: 0,Emp_ID,Gender,City,Edu_Lvl_Encd,Joining Designation
0,1,Male,C23,2,1
1,1,Male,C23,2,1
2,1,Male,C23,2,1
3,2,Male,C7,2,2
4,2,Male,C7,2,2


In [110]:
raw_Data_grpby.drop_duplicates(subset =['Emp_ID','Gender','City','Edu_Lvl_Encd','Joining Designation'] ,
                               keep = 'last', inplace = True)
raw_Data_grpby.head()

Unnamed: 0,Emp_ID,Gender,City,Edu_Lvl_Encd,Joining Designation
2,1,Male,C23,2,1
4,2,Male,C7,2,2
9,4,Male,C13,2,2
12,5,Male,C9,0,1
17,6,Female,C11,1,3


In [112]:
# To calculate total tenure of employee and Target variable (employee resigned or not)

emp_Ten = raw_Data_1[['Emp_ID','Dateofjoining','LastWorkingDate']]
emp_Ten.drop_duplicates(subset =['Emp_ID','Dateofjoining'] ,keep = 'last', inplace = True)
emp_Ten['Target'] = emp_Ten['LastWorkingDate'].apply(lambda x: 0 if x is pd.NaT else 1)

# Since we have to validate data till 31-Dec-2017, for those where Last working date is not available (not resigned)
# we assume LastWorkingDate as 31-Dec-2017 for calculation purpose
emp_Ten['LastWorkingDate'].fillna('2017-12-31', inplace=True)
emp_Ten ['Emp_Tenure'] =  (emp_Ten ['LastWorkingDate'] - emp_Ten ['Dateofjoining']).dt.days
emp_Ten.head()

Unnamed: 0,Emp_ID,Dateofjoining,LastWorkingDate,Target,Emp_Tenure
2,1,2015-12-24,2016-03-11,1,78
4,2,2017-11-06,2017-12-31,0,55
9,4,2016-12-07,2017-04-27,1,141
12,5,2016-01-09,2016-03-07,1,58
17,6,2017-07-31,2017-12-31,0,153


In [113]:
# To calculate Total Business Earned and Total Business Loss

emp_Bus = raw_Data_1[['Emp_ID','Total Business Value']]
emp_Bus['Business_Earn'] = emp_Bus['Total Business Value'].apply(lambda x: x if x >= 0 else 0)
emp_Bus['Business_Loss'] = emp_Bus['Total Business Value'].apply(lambda x: x if x < 0 else 0)
emp_Bus.head(15)

Unnamed: 0,Emp_ID,Total Business Value,Business_Earn,Business_Loss
0,1,2381060,2381060,0
1,1,-665480,0,-665480
2,1,0,0,0
3,2,0,0,0
4,2,0,0,0
5,4,0,0,0
6,4,0,0,0
7,4,0,0,0
8,4,350000,350000,0
9,4,0,0,0


In [114]:
emp_Bus_grp = pd.DataFrame(emp_Bus.groupby(['Emp_ID'])[['Business_Earn','Business_Loss']].sum())
emp_Bus_grp.head(15)

Unnamed: 0_level_0,Business_Earn,Business_Loss
Emp_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2381060,-665480
2,0,0
4,350000,0
5,120360,0
6,1265000,0
8,0,0
11,0,0
12,2607180,0
13,10213040,0
14,0,0


In [115]:
#Function to groupby Emp_ID, to select maximum (latest value) of columns like Age, Designation

def column_process(df, uni_id, col_name):
    temp = pd.DataFrame(df.groupby([uni_id])[col_name].max())
    return (temp)

In [116]:
emp_Age = column_process(raw_Data_1,'Emp_ID','Age')
emp_Age.head()

Unnamed: 0_level_0,Age
Emp_ID,Unnamed: 1_level_1
1,28
2,31
4,43
5,29
6,31


In [117]:
emp_Designation = column_process(raw_Data_1,'Emp_ID','Designation')
emp_Designation.head()

Unnamed: 0_level_0,Designation
Emp_ID,Unnamed: 1_level_1
1,1
2,2
4,2
5,1
6,3


In [118]:
# To calculate latest Salary and total increment received from joining date to till date

emp_Sal_max = pd.DataFrame(raw_Data_1.groupby(['Emp_ID'])['Salary'].max())
emp_Sal_max.columns = ['Max_Sal']
emp_Sal_max.head()

Unnamed: 0_level_0,Max_Sal
Emp_ID,Unnamed: 1_level_1
1,57387
2,67016
4,65603
5,46368
6,78728


In [119]:
emp_Sal_min = pd.DataFrame(raw_Data_1.groupby(['Emp_ID'])['Salary'].min())
emp_Sal_min.columns = ['Min_Sal']
emp_Sal_min.head()

Unnamed: 0_level_0,Min_Sal
Emp_ID,Unnamed: 1_level_1
1,57387
2,67016
4,65603
5,46368
6,78728


In [120]:
emp_Sal = pd.merge(emp_Sal_max, emp_Sal_min, how="inner", on=['Emp_ID'])
emp_Sal['Increment'] = emp_Sal['Max_Sal'] - emp_Sal['Min_Sal']
emp_Sal.head()

Unnamed: 0_level_0,Max_Sal,Min_Sal,Increment
Emp_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,57387,57387,0
2,67016,67016,0
4,65603,65603,0
5,46368,46368,0
6,78728,78728,0


In [121]:
# To calculate overall rating recived during the Tenure

emp_Rating_max = pd.DataFrame(raw_Data_1.groupby(['Emp_ID'])['Quarterly Rating'].max())
emp_Rating_max.columns = ['Max_Qua_Rating']
emp_Rating_min = pd.DataFrame(raw_Data_1.groupby(['Emp_ID'])['Quarterly Rating'].min())
emp_Rating_min.columns = ['Min_Qua_Rating']
emp_Rating = pd.merge(emp_Rating_max, emp_Rating_min, how="inner", on=['Emp_ID'])
emp_Rating['Increment_Rating'] = emp_Rating['Max_Qua_Rating'] - emp_Rating['Max_Qua_Rating']
emp_Rating.head()

Unnamed: 0_level_0,Max_Qua_Rating,Min_Qua_Rating,Increment_Rating
Emp_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,2,0
2,1,1,0
4,1,1,0
5,1,1,0
6,2,1,0


# Joining the data with cleaned and transformed information

In [122]:
# Using OHE to convert gender to numerical value

raw_Data_grpby = pd.get_dummies(raw_Data_grpby,columns=['Gender'],drop_first=False)
raw_Data_grpby.head()

Unnamed: 0,Emp_ID,City,Edu_Lvl_Encd,Joining Designation,Gender_Female,Gender_Male
2,1,C23,2,1,0,1
4,2,C7,2,2,0,1
9,4,C13,2,2,0,1
12,5,C9,0,1,0,1
17,6,C11,1,3,1,0


In [123]:
# Joining with Age and Designation

raw_Data_grpby = pd.merge(raw_Data_grpby, emp_Age, how="inner", on=['Emp_ID'])
raw_Data_grpby = pd.merge(raw_Data_grpby, emp_Designation, how="inner", on=['Emp_ID'])
raw_Data_grpby.head()

Unnamed: 0,Emp_ID,City,Edu_Lvl_Encd,Joining Designation,Gender_Female,Gender_Male,Age,Designation
0,1,C23,2,1,0,1,28,1
1,2,C7,2,2,0,1,31,2
2,4,C13,2,2,0,1,43,2
3,5,C9,0,1,0,1,29,1
4,6,C11,1,3,1,0,31,3


In [124]:
# Joining with Employee Tenure and Target variabl

raw_Data_grpby = pd.merge(raw_Data_grpby, emp_Ten[['Emp_ID','Emp_Tenure','Target']], how="inner", on=['Emp_ID'])
raw_Data_grpby.head()

Unnamed: 0,Emp_ID,City,Edu_Lvl_Encd,Joining Designation,Gender_Female,Gender_Male,Age,Designation,Emp_Tenure,Target
0,1,C23,2,1,0,1,28,1,78,1
1,2,C7,2,2,0,1,31,2,55,0
2,4,C13,2,2,0,1,43,2,141,1
3,5,C9,0,1,0,1,29,1,58,1
4,6,C11,1,3,1,0,31,3,153,0


In [125]:
# Joining with Total Business Earned and Total Business Lost during the Tenure

raw_Data_grpby = pd.merge(raw_Data_grpby, emp_Bus_grp, how="inner", on=['Emp_ID'])
raw_Data_grpby.head()

Unnamed: 0,Emp_ID,City,Edu_Lvl_Encd,Joining Designation,Gender_Female,Gender_Male,Age,Designation,Emp_Tenure,Target,Business_Earn,Business_Loss
0,1,C23,2,1,0,1,28,1,78,1,2381060,-665480
1,2,C7,2,2,0,1,31,2,55,0,0,0
2,4,C13,2,2,0,1,43,2,141,1,350000,0
3,5,C9,0,1,0,1,29,1,58,1,120360,0
4,6,C11,1,3,1,0,31,3,153,0,1265000,0


In [126]:
# Joining with Max Salary (Current) and Increament earned during the Tenure

raw_Data_grpby = pd.merge(raw_Data_grpby, emp_Sal[['Max_Sal','Increment']], how="inner", on=['Emp_ID'])
raw_Data_grpby.head()

Unnamed: 0,Emp_ID,City,Edu_Lvl_Encd,Joining Designation,Gender_Female,Gender_Male,Age,Designation,Emp_Tenure,Target,Business_Earn,Business_Loss,Max_Sal,Increment
0,1,C23,2,1,0,1,28,1,78,1,2381060,-665480,57387,0
1,2,C7,2,2,0,1,31,2,55,0,0,0,67016,0
2,4,C13,2,2,0,1,43,2,141,1,350000,0,65603,0
3,5,C9,0,1,0,1,29,1,58,1,120360,0,46368,0
4,6,C11,1,3,1,0,31,3,153,0,1265000,0,78728,0


In [127]:
# Joining with Max Quarterly Rating (Current) and Increase in Rating earned during the Tenure

raw_Data_grpby = pd.merge(raw_Data_grpby, emp_Rating[['Max_Qua_Rating','Increment_Rating']], 
                          how="inner", on=['Emp_ID'])
raw_Data_grpby.head()

Unnamed: 0,Emp_ID,City,Edu_Lvl_Encd,Joining Designation,Gender_Female,Gender_Male,Age,Designation,Emp_Tenure,Target,Business_Earn,Business_Loss,Max_Sal,Increment,Max_Qua_Rating,Increment_Rating
0,1,C23,2,1,0,1,28,1,78,1,2381060,-665480,57387,0,2,0
1,2,C7,2,2,0,1,31,2,55,0,0,0,67016,0,1,0
2,4,C13,2,2,0,1,43,2,141,1,350000,0,65603,0,1,0
3,5,C9,0,1,0,1,29,1,58,1,120360,0,46368,0,1,0
4,6,C11,1,3,1,0,31,3,153,0,1265000,0,78728,0,2,0


In [128]:
end_1 = OrdinalEncoder(dtype=int)
raw_Data_grpby['City_Encd'] = end_1.fit_transform(raw_Data_grpby[['City']])
raw_Data_grpby.drop(['City'], axis=1, inplace=True)
raw_Data_grpby.head()

Unnamed: 0,Emp_ID,Edu_Lvl_Encd,Joining Designation,Gender_Female,Gender_Male,Age,Designation,Emp_Tenure,Target,Business_Earn,Business_Loss,Max_Sal,Increment,Max_Qua_Rating,Increment_Rating,City_Encd
0,1,2,1,0,1,28,1,78,1,2381060,-665480,57387,0,2,0,15
1,2,2,2,0,1,31,2,55,0,0,0,67016,0,1,0,26
2,4,2,2,0,1,43,2,141,1,350000,0,65603,0,1,0,4
3,5,0,1,0,1,29,1,58,1,120360,0,46368,0,1,0,28
4,6,1,3,1,0,31,3,153,0,1265000,0,78728,0,2,0,2


# Building the Classification Algorithm to Predict Target Column

In [129]:
raw_Data_model = raw_Data_grpby.copy()

In [130]:
X = raw_Data_model.drop(['Emp_ID','Target'], axis=1)
y = raw_Data_model['Target']

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=7)

In [131]:
# Define Common Function to train the model

def train_model(classifier, train_x, train_y, test_x, test_y):
    # fit the training dataset on the classifier
    classifier.fit(train_x, train_y)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(test_x)
    
    # calculating accuracy score
    acc_Score = accuracy_score(predictions, test_y)
    class_Report = classification_report(test_y, predictions)
    auc_roc_Score = roc_auc_score(test_y, predictions)
    f1_Score = f1_score(test_y, predictions)
    
    return (acc_Score, class_Report, auc_roc_Score, f1_Score)

In [132]:
# Naive Bayes Classification Model

acc_Score, class_Report, auc_roc_Score, f1_Score = train_model(GaussianNB(), train_x, train_y, test_x, test_y)
print ("\n NB, Accuracy: \n", acc_Score)
print ("\n NB, Classification Report: \n", class_Report)
print ("\n NB, AUC_ROC Accuracy: \n", auc_roc_Score)
print ("\n NB, f1_Score: \n", f1_Score)


 NB, Accuracy: 
 0.7314685314685314

 NB, Classification Report: 
               precision    recall  f1-score   support

           0       0.69      0.30      0.42       230
           1       0.74      0.93      0.83       485

    accuracy                           0.73       715
   macro avg       0.71      0.62      0.62       715
weighted avg       0.72      0.73      0.70       715


 NB, AUC_ROC Accuracy: 
 0.6191842223218288

 NB, f1_Score: 
 0.825136612021858


In [133]:
# Linear Model for Classification

acc_Score, class_Report, auc_roc_Score, f1_Score = train_model(LogisticRegression(solver='lbfgs'), 
                                                               train_x, train_y, test_x, test_y)
print ("\n LR, Accuracy: \n", acc_Score)
print ("\n LR, Classification Report: \n", class_Report)
print ("\n LR, AUC_ROC Accuracy: \n", auc_roc_Score)
print ("\n LR, f1_Score: \n", f1_Score)


 LR, Accuracy: 
 0.737062937062937

 LR, Classification Report: 
               precision    recall  f1-score   support

           0       0.69      0.33      0.44       230
           1       0.74      0.93      0.83       485

    accuracy                           0.74       715
   macro avg       0.72      0.63      0.64       715
weighted avg       0.73      0.74      0.70       715


 LR, AUC_ROC Accuracy: 
 0.6290228597041685

 LR, f1_Score: 
 0.8278388278388279


In [134]:
# Ensemble Model for Classification

acc_Score, class_Report, auc_roc_Score, f1_Score = train_model(RandomForestClassifier(n_estimators=100), 
                                                               train_x, train_y, test_x, test_y)
print ("\n RF, Accuracy: \n", acc_Score)
print ("\n RF, Classification Report: \n", class_Report)
print ("\n RF, AUC_ROC Accuracy: \n", auc_roc_Score)
print ("\n RF, f1_Score: \n", f1_Score)


 RF, Accuracy: 
 0.7524475524475525

 RF, Classification Report: 
               precision    recall  f1-score   support

           0       0.64      0.51      0.57       230
           1       0.79      0.87      0.83       485

    accuracy                           0.75       715
   macro avg       0.72      0.69      0.70       715
weighted avg       0.74      0.75      0.74       715


 RF, AUC_ROC Accuracy: 
 0.6895114298520842

 RF, f1_Score: 
 0.8259587020648969


In [135]:
# Defining Common Function with GridSearchCV

def train_model_GridSearchCV(classifier, train_x, train_y, test_x, test_y, search_param):
    # define search
    search = GridSearchCV(classifier, search_param, scoring='f1', cv=10, error_score=0.0)
    
    # execute search
    search.fit(train_x, train_y)
    
    # predict the labels on validation dataset
    predictions = search.best_estimator_.predict(test_x)
    
    # calculating accuracy score
    f1_Score = search.best_score_
    best_Params = search.best_params_
    class_Report = classification_report(test_y, predictions)
    
    return (f1_Score, best_Params, class_Report)

In [136]:
# Linear Model for Classification

# Defining search parameters for Logistic Regression
search_param = dict()
search_param['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
search_param['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
search_param['C'] = [0.001, 0.01, 0.1, 1, 10, 100]

# Logistic Regression
f1_Score, best_Params, class_Report = train_model_GridSearchCV(LogisticRegression(), train_x, train_y, 
                                                               test_x, test_y, search_param)
print ("\n LR, f1_Score: \n", f1_Score)
print ("\n LR, Best Hyperparameters: \n", best_Params)
print ("\n LR, Classification Report: \n", class_Report)


 LR, f1_Score: 
 0.8448522930311158

 LR, Best Hyperparameters: 
 {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}

 LR, Classification Report: 
               precision    recall  f1-score   support

           0       0.69      0.35      0.47       230
           1       0.75      0.92      0.83       485

    accuracy                           0.74       715
   macro avg       0.72      0.64      0.65       715
weighted avg       0.73      0.74      0.71       715



In [137]:
# Ensemble Model for Classification

# Defining search parameters for Random Forest Classifier
search_param = dict()
search_param['n_estimators'] = [10, 100, 1000]
search_param['max_features'] = ['sqrt', 'log2']

# Random Forest Classifier
f1_Score, best_Params, class_Report = train_model_GridSearchCV(RandomForestClassifier(), train_x, train_y, 
                                                               test_x, test_y, search_param)
print ("\n RF, f1_Score: \n", f1_Score)
print ("\n RF, Best Hyperparameters: \n", best_Params)
print ("\n RF, Classification Report: \n", class_Report)


 RF, f1_Score: 
 0.8551448084061296

 RF, Best Hyperparameters: 
 {'max_features': 'log2', 'n_estimators': 1000}

 RF, Classification Report: 
               precision    recall  f1-score   support

           0       0.66      0.50      0.57       230
           1       0.79      0.88      0.83       485

    accuracy                           0.76       715
   macro avg       0.72      0.69      0.70       715
weighted avg       0.75      0.76      0.75       715



In [138]:
# Boosting Algorithm for Classification

# Defining search parameter for Adaboost
search_param = dict()
search_param['n_estimators'] = [10, 100, 1000]
search_param['learning_rate'] = [0.001, 0.01, 0.1]

# Random Forest Classifier
f1_Score, best_Params, class_Report = train_model_GridSearchCV(AdaBoostClassifier(), train_x, train_y, 
                                                               test_x, test_y, search_param)
print ("\n AB, f1_Score: \n", f1_Score)
print ("\n AB, Best Hyperparameters: \n", best_Params)
print ("\n AB, Classification Report: \n", class_Report)


 AB, f1_Score: 
 0.8557820908013722

 AB, Best Hyperparameters: 
 {'learning_rate': 0.1, 'n_estimators': 1000}

 AB, Classification Report: 
               precision    recall  f1-score   support

           0       0.71      0.47      0.56       230
           1       0.78      0.91      0.84       485

    accuracy                           0.77       715
   macro avg       0.74      0.69      0.70       715
weighted avg       0.76      0.77      0.75       715



In [139]:
# Boosting Algorithm for Classification

# Defining search parameter for Gredient Boosting
search_param = dict()
search_param['n_estimators'] = [10, 100, 1000]
search_param['learning_rate'] = [0.001, 0.01, 0.1]

# Random Forest Classifier
f1_Score, best_Params, class_Report = train_model_GridSearchCV(GradientBoostingClassifier(), train_x, train_y, 
                                                               test_x, test_y, search_param)
print ("\n GB, f1_Score: \n", f1_Score)
print ("\n GB, Best Hyperparameters: \n", best_Params)
print ("\n GB, Classification Report: \n", class_Report)


 GB, f1_Score: 
 0.863656443174569

 GB, Best Hyperparameters: 
 {'learning_rate': 0.1, 'n_estimators': 1000}

 GB, Classification Report: 
               precision    recall  f1-score   support

           0       0.67      0.60      0.64       230
           1       0.82      0.86      0.84       485

    accuracy                           0.78       715
   macro avg       0.75      0.73      0.74       715
weighted avg       0.77      0.78      0.77       715



# Predicting the Target variable on Test Data

In [140]:
# Common function to test model

def test_model(test_x, classifier, train_x, train_y):

    # fit the training dataset on the classifier
    classifier.fit(train_x, train_y)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(test_x)
    
    return (predictions)

In [141]:
raw_Data_test = pd.read_csv('/Users/pritigirishvankudre/DayX_Lockdown Practice/Emp_Attr_Pred/test.csv')
raw_Data_test.head()

Unnamed: 0,Emp_ID
0,394
1,173
2,1090
3,840
4,308


In [142]:
# To find and remove duplicate rows if any

duplicate = raw_Data_test[raw_Data_test.duplicated()]
print (duplicate)

Empty DataFrame
Columns: [Emp_ID]
Index: []


In [143]:
# Merging the information from original file

raw_Data_temp = pd.merge(raw_Data_test, raw_Data_grpby, how="inner", on=['Emp_ID'])
raw_Data_temp.drop(['Emp_ID','Target'], axis=1, inplace=True)
raw_Data_temp.head()

Unnamed: 0,Edu_Lvl_Encd,Joining Designation,Gender_Female,Gender_Male,Age,Designation,Emp_Tenure,Business_Earn,Business_Loss,Max_Sal,Increment,Max_Qua_Rating,Increment_Rating,City_Encd
0,2,2,1,0,34,4,2400,27458220,0,97722,0,3,0,12
1,0,1,0,1,39,3,2393,19866290,0,56174,0,4,0,20
2,0,2,0,1,39,4,2379,50382490,0,96750,0,4,0,4
3,0,1,1,0,40,4,2365,17134040,0,88813,0,2,0,27
4,2,2,0,1,32,5,2330,58024490,0,188418,0,4,0,24


In [90]:
# Bayesian Model for Classification to test model

prediction_NB = test_model(raw_Data_temp, GaussianNB(), train_x, train_y)
raw_Data_test['Target'] = pd.DataFrame(prediction_NB)
raw_Data_test.to_csv('NB_Test_2011_01_2301.csv', columns=['Emp_ID','Target'], header=True, index=False)
raw_Data_test.drop('Target', axis=1, inplace=True)

In [91]:
# Linear Model for Classification to test model with best hyper parameters

prediction_LR = test_model(raw_Data_temp, LogisticRegression(C=1, penalty= 'l1', solver= 'liblinear'), 
                           train_x, train_y)
raw_Data_test['Target'] = pd.DataFrame(prediction_LR)
raw_Data_test.to_csv('LR_Test_2011_02_2301.csv', columns=['Emp_ID','Target'], header=True, index=False)
raw_Data_test.drop('Target', axis=1, inplace=True)

In [92]:
# Ensemble Model for Classification to test model with best hyper parameters

prediction_RF = test_model(raw_Data_temp, RandomForestClassifier(max_features='sqrt', n_estimators=1000), 
                           train_x, train_y)
raw_Data_test['Target'] = pd.DataFrame(prediction_RF)
raw_Data_test.to_csv('RF_Test_2011_03_2301.csv', columns=['Emp_ID','Target'], header=True, index=False)
raw_Data_test.drop('Target', axis=1, inplace=True)

In [97]:
# Boosting Model for Classification to test model with best hyper parameters

prediction_AB = test_model(raw_Data_temp, AdaBoostClassifier(learning_rate= 0.1, n_estimators= 1000), 
                           train_x, train_y)
raw_Data_test['Target'] = pd.DataFrame(prediction_AB)
raw_Data_test.to_csv('AB_Test_2011_04_2301.csv', columns=['Emp_ID','Target'], header=True, index=False)
raw_Data_test.drop('Target', axis=1, inplace=True)

In [98]:
# Boosting Model for Classification to test model with best hyper parameters

prediction_GB = test_model(raw_Data_temp, GradientBoostingClassifier(learning_rate= 0.1, n_estimators= 1000), 
                           train_x, train_y)
raw_Data_test['Target'] = pd.DataFrame(prediction_GB)
raw_Data_test.to_csv('GB_Test_2011_05_2301.csv', columns=['Emp_ID','Target'], header=True, index=False)
raw_Data_test.drop('Target', axis=1, inplace=True)