In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [73]:
# Load the pre-split training and test sets from disk.
df_train = pd.read_csv("Train_data.csv")
df_test = pd.read_csv("Test_data.csv")

# Separate the predictor columns from the "Target" label in each set.
X_train, y_train = df_train.drop(columns="Target"), df_train["Target"]
X_test, y_test = df_test.drop(columns="Target"), df_test["Target"]

In [74]:
# Sanity check: dimensions of the training features and training labels.
X_train.shape, y_train.shape

((3722, 12), (3722,))

In [75]:
# Sanity check: dimensions of the test features and test labels.
X_test.shape, y_test.shape

((931, 12), (931,))

In [76]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [77]:
# Fix the seed: a decision tree permutes the features at every split, so
# without a random_state ties between equally good splits can be broken
# differently on each run and the grid-search ranking is not reproducible.
dtc_model = DecisionTreeClassifier(random_state=42)

In [78]:
# Search space: both split criteria crossed with the even tree depths 2..12.
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": list(range(2, 13, 2)),
}

In [79]:
# Exhaustive search over param_grid for the decision tree; every other
# GridSearchCV option is left at its default.
cv = GridSearchCV(estimator=dtc_model, param_grid=param_grid)

In [80]:
# Run the grid search over param_grid on the full training feature set.
cv.fit(X_train, y_train)

In [81]:
# Parameter combination selected by the grid search.
cv.best_params_

{'criterion': 'gini', 'max_depth': 6}

In [82]:
# Summarise the grid search: one row per parameter combination, best rank first.
summary_cols = [
    "param_criterion",
    "param_max_depth",
    "params",
    "mean_test_score",
    "rank_test_score",
]
cv_results = pd.DataFrame(cv.cv_results_)
cv_results[summary_cols].sort_values(by="rank_test_score")

Unnamed: 0,param_criterion,param_max_depth,params,mean_test_score,rank_test_score
2,gini,6,"{'criterion': 'gini', 'max_depth': 6}",0.841757,1
8,entropy,6,"{'criterion': 'entropy', 'max_depth': 6}",0.841756,2
10,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.839341,3
3,gini,8,"{'criterion': 'gini', 'max_depth': 8}",0.839074,4
9,entropy,8,"{'criterion': 'entropy', 'max_depth': 8}",0.837998,5
11,entropy,12,"{'criterion': 'entropy', 'max_depth': 12}",0.832888,6
4,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.832623,7
5,gini,12,"{'criterion': 'gini', 'max_depth': 12}",0.827517,8
1,gini,4,"{'criterion': 'gini', 'max_depth': 4}",0.803066,9
7,entropy,4,"{'criterion': 'entropy', 'max_depth': 4}",0.800377,10


##### It can be seen that, with this dataset, the criterion hyperparameter had almost no effect on the results.
##### The maximum depth of the tree, however, plays a very important role in the accuracy score.


In [83]:
# Testing the model: predict the test-set labels with the fitted grid search.
predictions = cv.predict(X_test)

In [84]:
from sklearn.metrics import classification_report, confusion_matrix

In [85]:
# Per-class precision, recall and F1 of the test-set predictions.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89       611
           1       0.82      0.72      0.77       320

    accuracy                           0.85       931
   macro avg       0.84      0.82      0.83       931
weighted avg       0.85      0.85      0.85       931



In [86]:
# Confusion matrix of the test labels against the predictions.
confusion_matrix(y_test, predictions)

array([[562,  49],
       [ 91, 229]])

##### Our model also performed well on the test set. The likely reason for the lower recall on samples labeled "1" is that there are fewer samples labeled "1" (imbalanced dataset).

#### Creating the model again with the removal of some features.

In [87]:
#Pearson Correlation Function
def cor_selector(X, y, num_feats):
    """Return the names of the features most correlated with the target.

    The columns of ``X`` are ranked by the absolute value of their
    correlation with ``y`` (``DataFrame.corrwith``, which computes the
    Pearson coefficient by default) and the strongest ``num_feats`` are kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : pandas.Series
        Target values to correlate each column of ``X`` against.
    num_feats : int
        Number of feature names to return.

    Returns
    -------
    list
        Column labels of ``X``, strongest absolute correlation first.
    """
    # Rank on magnitude only: a strong negative correlation is as useful
    # for selection as a strong positive one.
    abs_corr = X.corrwith(y).abs()
    strongest = abs_corr.sort_values(ascending=False).head(num_feats)
    return strongest.index.tolist()

In [88]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [89]:
# Refit the grid search on the i most target-correlated features for
# i = 1 .. n_features - 1 and record which features were used together with
# the resulting scores. The range stops before the full feature set.
metric_columns = X_train.columns.tolist() + ['Feature_Count','Train_Accuracy', 'Test_Accuracy']
metric_rows = []  # one record (dict) per feature-subset size

for i in range(1, len(X_train.columns)):
    features = cor_selector(X_train, y_train, i) #i = number of features to be selected
    new_X_train = X_train[features]
    new_X_test = X_test[features]
    cv.fit(new_X_train, y_train)

    # Mark every selected feature with 1; features that were not selected
    # have no key here and therefore end up as missing values in the frame.
    my_dict = {item: 1 for item in features}
    predictions = cv.predict(new_X_test)
    my_dict['Test_Accuracy'] = accuracy_score(y_test, predictions)
    # NOTE: best_score_ is the mean cross-validated score of the best
    # parameter combination, not accuracy measured on the training data.
    # The 'Train_Accuracy' label is kept because a later cell sorts by it.
    my_dict['Train_Accuracy'] = cv.best_score_
    my_dict['Feature_Count'] = len(features)
    metric_rows.append(my_dict)

# Build the frame once from the collected records instead of growing it row
# by row; `columns` fixes the column order and adds the unselected features.
metrics = pd.DataFrame(metric_rows, columns=metric_columns)
# The metrics dataframe now shows the selected features and the scores per run.

In [90]:
# Replace missing entries (features not selected in a given run) with 0.
metrics = metrics.fillna(0)
# Show the runs ordered by the 'Train_Accuracy' column, highest first.
metrics.sort_values('Train_Accuracy',ascending=False)

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,Education_Bachelors,Education_Masters,Education_PHD,City_Bangalore,City_New Delhi,City_Pune,Gender_Male,EverBenched_Yes,Feature_Count,Train_Accuracy,Test_Accuracy
8,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1,1.0,9,0.843099,0.857143
10,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1,1.0,11,0.840952,0.849624
9,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1,1.0,10,0.84095,0.849624
6,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1,0.0,7,0.832352,0.845328
5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1,0.0,6,0.832351,0.844253
7,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1,1.0,8,0.832351,0.850698
4,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,0.0,5,0.827783,0.846402
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0.0,4,0.819187,0.846402
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0.0,3,0.808708,0.817401
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0.0,2,0.736968,0.747583


#### As a result, the model performs well all the way down to 3 features: "City_Pune", "JoiningYear", and "Gender_Male".

### Final model with 3 features

In [91]:
# Name the reduced feature set once so the train and test frames are
# sliced with the same columns in the same order.
features_3 = ["City_Pune", "JoiningYear", "Gender_Male"]
X_train_3feats = X_train[features_3]
X_test_3feats = X_test[features_3]

In [92]:
# Refit the same grid search on the three-feature training set
# (this replaces the previous fit held by cv).
cv.fit(X_train_3feats, y_train)

In [93]:
# Parameter combination selected for the three-feature model.
cv.best_params_

{'criterion': 'gini', 'max_depth': 4}

In [94]:
# Predict the test-set labels using only the three selected features.
predictions_3feats = cv.predict(X_test_3feats) 

In [95]:
# Per-class precision, recall and F1 of the three-feature predictions.
print(classification_report(y_test, predictions_3feats))

              precision    recall  f1-score   support

           0       0.80      0.96      0.87       611
           1       0.89      0.54      0.67       320

    accuracy                           0.82       931
   macro avg       0.84      0.75      0.77       931
weighted avg       0.83      0.82      0.80       931



In [96]:
# Confusion matrix of the test labels against the three-feature predictions.
confusion_matrix(y_test, predictions_3feats)

array([[589,  22],
       [148, 172]])

#### After reducing the number of features, the model began to produce more false negatives (148 vs. 91 samples labeled '1' predicted as '0'), while false positives actually decreased (22 vs. 49). It still maintains good accuracy for samples labeled '0', but since we are attempting to predict employees who plan to leave the company, using the full feature set gives better recall for samples labeled '1' (0.72 vs. 0.54 on this test set).