In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')
from pandas_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier 
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

In [3]:
tr = pd.read_csv('train.csv')
va = pd.read_csv('test.csv')

In [3]:
profile_train = ProfileReport(tr,title = "Train Report")
profile_validation = ProfileReport(va,title = "Validation Report")

In [4]:
profile_train

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [5]:
profile_validation

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [4]:
combined = pd.concat([tr,va],0,ignore_index=True)

In [5]:
combined.education = combined.education.fillna(combined.education.mode()[0])

In [6]:
combined.previous_year_rating = combined.previous_year_rating.fillna(combined.previous_year_rating.mode()[0])

In [7]:
combined.gender = combined.gender.map({'m':1,'f':0})

In [8]:
combined.education.value_counts()

Bachelor's          55690
Master's & above    21429
Below Secondary      1179
Name: education, dtype: int64

In [9]:
combined.education = combined.education.map({'Below Secondary':0,'Bachelor\'s':1,'Master\'s & above':2})

In [10]:
combined.recruitment_channel.value_counts()

other       43524
sourcing    33181
referred     1593
Name: recruitment_channel, dtype: int64

In [11]:
combined.recruitment_channel = combined.recruitment_channel.map({'other':0,'sourcing':1,'referred':2})

In [12]:
numeric_cols = combined.select_dtypes(exclude=['object'])
categorical_cols = combined.select_dtypes(include=['object'])
cat_le = categorical_cols.apply(LabelEncoder().fit_transform)  
combined = pd.concat([numeric_cols, cat_le], axis=1)    

In [13]:
combined.head()

Unnamed: 0,employee_id,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department,region
0,65438,2,0,1,1,35,5.0,8,1,0,49,0.0,7,31
1,65141,1,1,0,1,30,5.0,4,0,0,60,0.0,4,14
2,7513,1,1,1,1,34,3.0,7,0,0,50,0.0,7,10
3,2542,1,1,0,2,39,1.0,10,0,0,50,0.0,7,15
4,48945,1,1,0,1,45,3.0,2,0,0,73,0.0,8,18


In [14]:
train = combined[combined.is_promoted.isnull()==False]
validate = combined[combined.is_promoted.isnull()==True]

In [15]:
train.drop('employee_id',1,inplace=True)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('is_promoted',1), train.is_promoted, test_size=0.2, random_state=42)

In [17]:
def calc(y_pred, y_true): 
    acc = accuracy_score(y_true, y_pred) 
    pre = precision_score(y_true, y_pred) 
    rec = recall_score(y_true, y_pred) 
    conf = confusion_matrix(y_true, y_pred) 
    f1 = f1_score(y_true, y_pred) 
    
    print("Accuracy:- ", acc)  
    print('------------------------------------------------------------------------------')
    print("Precision Score:- ", pre) 
    print('------------------------------------------------------------------------------')
    print("Recall Score:- ", rec) 
    print('------------------------------------------------------------------------------')
    print("Confution Matrx \n", conf) 
    print('------------------------------------------------------------------------------')
    print("F1 Score", f1) 


In [76]:
param_dict = {
    "criterion":['gini', 'entropy'],
    "max_depth":range(1,10), 
    "min_samples_split":range(1,10),
    "min_samples_leaf":range(1,5)
} 
dtc = DecisionTreeClassifier() 

grid = GridSearchCV(dtc, param_grid=param_dict, cv=5, verbose=1, n_jobs=-1) 
grid.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   44.5s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 3240 out of 3240 | elapsed:  4.0min finished


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 10),
                         'min_samples_leaf': range(1, 5),
                         'min_samples_split': range(1, 10)},
             verbose=1)

In [78]:
pred_dtc = grid.best_estimator_.predict(X_test)
calc(pred_dtc,y_test)

Accuracy:-  0.936325488049626
------------------------------------------------------------------------------
Precision Score:-  0.8697183098591549
------------------------------------------------------------------------------
Recall Score:-  0.2720264317180617
------------------------------------------------------------------------------
Confution Matrx 
 [[10017    37]
 [  661   247]]
------------------------------------------------------------------------------
F1 Score 0.41442953020134227


In [79]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the param grid

param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_Model = RandomForestClassifier()

rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 3, verbose=2, n_jobs = 4) 
rf_Grid.fit(X_train, y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   21.3s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done 960 out of 960 | elapsed:  7.5min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 4],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72,
                                          80]},
             verbose=2)

In [81]:
pred_rfc = rf_Grid.best_estimator_.predict(X_test)
calc(pred_rfc,y_test)

Accuracy:-  0.9258347016967706
------------------------------------------------------------------------------
Precision Score:-  0.9439252336448598
------------------------------------------------------------------------------
Recall Score:-  0.11123348017621146
------------------------------------------------------------------------------
Confution Matrx 
 [[10048     6]
 [  807   101]]
------------------------------------------------------------------------------
F1 Score 0.19901477832512318


In [18]:
lgb = LGBMClassifier(subsample_freq = 2, objective ="binary",importance_type = "gain",
                                             verbosity = -1, max_bin = 60,num_leaves = 300,
                                             boosting_type = 'dart',learning_rate=0.15, 
                                             n_estimators=494, max_depth=5, scale_pos_weight=2.5)
lgb.fit(X_train, y_train)

lgb_pred = lgb.predict(X_test)

print("Training Accuracy :", lgb.score(X_train, y_train))
calc(lgb_pred, y_test)

Training Accuracy : 0.946494549103681
Accuracy:-  0.9360518153621602
------------------------------------------------------------------------------
Precision Score:-  0.6956521739130435
------------------------------------------------------------------------------
Recall Score:-  0.4052863436123348
------------------------------------------------------------------------------
Confution Matrx 
 [[9893  161]
 [ 540  368]]
------------------------------------------------------------------------------
F1 Score 0.5121781489213639


In [19]:
cbc = CatBoostClassifier(learning_rate=0.15, n_estimators=494, subsample=0.085, 
                                                 max_depth=5, scale_pos_weight=2.5)
cbc.fit(X_train, y_train)

cbc_pred = cbc.predict(X_test)

print("Training Accuracy :", cbc.score(X_train, y_train))

0:	learn: 0.5645572	total: 159ms	remaining: 1m 18s
1:	learn: 0.4991436	total: 176ms	remaining: 43.2s
2:	learn: 0.4566770	total: 196ms	remaining: 32.1s
3:	learn: 0.4364418	total: 216ms	remaining: 26.5s
4:	learn: 0.3906334	total: 234ms	remaining: 22.9s
5:	learn: 0.3822528	total: 252ms	remaining: 20.5s
6:	learn: 0.3757616	total: 270ms	remaining: 18.8s
7:	learn: 0.3628092	total: 288ms	remaining: 17.5s
8:	learn: 0.3550671	total: 305ms	remaining: 16.4s
9:	learn: 0.3430419	total: 324ms	remaining: 15.7s
10:	learn: 0.3358510	total: 340ms	remaining: 14.9s
11:	learn: 0.3306956	total: 367ms	remaining: 14.7s
12:	learn: 0.3285883	total: 385ms	remaining: 14.3s
13:	learn: 0.3186626	total: 405ms	remaining: 13.9s
14:	learn: 0.3174693	total: 423ms	remaining: 13.5s
15:	learn: 0.3142572	total: 440ms	remaining: 13.1s
16:	learn: 0.3134095	total: 457ms	remaining: 12.8s
17:	learn: 0.3108266	total: 473ms	remaining: 12.5s
18:	learn: 0.3085301	total: 491ms	remaining: 12.3s
19:	learn: 0.3068804	total: 508ms	remain

167:	learn: 0.2531259	total: 4.08s	remaining: 7.92s
168:	learn: 0.2530706	total: 4.1s	remaining: 7.89s
169:	learn: 0.2528855	total: 4.12s	remaining: 7.86s
170:	learn: 0.2528272	total: 4.14s	remaining: 7.82s
171:	learn: 0.2527358	total: 4.16s	remaining: 7.78s
172:	learn: 0.2525957	total: 4.17s	remaining: 7.74s
173:	learn: 0.2525250	total: 4.19s	remaining: 7.71s
174:	learn: 0.2524298	total: 4.21s	remaining: 7.67s
175:	learn: 0.2522848	total: 4.22s	remaining: 7.63s
176:	learn: 0.2521849	total: 4.25s	remaining: 7.61s
177:	learn: 0.2520920	total: 4.27s	remaining: 7.58s
178:	learn: 0.2520187	total: 4.29s	remaining: 7.54s
179:	learn: 0.2519683	total: 4.3s	remaining: 7.51s
180:	learn: 0.2518642	total: 4.32s	remaining: 7.47s
181:	learn: 0.2517354	total: 4.34s	remaining: 7.43s
182:	learn: 0.2516467	total: 4.36s	remaining: 7.41s
183:	learn: 0.2515389	total: 4.38s	remaining: 7.38s
184:	learn: 0.2515168	total: 4.39s	remaining: 7.34s
185:	learn: 0.2514534	total: 4.42s	remaining: 7.32s
186:	learn: 0.

327:	learn: 0.2378032	total: 7.12s	remaining: 3.6s
328:	learn: 0.2377669	total: 7.15s	remaining: 3.58s
329:	learn: 0.2377186	total: 7.17s	remaining: 3.56s
330:	learn: 0.2376120	total: 7.18s	remaining: 3.54s
331:	learn: 0.2374934	total: 7.2s	remaining: 3.51s
332:	learn: 0.2374334	total: 7.21s	remaining: 3.49s
333:	learn: 0.2373043	total: 7.23s	remaining: 3.46s
334:	learn: 0.2372683	total: 7.25s	remaining: 3.44s
335:	learn: 0.2371874	total: 7.27s	remaining: 3.42s
336:	learn: 0.2371627	total: 7.29s	remaining: 3.39s
337:	learn: 0.2370615	total: 7.31s	remaining: 3.37s
338:	learn: 0.2369994	total: 7.33s	remaining: 3.35s
339:	learn: 0.2369633	total: 7.34s	remaining: 3.33s
340:	learn: 0.2368879	total: 7.36s	remaining: 3.3s
341:	learn: 0.2367432	total: 7.38s	remaining: 3.28s
342:	learn: 0.2363569	total: 7.39s	remaining: 3.25s
343:	learn: 0.2362230	total: 7.42s	remaining: 3.23s
344:	learn: 0.2361636	total: 7.44s	remaining: 3.21s
345:	learn: 0.2360524	total: 7.46s	remaining: 3.19s
346:	learn: 0.2

490:	learn: 0.2256451	total: 10.7s	remaining: 65.4ms
491:	learn: 0.2256302	total: 10.7s	remaining: 43.6ms
492:	learn: 0.2255534	total: 10.8s	remaining: 21.8ms
493:	learn: 0.2254785	total: 10.8s	remaining: 0us
Training Accuracy : 0.945536651005793


In [20]:
calc(y_test, cbc_pred)

Accuracy:-  0.9360518153621602
------------------------------------------------------------------------------
Precision Score:-  0.41740088105726875
------------------------------------------------------------------------------
Recall Score:-  0.6878402903811253
------------------------------------------------------------------------------
Confution Matrx 
 [[9882  529]
 [ 172  379]]
------------------------------------------------------------------------------
F1 Score 0.5195339273474984


In [26]:
final_pred = cbc.predict(validate.drop(['employee_id','is_promoted'],1))
final_pred = final_pred.astype(int)

In [27]:
final_pred

array([0, 0, 0, ..., 0, 0, 1])

In [28]:
Result_Promoted = pd.DataFrame({'employee_id': validate["employee_id"], 'is_promoted' : final_pred})
Result_Promoted

Unnamed: 0,employee_id,is_promoted
54808,8724,0
54809,74430,0
54810,72255,0
54811,38562,0
54812,64486,0
...,...,...
78293,53478,0
78294,25600,0
78295,45409,0
78296,1186,0


In [30]:
file_name = "Submission"
pd.DataFrame(Result_Promoted).to_csv(file_name+".csv",index=False)

In [31]:
lgb_pred = lgb.predict(validate.drop(['employee_id','is_promoted'],1))
lgb_pred = lgb_pred.astype(int)

In [32]:
Result_Promoted2 = pd.DataFrame({'employee_id': validate["employee_id"], 'is_promoted' : lgb_pred})
Result_Promoted2

Unnamed: 0,employee_id,is_promoted
54808,8724,0
54809,74430,0
54810,72255,0
54811,38562,0
54812,64486,0
...,...,...
78293,53478,0
78294,25600,0
78295,45409,0
78296,1186,0


In [33]:
file_name = "Submission_lgb"
pd.DataFrame(Result_Promoted2).to_csv(file_name+".csv",index=False)