# Import libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, confusion_matrix, f1_score, precision_recall_curve, roc_curve, plot_roc_curve, plot_precision_recall_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from lightgbm import LGBMClassifier, plot_importance

In [None]:
# Unzip archive
# Uses the standard-library zipfile module instead of the external `unzip`
# binary, so the cell also works where that tool is not installed (e.g. Windows).
# Existing files are overwritten, matching the previous `unzip -o` behaviour.
import zipfile

with zipfile.ZipFile('employee_promotion.csv.zip') as archive:
    archive.extractall()

In [None]:
# Load data
# Read the extracted CSV into a DataFrame; the bare `data` expression on the
# last line makes the notebook display the frame.
data = pd.read_csv('employee_promotion.csv')
data

## Dataset Overview
|        Column        |                                         Descriptions                                        |
|:--------------------:|:-------------------------------------------------------------------------------------------:|
| employee_id          | Unique ID for the employee                                                                  |
| department           | Department of employee                                                                      |
| region               | Region of employment(unordered)                                                             |
| education            | Education level                                                                             |
| gender               | Gender of Employee                                                                          |
| recruitment_channel  | Channel of recruitment for employee                                                         |
| no_of_trainings      | no of other trainings completed in the previous year on soft skills, technical skills, etc. |
| age                  | Age of Employee                                                                             |
| previous_year_rating | Employee Rating for the previous year                                                       |
| length_of_service    | Length of service in years                                                                  |
| awards_won           | if awards won during the previous year then 1 else 0                                        |
| avg_training_score   | Average score in current training evaluations                                               |
| is_promoted          | Recommended for promotion                                                                   |

# Data Preprocessing

In [None]:
# Count the missing (NaN / null) values in every column
data.isnull().sum()

In [None]:
# The dataset is large (over 10,000 rows), so rows that contain any
# NaN / null value are simply discarded and the index is renumbered from 0.
data = data.dropna(axis=0).reset_index(drop=True)
data

In [None]:
# The 'employee_id' column is not needed for the analysis, so remove it
data = data.drop(columns=['employee_id'])
data

In [None]:
# Convert 'gender' column into binary values ('f' -> 0, 'm' -> 1)
# A single vectorised map replaces the empty-Series placeholder and the two
# masked assignments. Any value other than 'f'/'m' becomes NaN, so the int
# cast fails loudly instead of silently producing a wrong code.
gender_codes = {'f': 0, 'm': 1}
data['gender_new'] = data['gender'].map(gender_codes).astype('int')
data.drop(columns=['gender'], inplace=True)
data

## Comment
Finally, we have a preprocessed DataFrame named 'data'.  
It contains no NaN or null values, so it is ready for machine learning.

# Data Visualization

In [None]:
# Check distributions of features for training by pie charts
# The grid is sized from the number of columns (4 charts per row), so the cell
# keeps working if columns are added or removed.
n_cols = 4
n_rows = -(-len(data.columns) // n_cols)  # ceiling division
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(40, 20), squeeze=False)

for ax, feature in zip(axs.ravel(), data.columns):
    # Series.value_counts replaces the deprecated top-level pd.value_counts
    data[feature].value_counts().plot.pie(autopct="%.1f%%", ax=ax)

# Hide any axes left over when the column count is not a multiple of n_cols
for ax in axs.ravel()[len(data.columns):]:
    ax.set_visible(False)

plt.suptitle('Distribution of features')
plt.tight_layout()

In [None]:
# Check distributions of features contain numbers by distplot
columns = ['gender_new', 'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'awards_won', 'avg_training_score', 'is_promoted']
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))

# One histogram/KDE per numeric column, filling the 2x4 grid row by row
for ax, feature in zip(axs.ravel(), columns):
    sns.distplot(data[feature], ax=ax)

plt.suptitle('Distirbution of features')
# tight_layout must be *called*; the bare attribute reference did nothing
plt.tight_layout()

## Comment
As you can see above, none of the columns follows a normal distribution, which is generally preferable for training.  
Therefore, we can try transforming the following columns to bring them closer to a normal distribution: 'age', 'length_of_service' and 'avg_training_score'.

In [None]:
# Display the current state of the DataFrame before the log transformation
data

In [None]:
# Log Transformation
# log1p (log(1 + x)) is applied to the three skewed numeric columns.
age_log = np.log1p(data['age'])
service_log = np.log1p(data['length_of_service'])
score_log = np.log1p(data['avg_training_score'])

# Insert each transformed column directly after its source column.
# The position is looked up instead of hard-coded, and existing columns are
# skipped, so the cell can be re-run without raising "already exists".
log_features = [
    ('age', 'age_log', age_log),
    ('length_of_service', 'length_of_service_log', service_log),
    ('avg_training_score', 'avg_training_score_log', score_log),
]
for source_column, log_column, log_values in log_features:
    if log_column not in data.columns:
        data.insert(data.columns.get_loc(source_column) + 1, log_column, log_values)

data

In [None]:
# Plot the distribution of each log-transformed column side by side
log_columns = ['age_log', 'length_of_service_log', 'avg_training_score_log']

fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 10))

# (column, target axes, panel title) for every panel, left to right
panels = (
    ('age_log', ax1, 'Distribution of age_log'),
    ('length_of_service_log', ax2, 'Distribution of length_of_service_log'),
    ('avg_training_score_log', ax3, 'Distribution of avg_training_score_log'),
)
for panel_column, panel_ax, panel_title in panels:
    sns.distplot(data[panel_column], ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.suptitle('Distribution of log converted features', fontweight='bold')
plt.tight_layout()
plt.show()

## Comment
'age_log' became nearly normally distributed, but the other features did not.  
Log transformation is one of the most powerful strategies for preparing data for training.

In [None]:
# Correlation Heatmap
plt.figure(figsize=(10, 8))
plt.title('Correlation of features')
# Correlation is only defined for numeric columns. Selecting them explicitly
# avoids the error pandas >= 2.0 raises when corr() meets string columns
# (older versions dropped them silently).
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), annot=True, linewidths=.5, cmap="YlGnBu")

## Comment
As I anticipated, 'age', 'length_of_service' and 'avg_training_score' were highly correlated.  
So, maybe I can try decomposition (dimensionality reduction) for those features (to be continued).

# Split Datasets

In [None]:
# Get One-Hot encoded DataFrame
# pd.get_dummies expands the remaining string/categorical columns into
# indicator columns; the result keeps the same row index as `data`.
data_oh = pd.get_dummies(data)
data_oh

In [None]:
# Scaling of features
# Standardise every feature column (zero mean, unit variance) but leave the
# 'is_promoted' label alone: a standardised label is no longer 0/1, and a
# later cast to int would truncate it into wrong class values.
# NOTE(review): the scalers are fitted on the full dataset before the
# train/test split below, so test-set statistics leak into the features;
# consider fitting the scaler on the training split only.
features = np.array(data_oh.columns.drop('is_promoted')).reshape(-1, 1)

for feature in features:
    # `feature` is a one-element array, so data_oh[feature] is a one-column
    # DataFrame (the 2-D input StandardScaler expects)
    scaler = StandardScaler()
    data_oh[feature] = scaler.fit_transform(data_oh[feature])

data_oh

In [None]:
# Define features and label for training
train_features = data_oh.drop(columns=['is_promoted'], inplace=False)
# Read the label from the unscaled `data` frame (it shares its index with
# `data_oh`), so the target is guaranteed to be the raw 0/1 values and not
# something altered by the scaling step.
train_label = data['is_promoted'].astype(int)

In [None]:
# Hold out 20% of the rows as the test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    train_label,
    test_size=0.2,
    random_state=11,
)

# Report the shape of each of the four resulting splits
shape_reports = (
    ('Shape of X_train: ', X_train),
    ('Shape of X_test: ', X_test),
    ('Shape of y_train: ', y_train),
    ('Shape of y_test: ', y_test),
)
for shape_message, split_part in shape_reports:
    print(shape_message, split_part.shape)

# Classification

## Basic Estimators

In [None]:
# Utility Function
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and macro-averaged metrics of a classifier.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels. Required, even though the signature keeps the
        historical ``None`` default.
    pred_proba : array-like, optional
        Scores for the positive class, used for ROC AUC. When omitted, the
        AUC figure is left out of the report.

    Raises
    ------
    TypeError
        If ``pred`` is not supplied.
    """
    if pred is None:
        # Fail early with a clear message instead of deep inside a metric call
        raise TypeError("get_clf_eval() requires 'pred' (the predicted labels)")

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average="macro")
    recall = recall_score(y_test, pred, average="macro")
    f1 = f1_score(y_test, pred, average="macro")

    summary = 'Accuracy: {0:.4f}, Precision: {1:.4f}, Recall {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1)
    if pred_proba is not None:
        # AUC needs scores rather than hard labels, so it is only reported
        # when the caller provides them
        roc_auc = roc_auc_score(y_test, pred_proba, average="macro")
        summary += ', AUC:{0:.4f}'.format(roc_auc)

    print('Confusion Matrix')
    print(confusion)
    print(summary)

In [None]:
# Fit, predict and evaluate three baseline estimators:
# decision tree, logistic regression and random forest
# Create Estimator Class
# The tree-based models are seeded (same seed as the train/test split) so that
# re-running the notebook reproduces the same scores.
dt_clf = DecisionTreeClassifier(random_state=11)
lr_clf = LogisticRegression()
rf_clf = RandomForestClassifier(random_state=11)

# Fitting
dt_clf.fit(X_train, y_train)
lr_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)

# Prediction
dt_pred = dt_clf.predict(X_test)
lr_pred = lr_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)

# Pred_Proba (second column of predict_proba, used for ROC AUC)
dt_pred_proba = dt_clf.predict_proba(X_test)[:, 1]
lr_pred_proba = lr_clf.predict_proba(X_test)[:, 1]
rf_pred_proba = rf_clf.predict_proba(X_test)[:, 1]

# Evaluation (each report is labelled so the three outputs can be told apart)
print('DecisionTreeClassifier')
get_clf_eval(y_test, dt_pred, dt_pred_proba)
print('LogisticRegression')
get_clf_eval(y_test, lr_pred, lr_pred_proba)
print('RandomForestClassifier')
get_clf_eval(y_test, rf_pred, rf_pred_proba)

In [None]:
# Draw one precision-recall curve per baseline model (a figure each)
# NOTE(review): plot_precision_recall_curve was deprecated in scikit-learn 1.0
# and removed in 1.2; PrecisionRecallDisplay.from_estimator is its replacement.
for baseline_model in (dt_clf, lr_clf, rf_clf):
    plot_precision_recall_curve(baseline_model, X_test, y_test)
plt.show()

In [None]:
# Draw one ROC curve per baseline model (a figure each)
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2; RocCurveDisplay.from_estimator is its replacement.
for baseline_model in (dt_clf, lr_clf, rf_clf):
    plot_roc_curve(baseline_model, X_test, y_test)
plt.show()

### Comment
As you can see above, LogisticRegression and RandomForestClassifier are both basic models but really powerful

## LightGBM

In [None]:
# Create a LightGBM estimator, fit it on the training split, predict on the
# test split, print the evaluation report and draw the ROC curve.
# NOTE(review): the previous comment (and the `_over` suffix below) referred to
# SMOTE oversampling, but no oversampling step is visible in this notebook;
# the model is trained on the plain X_train / y_train.
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average=False)

lgbm_clf.fit(X_train, y_train)
lgbm_preds_over = lgbm_clf.predict(X_test)
# Second column of predict_proba, used for ROC AUC
lgbm_pred_proba = lgbm_clf.predict_proba(X_test)[:, 1]

get_clf_eval(y_test, lgbm_preds_over, lgbm_pred_proba)
plot_roc_curve(lgbm_clf, X_test, y_test)

In [None]:
# Create estimator and process fitting (with early stopping), prediction and evaluation for model
lgbm_wrapper = LGBMClassifier(n_estimators=400, num_leaves=64, n_jobs=-1, boost_from_average=False)

# Early stopping monitors the eval set, so it must not be the test set:
# hold out a validation part of the training data and keep the test split
# for the final evaluation only.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=11, stratify=y_train)

evals = [(X_val, y_val)]
# The patience must be smaller than n_estimators (400); the previous value of
# 1200 could never trigger an early stop.
lgbm_wrapper.fit(X_tr, y_tr, early_stopping_rounds=100, eval_metric='logloss', eval_set=evals, verbose=True)
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, preds, pred_proba)

In [None]:
# Plot Feature importance
# Draw LightGBM's feature-importance chart for `lgbm_wrapper` on a tall
# figure so the feature labels have room.
fig, ax = plt.subplots(figsize=(10, 12))
plot_importance(lgbm_wrapper, ax=ax)

### Comment
LightGBM is another powerful model, based on gradient boosting (trees are fitted sequentially, each one correcting the errors of the previous ones).  
It is lighter than GradientBoostingClassifier (literally).  
In particular, its AUC score was the highest compared to the basic estimators.

## Stacking Ensemble

In [None]:
# Create individual ML models (the base learners of the stacking ensemble)
# Every stochastic model gets the same seed as the RandomForest so that the
# ensemble's scores are reproducible across runs.
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=11)
dt_clf = DecisionTreeClassifier(random_state=11)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=11)

# Create the meta-model, which is fitted on the base models' predictions
lr_final = LogisticRegression(C=10)

In [None]:
# Fit every base model of the stacking ensemble on the training split
knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)

In [None]:
# Predict on the test split with every base model, then report each accuracy

knn_pred = knn_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
dt_pred = dt_clf.predict(X_test)
ada_pred = ada_clf.predict(X_test)

# Compute the scores first, then print them in the same order
knn_accuracy = accuracy_score(y_test, knn_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
dt_accuracy = accuracy_score(y_test, dt_pred)
ada_accuracy = accuracy_score(y_test, ada_pred)

print('Accuracy Score of KNN: {0:.4f}'.format(knn_accuracy))
print('Accuracy Score of RandomForestClassifier: {0:.4f}'.format(rf_accuracy))
print('Accuracy Score of DeicisionTreeClassifier: {0:.4f}'.format(dt_accuracy))
print('Accuracy Score of AdaBoostClassifier: {0:.4f}'.format(ada_accuracy))

In [None]:
# Stack the four prediction vectors row-wise: one row per base model
pred = np.vstack((knn_pred, rf_pred, dt_pred, ada_pred))
print(pred.shape)

# Flip to one row per sample and one column per base model, i.e. the feature
# layout the meta-model expects
pred = pred.T
print(pred.shape)

In [None]:
# Fit, Predict, Evaluate for final model
# The meta-model must never see the test labels: fitting it on (pred, y_test)
# and scoring it on the same rows makes the reported accuracy meaningless.
# Instead it is trained on out-of-fold predictions of the base models on the
# training split, and the test split is used only for the final score.
from sklearn.model_selection import cross_val_predict

# Same column order as `pred`: knn, random forest, decision tree, adaboost.
# cross_val_predict fits clones, so the already-fitted base models are untouched.
base_models = [knn_clf, rf_clf, dt_clf, ada_clf]
meta_train = np.column_stack([
    cross_val_predict(base_model, X_train, y_train, cv=5)
    for base_model in base_models
])

lr_final.fit(meta_train, y_train)
final = lr_final.predict(pred)

print('Accuracy Score of Final Model: {0:.4f}'.format(accuracy_score(y_test, final)))

# Conclusion
**Accuracy Score of Final Model: 0.9428**

1. Through training basic estimators, LightGBM and ensemble models, I could try lots of estimators on this dataset.  
2. In particular, the label for training was binary (0 or 1), so the evaluation scores seem much higher than those from multi-class classification.  
3. Also, the ensemble and nearest-neighbor models stood out; therefore, visualizing the data with scatter plots could be another good way to analyze it.

Thanks for reviewing my notebooks  
If you'd like it, please give me upvotes and leave comments  
Any questions or comments are always welcome