In [None]:
import warnings
warnings.filterwarnings('ignore')

import sqlite3
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Connect to the SQLite database file "employees_df" (relative to the working
# directory) and create a cursor for issuing raw SQL statements.

con = sqlite3.connect("employees_df")
cur = con.cursor()

In [None]:
# Load the raw employee records and copy the row index into an explicit
# EmployeeIndex column, which the SQL join further down uses as its key.
employees = pd.read_csv("Resources/Employee.csv").assign(
    EmployeeIndex=lambda frame: frame.index
)
employees

In [None]:
# Store the target column, keyed by EmployeeIndex, in the `Predicated` table.
predicated_df = employees[['EmployeeIndex', 'LeaveOrNot']]

# Issued through `cur` on purpose: executing a new statement on the cursor
# finishes whatever query it still had open, so the table replacement below
# is not blocked by a pending read on this connection.
cur.execute('CREATE TABLE IF NOT EXISTS Predicated (EmployeeIndex INTEGER, LeaveOrNot INTEGER)')
con.commit()

# if_exists='replace' drops and recreates the table on every run, so
# re-executing this cell never duplicates rows.
predicated_df.to_sql('Predicated', con, if_exists='replace', index=False)
con.commit()

# Preview the stored rows through pandas. Unlike a bare
# cur.execute('SELECT ...'), this fetches the result and closes its cursor,
# so the cell shows data and no read statement is left open on the connection.
pd.read_sql_query('SELECT * FROM Predicated LIMIT 5', con)

In [None]:
# Store the predictor columns, keyed by EmployeeIndex, in the `Features` table.
features_df = employees[[
    'EmployeeIndex',
    'Education',
    'JoiningYear',
    'City',
    'PaymentTier',
    'Age',
    'Gender',
    'EverBenched',
    'ExperienceInCurrentDomain',
]]

# Issued through `cur` on purpose: executing a new statement on the cursor
# finishes whatever query it still had open, so the table replacement below
# is not blocked by a pending read on this connection.
cur.execute('CREATE TABLE IF NOT EXISTS Features (EmployeeIndex INTEGER, Education TEXT, JoiningYear TEXT, City TEXT, PaymentTier INTEGER, Age INTEGER, Gender TEXT, EverBenched TEXT, ExperienceInCurrentDomain INTEGER)')
con.commit()

# if_exists='replace' drops and recreates the table on every run, so
# re-executing this cell never duplicates rows.
features_df.to_sql('Features', con, if_exists='replace', index=False)
con.commit()

# Preview the stored rows through pandas. Unlike a bare
# cur.execute('SELECT ...'), this fetches the result and closes its cursor,
# so no read statement is left open on the connection.
pd.read_sql_query('SELECT * FROM Features LIMIT 5', con)

In [None]:
# Switch to a fresh connection for the join/query cells below.
# Close the previous cursor and connection first so that any statement still
# open on the old cursor is finished and the old handle is not leaked when
# the names are rebound.
cur.close()
con.close()

con = sqlite3.connect('employees_df')
cur = con.cursor()

In [None]:
# Materialise the joined EMPLOYEES table (features + target) inside SQLite.
#
# The database file persists between runs, and a plain CREATE TABLE raises
# "table EMPLOYEES already exists" the second time the notebook is executed,
# so drop any copy left behind by an earlier run first.
cur.execute('DROP TABLE IF EXISTS EMPLOYEES')

# SELECT * keeps the EmployeeIndex column from both sides of the join; the
# next cell drops 'EmployeeIndex' and 'EmployeeIndex:1' after loading.
cur.execute('''CREATE TABLE EMPLOYEES as
SELECT * from FEATURES
JOIN Predicated
ON Features.EmployeeIndex = Predicated.EmployeeIndex; ''')
con.commit()

In [None]:
# Pull the joined table into pandas and discard both copies of the join key,
# leaving only the predictor columns and the label.
df = pd.read_sql_query("SELECT * FROM Employees", con)
employee_data_df = df.drop(columns=['EmployeeIndex:1', 'EmployeeIndex'])
employee_data_df

In [None]:
# Print a summary of the modelling frame (columns, non-null counts, dtypes).
employee_data_df.info()

In [None]:
# Names of the predictor columns, in the order they appear in the data.
columns = [
    "Education",
    "JoiningYear",
    "City",
    "PaymentTier",
    "Age",
    "Gender",
    "EverBenched",
    "ExperienceInCurrentDomain",
]

# Name of the label column, kept as a one-element list.
target = ["LeaveOrNot"]

In [None]:
# Keep only the rows whose LeaveOrNot value is not the literal string
# 'LeaveOrNot'.
# NOTE(review): a later cell comment describes LeaveOrNot as 1/0; if that
# holds, this comparison is always True and no row is removed. Confirm intent.
# The cells below build X and y from employee_data_df, not from employee_df.
LeaveOrNot_mask = employee_data_df['LeaveOrNot'].ne('LeaveOrNot')
employee_df = employee_data_df[LeaveOrNot_mask]

In [None]:
# Build the model inputs.
# The string-valued columns are one-hot encoded with get_dummies(); the label
# column is then split off from the predictors.
categorical_columns = ["Education", "City", "Gender", "EverBenched"]
encoded_df = pd.get_dummies(employee_data_df, columns=categorical_columns)

# Features: every encoded column except the label
X = encoded_df.drop(columns="LeaveOrNot")

# Target: the label column
y = employee_data_df["LeaveOrNot"]

In [None]:
# Summary statistics of the encoded feature matrix.
X.describe()

In [None]:
# Check the class balance of the target: number of rows per LeaveOrNot value
# (1 = yes, 0 = no).
y.value_counts()

In [None]:
# Split into training and test sets (train_test_split is imported in the top
# import cell). The split is seeded for reproducibility and stratified on y so
# both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

# Class counts in the training split
Counter(y_train)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# Fit a BalancedRandomForestClassifier on the training split
# (the class is imported in the top import cell).
# 100 trees, seeded so repeated runs give the same model.
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split and calculate the balanced accuracy score.
# balanced_accuracy_score is already imported in the top import cell, so it is
# not re-imported here.
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Imbalance-aware classification report for the test-split predictions.
# The report is a formatted string, so it is printed rather than displayed.
report = classification_report_imbalanced(y_test, y_pred)
print(report)

In [None]:
# Pair each feature importance with its column name and list the pairs from
# most to least important.
importances = brfc.feature_importances_
ranked_features = sorted(zip(importances, X.columns), reverse=True)
ranked_features

In [None]:
# Compare the estimator's score() on the training and test splits; a large gap
# between the two points to overfitting.
train_score = brfc.score(X_train, y_train)
test_score = brfc.score(X_test, y_test)

print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')