# Dataset Description

In [None]:
"""

The dataset contains information about employees in a company. Here is a description of each column:

Age: The age of the employee (integer).
Attrition: Whether the employee has left the company or not (object).
BusinessTravel: Frequency of business travel (object).
DailyRate: The daily rate of pay for the employee (integer).
Department: Department in which the employee works (object).
DistanceFromHome: Distance from home to work in miles (integer).
Education: Level of education of the employee (integer).
EducationField: Field of education of the employee (object).
EmployeeCount: Number of employees (always 1) (integer).
EmployeeNumber: Unique identifier for each employee (integer).
EnvironmentSatisfaction: Satisfaction level with the work environment (integer).
Gender: Gender of the employee (object).
HourlyRate: Hourly rate of pay for the employee (integer).
JobInvolvement: Level of job involvement (integer).
JobLevel: Level of job within the company (integer).
JobRole: Role of the employee in the company (object).
JobSatisfaction: Satisfaction level with the job (integer).
MaritalStatus: Marital status of the employee (object).
MonthlyIncome: Monthly income of the employee (integer).
MonthlyRate: Monthly rate of pay for the employee (integer).
NumCompaniesWorked: Number of companies the employee has worked for (integer).
Over18: Whether the employee is over 18 years old (object).
OverTime: Whether the employee works overtime or not (object).
PercentSalaryHike: Percentage increase in salary (integer).
PerformanceRating: Performance rating of the employee (integer).
RelationshipSatisfaction: Satisfaction level with work relationships (integer).
StandardHours: Standard number of working hours (always 80) (integer).
StockOptionLevel: Level of stock option (integer).
TotalWorkingYears: Total number of years worked (integer).
TrainingTimesLastYear: Number of training sessions attended last year (integer).
WorkLifeBalance: Level of work-life balance (integer).
YearsAtCompany: Number of years spent at the company (integer).
YearsInCurrentRole: Number of years in the current role (integer).
YearsSinceLastPromotion: Number of years since the last promotion (integer).
YearsWithCurrManager: Number of years with the current manager (integer)"""

# Import Needed Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report , accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

# Import Data

In [None]:
# Load the employee dataset; "data.csv" is resolved relative to the working directory
df= pd.read_csv("data.csv")

In [None]:
# Preview the first 10 rows
df.head(10)

# Exploratory Data Analysis (EDA)

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Summary (count, unique, top, freq) for the object-dtype columns
df.describe(include='object')

In [None]:
# Column names as a plain Python list
df.columns.tolist()

In [None]:
# Data type (dtype) of each column
df.dtypes

In [None]:
# Count of null (missing) values in each column
df.isnull().sum()

No missing values

In [None]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

In [None]:
# Column labels split by dtype: `num` holds the numeric columns,
# `cat` the object-dtype (categorical) ones.
# NOTE: both are taken from the frame as it is at this point in the notebook.
num = df.select_dtypes(include=['number']).columns
cat = df.select_dtypes(include=['object']).columns


print(cat)
print(num)

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Print the distinct values found in each categorical column
for column in cat:
    distinct_values = set(df[column])
    print("unique values of:", column, distinct_values)

In [None]:
# Remove columns that are not useful as features:
#   - "EmployeeCount", "StandardHours" and "Over18" were noted by the author
#     as holding the same value for every employee
#   - "EmployeeNumber" is removed as well (the dataset description lists it
#     as a unique per-employee identifier)
uninformative_columns = ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber']
df = df.drop(columns=uninformative_columns)

# target column

In [None]:
# Distinct labels present in the target column
df['Attrition'].unique()

In [None]:
# Number of rows for each label of the target column
df['Attrition'].value_counts()

In [None]:
# Bar chart of how many rows fall under each Attrition label
sns.countplot(data=df, x='Attrition')
plt.title('Attrition values');

In [None]:
# Encode the target as integers: "Yes" -> 0 and "No" -> 1.
# NOTE: any value other than "Yes"/"No" becomes NaN, so running this cell a
# second time on the already-encoded column turns every entry into NaN.
attrition_codes = {"Yes": 0, "No": 1}
df["Attrition"] = df["Attrition"].map(attrition_codes)

In [None]:
# Object-dtype (categorical) columns currently in the frame, used for the plots below
cat_cols = df.select_dtypes(include=['object']).columns

cat_cols

In [None]:
def cat_summary(df, col_name, plot=False):
    """Print the frequency table of a column and optionally plot it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Column to summarise.
    plot : bool, default False
        When True, also draw a count plot of the column.

    Prints a table with the count of each distinct value and its share of
    all rows in percent (column "Ratio"). Returns None.
    """
    # Count once and reuse for both the absolute and the relative column
    counts = df[col_name].value_counts()
    print(pd.DataFrame({col_name: counts,
                        "Ratio": 100 * counts / len(df)}))

    if plot:
        # Open a fresh figure per chart so every plot gets the same size;
        # a single figure created once outside the function sizes only the
        # first chart drawn.
        plt.figure(figsize=(7, 5))
        ax = sns.countplot(x=df[col_name], data=df)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        plt.show(block=True)

# Summarise and plot every categorical column.
# `cat_cols` is built from object-dtype columns only, so a column can never
# have dtype "bool" here and no bool-to-int conversion is needed.
for col in cat_cols:
    cat_summary(df, col, plot=True)
    print("##########################################")

# select columns which are categorical except Attrition to graph VS Attrition

In [None]:
# Categorical (object-dtype) feature columns, with the target left out
features_only = df.drop(columns='Attrition')
cat_cols = features_only.select_dtypes(include=['object']).columns
cat_cols

In [None]:
# For each categorical feature, draw the Attrition split and label every bar
# with its share of ALL rows (not of its own category).
row_total = len(df['Attrition'])
for col in cat_cols:
    sns.countplot(x=col, hue='Attrition', data=df)
    plt.title(f"Attrition distribution by {col}")

    axes = plt.gca()
    for bar in axes.patches:
        bar_height = bar.get_height()
        label = f'{100 * bar_height / row_total:.1f}%'
        bar_center = bar.get_x() + bar.get_width() / 2
        # Place the label at the top centre of the bar
        axes.annotate(label, (bar_center, bar_height), ha='center')

    plt.show()

# look at the averages of numeric variables for the Target variable

In [None]:
def target_summary_with_num(df, target, numerical_col):
    """Print the mean of ``numerical_col`` for each distinct value of ``target``.

    The table is followed by blank lines so consecutive summaries stay
    visually separated. Returns None.
    """
    group_means = df.groupby(target).agg({numerical_col: "mean"})
    print(group_means, end="\n\n\n")


# Numeric feature columns of the frame as it is now. The target is left out
# because it is the grouping key; errors='ignore' keeps this working even if
# "Attrition" is not among the numeric columns.
num_cols = df.select_dtypes(include=['number']).columns.drop('Attrition', errors='ignore')

for col in num_cols:
    target_summary_with_num(df, "Attrition", col)

In [None]:
# Head-count per age, split by Attrition
plt.figure(figsize=(12,6))
sns.countplot(x="Age", hue='Attrition', data=df)
plt.title("Attrition distribution by Age")
plt.show()

In [None]:
# Monthly income against age, one panel per gender, coloured by Attrition.
# relplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it (that would only leave an empty extra figure).
sns.relplot(data=df,y='MonthlyIncome',x='Age',hue='Attrition',col='Gender')
plt.show()

In [None]:
# Monthly income per marital status, one panel per gender, coloured by Attrition.
# relplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it (that would only leave an empty extra figure).
sns.relplot(data=df,y='MonthlyIncome',x='MaritalStatus',hue='Attrition',col='Gender')
plt.show()

In [None]:
# Monthly income per department, one panel per gender, coloured by Attrition.
# relplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it (that would only leave an empty extra figure).
sns.relplot(data=df,y='MonthlyIncome',x='Department',hue='Attrition',col='Gender')
plt.show()

In [None]:
# Head-count per commuting distance, split by Attrition
plt.figure(figsize=(12,6))
sns.countplot(x="DistanceFromHome", hue='Attrition', data=df)
plt.title("Attrition distribution by DistanceFromHome")
plt.show()

In [None]:
# Encode every categorical feature column as integer codes
from sklearn.preprocessing import LabelEncoder

df = df.copy()

# Keep one fitted encoder per column: a single shared encoder is refitted on
# every iteration, which loses the category <-> code mapping of all columns
# but the last. With the dict, codes can be decoded later via
# label_encoders[col].inverse_transform(...) or inspected via .classes_.
label_encoders = {}

for col in cat_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# Split dataframe into X and y

In [None]:
# Target vector: the Attrition column
y = df['Attrition']
# Feature matrix: every remaining column
X = df.drop(columns=['Attrition'])

# split dataframe into train and test

In [None]:
# Hold out 25% of the rows for testing. The split is seeded so it is
# reproducible, and stratified on the target so the train and test parts keep
# the same Attrition class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Random Forest Classifier 

In [None]:
# Fit a 100-tree random forest; the seed makes the fit repeatable,
# matching the seeded train/test split.
RF_model = RandomForestClassifier(n_estimators=100, random_state=42)
RF_model.fit(X_train , y_train)
# Report the training accuracy too (it was previously computed and discarded),
# so overfitting is visible next to the test score.
print( 'train accuracy score: ' ,RF_model.score(X_train , y_train))
RF_pred = RF_model.predict(X_test)
print( 'accuracy score: ' ,accuracy_score(y_test , RF_pred))

In [None]:
# Per-class precision / recall / F1 on the test set.
# The target was encoded as "Yes" -> 0 and "No" -> 1, so the classes are
# named explicitly to keep the report readable.
print(classification_report(y_test , RF_pred , labels=[0, 1] ,
                            target_names=['Attrition: Yes', 'Attrition: No']))

In [None]:
# Confusion matrix of the test predictions: rows are the true classes,
# columns the predicted ones.
cm1 = confusion_matrix(y_test,RF_pred)
plt.figure(figsize=(10,7))
# fmt='d' prints the cell counts as plain integers; the default format
# switches to scientific notation for larger counts.
sns.heatmap(cm1,annot=True,fmt='d')
plt.xlabel('predicted')
plt.ylabel('Truth')