<a href="https://colab.research.google.com/github/guangyitan/Udemy-Unsupervised-Machine-Learning-with-2-Capstone-ML-Projects/blob/main/Dimensionality%20Reduction/Implementing_RFE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# mount to google drive
from google.colab import drive 
drive.mount('/content/MyDrive')
project_path = '/content/MyDrive/My Drive/Colab Notebooks/Udemy/Dimensionality Reduction/'

#read the excel file
data = pd.read_csv(project_path + "Employee_Data.csv")
data.shape

Drive already mounted at /content/MyDrive; to attempt to forcibly remount, call drive.mount("/content/MyDrive", force_remount=True).


(1470, 35)

In [3]:
# Visualise the dataset

# set num of columns to display
pd.set_option('max_columns', 35)
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


# Removing Usless Columns

In [4]:
# look for useless columns

print("Number of Records in the Dataset: ", data.shape[0])
print("Number of Unique Values in EmployeeCount Column: ", data['EmployeeCount'].nunique())
print("Number of Unique Values in EmployeeNumber Column: ", data['EmployeeNumber'].nunique())
print("Number of Unique Values in Over18 Column: ", data['Over18'].nunique())
print("Number of Unique Values in StandardHours Column: ", data['StandardHours'].nunique())


Number of Records in the Dataset:  1470
Number of Unique Values in EmployeeCount Column:  1
Number of Unique Values in EmployeeNumber Column:  1470
Number of Unique Values in Over18 Column:  1
Number of Unique Values in StandardHours Column:  1


For EmployeeNumber Column, each row has unique values in all records.

For EmployeeCount, Over18 and StandardHours Columns, they have only one unique value in all records.

These columns are useless as they do not have any trends or patterns which can be used for predictive analytics.

In [5]:
# remove the useless columns from the dataset

data = data.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis = 1)
print("Number of Columns left: ", data.shape[1])

Number of Columns left:  31


In [6]:
# check for Missing values in the dataset
data.isnull().sum().sum()

0

There are no null/ missing values in the dataset

In [7]:
# check for columns with Object Data Type
data.select_dtypes('object').head()

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Yes,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Yes
1,No,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,No
2,Yes,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Yes
3,No,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Yes
4,No,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,No


# Encode Object Type Data into Numerical Type Data

In [8]:
# Encode the Object Type data into Numerical Data Type
# Attrition Column is excluded as it is our Target Column
# For Attrition Column, if 'Yes', employee will leave the organization in near future
# For Attrition Column, if 'NO', employee will not leave the organization in near future

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['BusinessTravel'] = le.fit_transform(data['BusinessTravel'])
data['Department'] = le.fit_transform(data['Department'])
data['EducationField'] = le.fit_transform(data['EducationField'])
data['Gender'] = le.fit_transform(data['Gender'])
data['JobRole'] = le.fit_transform(data['JobRole'])
data['MaritalStatus'] = le.fit_transform(data['MaritalStatus'])
data['OverTime'] = le.fit_transform(data['OverTime'])

# double check to ensure all object data type are converted
data.select_dtypes('object').columns

Index(['Attrition'], dtype='object')

In [9]:
# Split dataset into x and y

# axis = 0 means along row
# axis = 1 means along column
x = data.drop(['Attrition'], axis = 1)
y = pd.DataFrame(data['Attrition'], columns = ['Attrition'])

print("shape of x", x.shape)
print("shape of y", y.shape)

shape of x (1470, 30)
shape of y (1470, 1)


# Balancing the dataset using 'SMOTEENN' sampling technique

In [10]:
# check if the dataset is balance or not 
# check on the target column which is 'Attrition' Column

data['Attrition'].value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

From above we can see that the dataset has very large imbalance for **No** and **Yes** records

In [11]:
# balance the dataset distribution using 'SMOTEENN' Sampling Technique

from imblearn.combine import SMOTEENN

# defining the model
sn = SMOTEENN(random_state = 0)

x_temp = x.copy()
# train the model
sn.fit(x, y)

# Making the samples
x, y = sn.fit_sample(x, y)

#convert y from ndarray to dataframe
y = pd.DataFrame(y, columns = ['Attrition'])
x = pd.DataFrame(x)
x.columns = x_temp.columns

# Distribution of the dataset
y.value_counts()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Attrition
Yes          838
No           640
dtype: int64

In [12]:
x_temp.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,2,1102,2,1,2,1,2,0,94,3,2,7,4,2,5993,19479,8,1,11,3,1,0,8,0,1,6,4,0,5
1,49,1,279,1,8,1,1,3,1,61,2,2,6,2,1,5130,24907,1,0,23,4,4,1,10,3,3,10,7,1,7
2,37,2,1373,1,2,2,4,4,1,92,2,1,2,3,2,2090,2396,6,1,15,3,2,0,7,3,3,0,0,0,0
3,33,1,1392,1,3,4,1,4,0,56,3,1,6,3,1,2909,23159,1,1,11,3,3,0,8,3,3,8,7,3,0
4,27,2,591,1,2,1,3,1,1,40,3,1,2,2,1,3468,16632,9,0,12,3,4,1,6,3,3,2,2,2,2


# Splitting the Dataset into Testing and Training

In [13]:
# splitting the dataset

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

print("Shape x_train: ", x_train.shape)
print("Shape x_test: ", x_test.shape)
print("Shape y_train: ", y_train.shape)
print("Shape y_test: ", y_test.shape)

Shape x_train:  (1182, 30)
Shape x_test:  (296, 30)
Shape y_train:  (1182, 1)
Shape y_test:  (296, 1)


# Recursive Feature Elimination(RFE)

Recursivly eliminate features from the dataset to reduce time and space complexity

In [14]:
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

# define RFECV model
# step is the number of features to remove at each iteration
rfecv = RFECV(estimator = model, step = 1, cv = 5, scoring = 'accuracy')
rfecv = rfecv.fit(x, y)

print('Optimal number of features: ', rfecv.n_features_)
print('Best features: ', x_train.columns[rfecv.support_])

  y = column_or_1d(y, warn=True)


Optimal number of features:  3
Best features:  Index(['JobLevel', 'MonthlyIncome', 'MonthlyRate'], dtype='object')


In [15]:
# define new dataset with features selected using RFECV model

x_train_rfecv = x_train[['JobLevel', 'MonthlyIncome', 'MonthlyRate']]
x_test_rfecv = x_test[['JobLevel', 'MonthlyIncome', 'MonthlyRate']]
y_train_rfecv = y_train['Attrition']
y_test_rfecv = y_test['Attrition']

print("Shape x_train_rfecv: ", x_train_rfecv.shape)
print("Shape x_test_rfecv: ", x_test_rfecv.shape)
print("Shape y_train_rfecv: ", y_train_rfecv.shape)
print("Shape y_test_rfecv: ", y_test_rfecv.shape)

Shape x_train_rfecv:  (1182, 3)
Shape x_test_rfecv:  (296, 3)
Shape y_train_rfecv:  (1182,)
Shape y_test_rfecv:  (296,)


# Decision Tree Classification Model

In [16]:
# Train a Decision Tree Classification Model

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

model = DecisionTreeClassifier()
model.fit(x_train_rfecv, y_train_rfecv)
y_pred_rfecv = model.predict(x_test_rfecv)

cr_rfecv = classification_report(y_pred_rfecv, y_test_rfecv)
print(cr_rfecv)

              precision    recall  f1-score   support

          No       0.84      0.90      0.87       129
         Yes       0.92      0.87      0.89       167

    accuracy                           0.88       296
   macro avg       0.88      0.88      0.88       296
weighted avg       0.88      0.88      0.88       296



From the result above, we can see that the model is performing well despite using only 3 features from the dataset.