## Step 1. Imports

*   Importing required packages
*   Loading dataset



In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Packages for data modeling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder

# Packages for model evaluation
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,\
confusion_matrix,ConfusionMatrixDisplay,classification_report,roc_auc_score
from sklearn.tree import plot_tree

# for saving models
import pickle

### Loading dataset

In [13]:
# Loading dataset into a dataframe named data
data = pd.read_csv("https://raw.githubusercontent.com/MFaisalQureshi/Hr-Analytics-Job-Prediction/main/HR_comma_sep.csv")

# to display data
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


## Step 2. Data Exploration (EDA)


- Understanding data variables
- Cleaning dataset (missing data, redundant data, outliers)


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [15]:
# Renaming columns as needed
data = data.rename(columns = {'average_montly_hours': 'average_monthly_hours',
                            'time_spend_company': 'tenure',
                            'Work_accident': 'work_accident',
                            'Department': 'department'})

# Displaying columns
data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'tenure', 'work_accident', 'left',
       'promotion_last_5years', 'department', 'salary'],
      dtype='object')

### Checking Missing, duplicate values in dataset

In [16]:
print(data.isna().sum())
data.dropna(inplace=True)

# duplicates
print("no of duplicates-",data.duplicated().sum())
data.drop_duplicates(inplace=True,keep="first")
data.info()

satisfaction_level       0
last_evaluation          0
number_project           0
average_monthly_hours    0
tenure                   0
work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64
no of duplicates- 3008
<class 'pandas.core.frame.DataFrame'>
Index: 11991 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     11991 non-null  float64
 1   last_evaluation        11991 non-null  float64
 2   number_project         11991 non-null  int64  
 3   average_monthly_hours  11991 non-null  int64  
 4   tenure                 11991 non-null  int64  
 5   work_accident          11991 non-null  int64  
 6   left                   11991 non-null  int64  
 7   promotion_last_5years  11991 non-null  int64  
 8   department             11991 non-null  object 
 9   salary       

### Removing Outliers

In [17]:
# Removing outliers from 'tenure' and saving it in new variable
Q1=data["tenure"].quantile(0.25)
Q3=data["tenure"].quantile(0.75)
IQR = Q3-Q1
lower_limit = Q1-1.5*IQR
upper_limit = Q3+1.5*IQR

data_cleaned = data[(data["tenure"]>=lower_limit) & (data["tenure"]<=upper_limit)]
data_cleaned.reset_index(inplace=True,drop=True)


### Encoding the data

In [18]:

# convert salary cat to numerical using labelencoder
print(data_cleaned["salary"].value_counts())
encoder = OrdinalEncoder(categories=[['low', 'medium', 'high']])
data_cleaned['salary'] = encoder.fit_transform(data_cleaned[['salary']])
data_cleaned["salary"]=data_cleaned["salary"]+1
data_cleaned["salary"]=data_cleaned["salary"].astype(int)
print(data_cleaned['salary'].value_counts())
data_cleaned

salary
low       5429
medium    4854
high       884
Name: count, dtype: int64
salary
1    5429
2    4854
3     884
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['salary'] = encoder.fit_transform(data_cleaned[['salary']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned["salary"]=data_cleaned["salary"]+1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned["salary"]=data_cleaned["salary"].astype(int)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,tenure,work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,1
1,0.11,0.88,7,272,4,0,1,0,sales,2
2,0.72,0.87,5,223,5,0,1,0,sales,1
3,0.37,0.52,2,159,3,0,1,0,sales,1
4,0.41,0.50,2,153,3,0,1,0,sales,1
...,...,...,...,...,...,...,...,...,...,...
11162,0.72,0.64,4,192,3,0,0,0,sales,2
11163,0.48,0.50,5,142,4,0,0,0,IT,2
11164,0.19,0.79,4,229,4,0,0,0,product_mng,2
11165,0.62,0.85,3,237,3,1,0,0,IT,2


In [19]:
# using onehotencoder
print(data_cleaned["department"].value_counts())
ohe_encoder = OneHotEncoder(sparse_output=False)
encoded_data = ohe_encoder.fit_transform(data_cleaned[["department"]])

df_ohe_data = pd.DataFrame(encoded_data,columns= ohe_encoder.get_feature_names_out(['department']))
df_ohe_data.head()


department
sales          2997
technical      2114
support        1721
IT              915
RandD           649
product_mng     641
marketing       617
accounting      583
hr              570
management      360
Name: count, dtype: int64


Unnamed: 0,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [20]:
data_2 = data_cleaned.drop(columns=["department"])
data_2.reset_index(drop=True, inplace=True)
df_ohe_data.reset_index(drop=True, inplace=True)
data_encoded = pd.concat([data_2,df_ohe_data],axis=1)
data_encoded.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,tenure,work_accident,left,promotion_last_5years,salary,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical
0,0.38,0.53,2,157,3,0,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.11,0.88,7,272,4,0,1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.72,0.87,5,223,5,0,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.37,0.52,2,159,3,0,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.41,0.5,2,153,3,0,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [21]:
# To save encoders to pickle file 
with open("Ordinal_encoder_salary.pkl","wb") as file:
    pickle.dump(encoder,file)

with open("ohe_encoder.pkl",'wb') as file:
    pickle.dump(ohe_encoder,file)

### Splitting the data and scaling the data

In [22]:
# Isolating target variable
Y = data_encoded["left"]

# Selecting the required features
X = data_encoded.drop("left",axis=1)

# Split the data into training set and testing set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, stratify=Y, random_state=42)


# scaling the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

with open("data_scaler.pkl",'wb') as file:
    pickle.dump(scaler,file)



### RANDOM FOREST MODEL

In [23]:
# Random forest model
rf= RandomForestClassifier(random_state=42)

# Dictionary of hyperparameters to search over
cv_params = {'max_depth': [3,5, None], 
             'max_features': [1.0],
             'max_samples': [0.7, 1.0],
             'min_samples_leaf': [1,2,3],
             'min_samples_split': [2,3,4],
             'n_estimators': [300, 500],
             }  

# Scoring metrics to capture
scoring = ('accuracy','precision','recall','f1','roc_auc')

# Instantiate GridSearch
rf1 = GridSearchCV(rf, cv_params, scoring=scoring, cv=4, refit='roc_auc')

In [24]:
%%time
rf1.fit(X_train, Y_train)

CPU times: total: 18min 30s
Wall time: 18min 38s


In [32]:
# saving rf model
with open("rfmodel.pkl",'wb') as file:
    pickle.dump(rf1,file)