In [1]:
# Import dependencies

import pandas as pd
import tensorflow as tf
import matplotlib as plt
import sklearn as skl

import sqlite3

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [2]:
# Import and read the csv
# The file is read from a path relative to the notebook's working directory.
# The bare `employees_df` on the last line makes the notebook render the frame.

employees_df = pd.read_csv("Resources/Employee.csv")
employees_df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1
...,...,...,...,...,...,...,...,...,...
4648,Bachelors,2013,Bangalore,3,26,Female,No,4,0
4649,Masters,2013,Pune,2,37,Male,No,2,1
4650,Masters,2018,New Delhi,3,27,Male,No,5,1
4651,Bachelors,2012,Bangalore,3,30,Male,Yes,2,0


In [None]:
# Drop the non-beneficial columns: the dataset is already clean, so no columns are removed

# employees_df = employees_df.drop([], axis=1)
# employees_df.head

In [3]:
# Determine the number of unique values in each column.
# `nunique()` yields one distinct-value count per column; the result is kept
# in `count` and displayed as the cell output.

count = employees_df.nunique()
count

Education                     3
JoiningYear                   7
City                          3
PaymentTier                   3
Age                          20
Gender                        2
EverBenched                   2
ExperienceInCurrentDomain     8
LeaveOrNot                    2
dtype: int64

In [4]:
# Tally how many employees share each age, to judge whether binning is needed.

Age = employees_df["Age"].value_counts()
Age

# Decision: Age will not be binned, as it might be a critical feature.

26    645
28    630
27    625
25    418
24    385
29    230
30    220
37    141
36    139
34    136
38    136
40    134
32    132
39    131
31    125
33    124
35    123
41     82
22     49
23     48
Name: Age, dtype: int64

In [7]:
# Build the list of categorical columns: every column whose dtype is "object".
employees_cat = [
    column
    for column, dtype in employees_df.dtypes.items()
    if dtype == "object"
]
employees_cat

['Education', 'City', 'Gender', 'EverBenched']

In [11]:
# Encode variables: ['Education', 'City', 'Gender', 'EverBenched']
#
# One-hot encode the columns listed in `employees_cat`.
# The encoder is built with default options and its sparse result is densified
# with `.toarray()` instead of passing `sparse=False`: that keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (renamed `sparse_output`),
# so this form runs unchanged on both older and current releases.

enc = OneHotEncoder()
employees_encoded_df = pd.DataFrame(
    enc.fit_transform(employees_df[employees_cat]).toarray(),
    # Name the columns "<feature>_<category>" straight from the fitted encoder.
    columns=enc.get_feature_names_out(employees_cat),
    # Reuse the source index so the index-based merge that follows lines up
    # row for row even if `employees_df` does not have a default RangeIndex.
    index=employees_df.index,
)
employees_encoded_df.head()

Unnamed: 0,Education_Bachelors,Education_Masters,Education_PHD,City_Bangalore,City_New Delhi,City_Pune,Gender_Female,Gender_Male,EverBenched_No,EverBenched_Yes
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [12]:
# Merge one-hot encoded features and drop the originals
#
# The encoded columns are attached by row index, then the original categorical
# columns named in `employees_cat` are removed.
# `drop(columns=...)` replaces the positional form `drop(employees_cat, 1)`:
# passing `axis` positionally was deprecated in pandas 1.3 (it emitted a
# FutureWarning) and raises a TypeError from pandas 2.0 onward.
# NOTE: this cell reassigns `employees_df`, so it must be run exactly once
# after the encoding cell; re-running it requires reloading the data first.

employees_df = (
    employees_df
    .merge(employees_encoded_df, left_index=True, right_index=True)
    .drop(columns=employees_cat)
)
employees_df.head()

  after removing the cwd from sys.path.


Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,LeaveOrNot,Education_Bachelors,Education_Masters,Education_PHD,City_Bangalore,City_New Delhi,City_Pune,Gender_Female,Gender_Male,EverBenched_No,EverBenched_Yes
0,2017,3,34,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,2013,1,28,3,1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,2014,3,38,2,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,2016,3,27,5,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2017,3,24,2,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [15]:
# Save the encoded dataset next to the source data for later steps.
# No `index=` argument is passed, so pandas' default index handling applies.
employees_df.to_csv("Resources/employees_df.csv")