In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
# Load the employee dataset; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv('Employee_Complete_Dataset.csv')

In [4]:
# Show the loaded frame (the notebook renders a truncated head/tail preview).
df

Unnamed: 0,Employee_number,Employee_name,Employee_age,Maritial_Status,Current_Salary,Number_of_Children,years_experience,past_projects,current_projects,Divorced_earlier,Father_alive,Mother_alive,performance_rating,Education_level,Department,Role,Job_Satisfaction,Work_Life_Balance,is_outlier
0,10001,Karen Anderson,36,True,116138,2,12,6,1,No,Yes,Yes,3,Bachelor's,R&D,Researcher,10.000000,1.936454,0
1,10002,David Taylor,34,False,82171,3,10,5,0,Yes,Yes,No,5,Diploma,HR,HR Executive,10.000000,4.077728,0
2,10003,Nina Kumar,36,False,48600,1,0,2,3,Yes,Yes,Yes,4,Diploma,Sales,Sales Manager,10.000000,3.975622,0
3,10004,John Patel,42,True,39675,1,2,4,2,Yes,Yes,Yes,3,Diploma,Engineering,Software Engineer,1.000000,3.790951,0
4,10005,Emily Sharma,27,True,161304,3,3,11,2,Yes,Yes,Yes,2,High School,R&D,Scientist,8.251833,10.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,59996,Olivia Singh,34,True,30000,1,10,8,2,No,No,Yes,3,Diploma,Support,Customer Specialist,1.620645,10.000000,0
49996,59997,Rohit Anderson,63,False,39817,0,30,5,1,Yes,Yes,Yes,2,Bachelor's,Support,Support Engineer,9.653380,1.826064,0
49997,59998,Olivia Taylor,39,True,30000,0,12,8,3,No,Yes,Yes,2,Diploma,Engineering,ML Engineer,4.690698,2.470172,0
49998,59999,Luke Brown,40,True,50384,1,7,3,2,No,Yes,Yes,3,Bachelor's,Finance,Senior Analyst,1.031014,2.462067,0


In [5]:
# List the distinct values of the column that is used as the target `y` below.
df['performance_rating'].unique()

array([3, 5, 4, 2, 1])

In [6]:
# Print per-column non-null counts and dtypes to check for missing values
# and to see which columns are numeric, boolean or object-typed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Employee_number     50000 non-null  int64  
 1   Employee_name       50000 non-null  object 
 2   Employee_age        50000 non-null  int64  
 3   Maritial_Status     50000 non-null  bool   
 4   Current_Salary      50000 non-null  int64  
 5   Number_of_Children  50000 non-null  int64  
 6   years_experience    50000 non-null  int64  
 7   past_projects       50000 non-null  int64  
 8   current_projects    50000 non-null  int64  
 9   Divorced_earlier    50000 non-null  object 
 10  Father_alive        50000 non-null  object 
 11  Mother_alive        50000 non-null  object 
 12  performance_rating  50000 non-null  int64  
 13  Education_level     50000 non-null  object 
 14  Department          50000 non-null  object 
 15  Role                50000 non-null  object 
 16  Job_

In [7]:
# Target vector: the performance rating.
y = df['performance_rating']

# Feature frame: every column except the target and the Employee_number column.
# NOTE(review): Employee_name is kept in X and is one-hot encoded by the
# categorical pipeline further down — confirm that this is intended.
X = df.drop(columns=['performance_rating', 'Employee_number'])

In [8]:
# Partition the feature columns by dtype: everything that is not object-typed
# (ints, floats and the boolean Maritial_Status column) is treated as numeric,
# and the object-typed columns are treated as categorical.
num_cols = X.select_dtypes(exclude=['object']).columns
cat_cols = X.select_dtypes(include=['object']).columns

In [9]:
# Show which columns were classified as numeric.
num_cols

Index(['Employee_age', 'Maritial_Status', 'Current_Salary',
       'Number_of_Children', 'years_experience', 'past_projects',
       'current_projects', 'Job_Satisfaction', 'Work_Life_Balance',
       'is_outlier'],
      dtype='object')

In [10]:
# Show which columns were classified as categorical.
cat_cols

Index(['Employee_name', 'Divorced_earlier', 'Father_alive', 'Mother_alive',
       'Education_level', 'Department', 'Role'],
      dtype='object')

In [11]:
# Numeric branch: replace missing values with the column mean, then
# standardise each column with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ('num_impute', SimpleImputer(strategy='mean')),
        ('scaling', StandardScaler()),
    ]
)

In [12]:
# Fit the numeric pipeline on the numeric columns alone; `fit` returns the
# pipeline itself, so the notebook displays its parameter summary.
num_pipeline.fit(X[num_cols])

0,1,2
,steps,"[('num_impute', ...), ('scaling', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [13]:
# Apply the fitted imputer + scaler to the same numeric columns.
transformed_data = num_pipeline.transform(X[num_cols])

In [14]:
# Inspect the scaled numeric array (one column per entry in num_cols).
transformed_data

array([[ 0.1348147 ,  0.90427823,  1.83803587, ...,  1.15921854,
        -0.90406485, -0.12036142],
       [-0.07336118, -1.10585433,  0.75680012, ...,  1.15921854,
        -0.15561799, -0.12036142],
       [ 0.1348147 , -1.10585433, -0.31183018, ...,  1.15921854,
        -0.19130724, -0.12036142],
       ...,
       [ 0.44707853,  0.90427823, -0.9039044 , ..., -0.62680345,
        -0.71751241, -0.12036142],
       [ 0.55116648,  0.90427823, -0.25504199, ..., -1.85790236,
        -0.7203456 , -0.12036142],
       [ 0.75934236, -1.10585433,  1.82119677, ...,  0.67872188,
        -0.56294687, -0.12036142]], shape=(50000, 10))

In [15]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
# handle_unknown='ignore' makes the encoder emit an all-zero block for a
# category that was not present at fit time instead of raising, so the
# pipeline can be applied to rows it was not fitted on. Output for the
# fitted data is unchanged.
cat_pipeline = Pipeline(steps=[
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

In [16]:
# Fit the categorical pipeline on the object-typed columns alone; the
# returned pipeline is displayed as a parameter summary.
cat_pipeline.fit(X[cat_cols])

0,1,2
,steps,"[('cat_impute', ...), ('encoding', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [17]:
# One-hot encode the categorical columns with the fitted pipeline.
cat_transformed_data = cat_pipeline.transform(X[cat_cols])

In [18]:
# Wrap the encoded array in a DataFrame for a readable preview; columns are
# positional integers because no feature names are passed.
pd.DataFrame(cat_transformed_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,403,404,405,406,407,408,409,410,411,412
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
49997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [19]:
# Combine both branches: numeric columns go through num_pipeline and
# object-typed columns through cat_pipeline; the outputs are concatenated
# column-wise in the order listed here.
transformer = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ]
)

In [20]:
# Fit both branches of the ColumnTransformer on the full feature frame.
# NOTE(review): this fits on all of X; if a train/test split is introduced
# later (train_test_split is imported), fit on the training portion only so
# imputation and scaling statistics do not leak from the test rows — confirm.
transformer.fit(X)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [21]:
# Apply the fitted ColumnTransformer to X.
# NOTE: `Transformed_data` (capital T) is a different variable from the
# lowercase `transformed_data` that holds only the scaled numeric columns.
Transformed_data = transformer.transform(X)

In [22]:
# Preview the combined feature matrix: the scaled numeric columns come first,
# followed by the one-hot encoded categorical columns.
pd.DataFrame(Transformed_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,413,414,415,416,417,418,419,420,421,422
0,0.134815,0.904278,1.838036,0.716944,0.428397,-0.234347,-0.449105,1.159219,-0.904065,-0.120361,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.073361,-1.105854,0.756800,1.608266,0.196725,-0.465541,-1.345629,1.159219,-0.155618,-0.120361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.134815,-1.105854,-0.311830,-0.174378,-0.961638,-1.159123,1.343944,1.159219,-0.191307,-0.120361,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.759342,0.904278,-0.595930,-0.174378,-0.729966,-0.696735,0.447419,-1.868335,-0.255856,-0.120361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.801977,0.904278,3.275758,1.608266,-0.614129,0.921622,0.447419,0.571144,1.914414,-0.120361,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-0.073361,0.904278,-0.903904,-0.174378,0.196725,0.228040,0.447419,-1.659553,1.914414,-0.120361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,2.945189,-1.105854,-0.591410,-1.065700,2.513451,-0.465541,-0.449105,1.042617,-0.942650,-0.120361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
49997,0.447079,0.904278,-0.903904,-1.065700,0.428397,0.228040,1.343944,-0.626803,-0.717512,-0.120361,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,0.551166,0.904278,-0.255042,-0.174378,-0.150784,-0.927929,0.447419,-1.857902,-0.720346,-0.120361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
