In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load the raw employee dataset and preview its first five rows
df = pd.read_csv(filepath_or_buffer='Employee.csv')
df.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [3]:
# Count missing values per column
df.isna().sum()

Education                    0
JoiningYear                  0
City                         0
PaymentTier                  0
Age                          0
Gender                       0
EverBenched                  0
ExperienceInCurrentDomain    0
LeaveOrNot                   0
dtype: int64

In [4]:
# Inspect each column's dtype (object columns hold the string-valued features)
df.dtypes

Education                    object
JoiningYear                   int64
City                         object
PaymentTier                   int64
Age                           int64
Gender                       object
EverBenched                  object
ExperienceInCurrentDomain     int64
LeaveOrNot                    int64
dtype: object

In [5]:
# Split the frame into the target column (y) and the remaining feature columns (X)
y = df['LeaveOrNot']
X = df.drop('LeaveOrNot', axis=1)

In [6]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, every other column as numerical.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()
print(numerical_cols)
print(categorical_cols)

['JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain']
['Education', 'City', 'Gender', 'EverBenched']


In [7]:
# Show the category frequencies of every categorical feature
for col_name in categorical_cols:
    display(X[col_name].value_counts())

Education
Bachelors    3601
Masters       873
PHD           179
Name: count, dtype: int64

City
Bangalore    2228
Pune         1268
New Delhi    1157
Name: count, dtype: int64

Gender
Male      2778
Female    1875
Name: count, dtype: int64

EverBenched
No     4175
Yes     478
Name: count, dtype: int64

In [8]:
# Preprocessing sub-pipelines, one per feature type.
# The imputers are kept even though this dataset has no missing values, so the
# same pipeline still works if nulls show up in future (real-world) data.

# Numerical features: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent category, then one-hot
# encode. drop='if_binary' keeps a single column for two-category features.
# NOTE(review): with handle_unknown='ignore', an unseen category is presumably
# encoded as all zeros, which for a binary feature would look the same as the
# dropped category -- confirm this is acceptable for downstream models.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='if_binary', handle_unknown='ignore')),
])

In [9]:
# Combine transformers into a preprocessor using ColumnTransformer.
# Output column order follows the list below: numerical columns first, then
# the one-hot encoded categorical columns.
# sparse_threshold=0 forces a dense ndarray result. With the default (0.3) the
# stacked output silently becomes a scipy sparse matrix once the one-hot part
# makes the overall density low enough, which would break code that wraps the
# result in pd.DataFrame(..., columns=...).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    sparse_threshold=0,
)

In [10]:
# Total number of rows in the dataset
df.shape[0]

4653

In [11]:
# Share of each target class in the full dataset
df['LeaveOrNot'].value_counts().div(len(df))

LeaveOrNot
0    0.656136
1    0.343864
Name: count, dtype: float64

In [12]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1234,
)

# Wrap the preprocessor in a single-step pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [13]:
# Class shares in the training split (should mirror the full dataset)
y_train.value_counts().div(len(y_train))

LeaveOrNot
0    0.656099
1    0.343901
Name: count, dtype: float64

In [14]:
# Class shares in the test split (should mirror the full dataset)
y_test.value_counts().div(len(y_test))

LeaveOrNot
0    0.656284
1    0.343716
Name: count, dtype: float64

In [15]:
# Fit the preprocessing pipeline on the training split only and preview the
# transformed array. This call also leaves `pipeline` in a fitted state.
pipeline.fit_transform(X_train)

array([[-0.57507408, -3.03861582,  1.5751245 , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.03952918,  0.5313741 , -1.1205206 , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.03952918,  0.5313741 , -0.29109134, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.57507408,  0.5313741 ,  0.95305255, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.50132809,  0.5313741 ,  1.36776718, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.50132809,  0.5313741 , -1.53523523, ...,  0.        ,
         1.        ,  0.        ]])

In [16]:
# Names of the one-hot encoded columns, read from the fitted encoder.
# Requires `pipeline` to have been fitted already.
cat_cols = list(
    pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_cols)
)

In [17]:
# Fit the pipeline on the training split and wrap the result in a DataFrame
# that keeps the original row labels.
X_train_scaled = pd.DataFrame(
    pipeline.fit_transform(X_train),
    index=X_train.index,
    columns=numerical_cols + cat_cols,
)
# Apply the already-fitted pipeline to the test split (no refitting)
X_test_scaled = pd.DataFrame(
    pipeline.transform(X_test),
    index=X_test.index,
    columns=numerical_cols + cat_cols,
)

In [18]:
# Work on independent copies so the scaled feature frames stay untouched
train_data = X_train_scaled.copy(deep=True)
test_data = X_test_scaled.copy(deep=True)

In [19]:
# First five rows of the untransformed training features
X_train.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
3165,Bachelors,2014,Bangalore,1,37,Male,No,5
758,Masters,2017,Pune,3,24,Male,No,2
444,Bachelors,2017,Bangalore,3,28,Male,No,1
2319,Bachelors,2015,Pune,2,27,Female,No,5
3670,Bachelors,2015,Pune,1,31,Female,No,3


In [20]:
# First five training labels, selected by position
y_train.iloc[:5]

3165    0
758     1
444     0
2319    1
3670    1
Name: LeaveOrNot, dtype: int64

In [21]:
# First five rows of the untransformed test features and their labels
display(X_test.head(), y_test.iloc[:5])

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
4448,Bachelors,2016,Bangalore,3,35,Male,No,3
3677,Bachelors,2014,Bangalore,1,34,Female,No,5
4390,Masters,2017,New Delhi,2,26,Female,No,4
1984,Bachelors,2014,Pune,2,27,Female,No,5
3086,Bachelors,2016,Bangalore,3,32,Male,No,1


4448    0
3677    0
4390    0
1984    1
3086    0
Name: LeaveOrNot, dtype: int64

In [22]:
# Attach the labels as a 'Target' column; values are matched on the row index
train_data = train_data.assign(Target=y_train)
test_data = test_data.assign(Target=y_test)

In [23]:
# Preview the processed training set, including the target column
train_data.head(n=5)

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,Education_Bachelors,Education_Masters,Education_PHD,City_Bangalore,City_New Delhi,City_Pune,Gender_Male,EverBenched_Yes,Target
3165,-0.575074,-3.038616,1.575124,1.345546,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
758,1.039529,0.531374,-1.120521,-0.574255,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1
444,1.039529,0.531374,-0.291091,-1.214189,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
2319,-0.036873,-1.253621,-0.498449,1.345546,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3670,-0.036873,-3.038616,0.330981,0.065678,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1


In [24]:
# Confirm that no missing values were introduced in the training set
train_data.isna().sum()

JoiningYear                  0
PaymentTier                  0
Age                          0
ExperienceInCurrentDomain    0
Education_Bachelors          0
Education_Masters            0
Education_PHD                0
City_Bangalore               0
City_New Delhi               0
City_Pune                    0
Gender_Male                  0
EverBenched_Yes              0
Target                       0
dtype: int64

In [25]:
# Preview the processed test set, including the target column
test_data.head(n=5)

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,Education_Bachelors,Education_Masters,Education_PHD,City_Bangalore,City_New Delhi,City_Pune,Gender_Male,EverBenched_Yes,Target
4448,0.501328,0.531374,1.16041,0.065678,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
3677,-0.575074,-3.038616,0.953053,1.345546,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
4390,1.039529,-1.253621,-0.705806,0.705612,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1984,-0.575074,-1.253621,-0.498449,1.345546,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3086,0.501328,0.531374,0.538338,-1.214189,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0


In [26]:
# Confirm that no missing values were introduced in the test set
test_data.isna().sum()

JoiningYear                  0
PaymentTier                  0
Age                          0
ExperienceInCurrentDomain    0
Education_Bachelors          0
Education_Masters            0
Education_PHD                0
City_Bangalore               0
City_New Delhi               0
City_Pune                    0
Gender_Male                  0
EverBenched_Yes              0
Target                       0
dtype: int64

In [27]:
# Write the processed splits to CSV; the row index is not written
train_data.to_csv(path_or_buf='Train_data.csv', index=False)
test_data.to_csv(path_or_buf='Test_data.csv', index=False)