In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the data
df = pd.read_csv('large_employee_data.csv')
print("Original DataFrame head:")
print(df.head())
print("\nOriginal DataFrame info:")
print(df.info())

Original DataFrame head:
    ID              Name  Age Department  Salary
0  497  Charlie Martinez   45         IT   39866
1  395       Diana Smith   49    Finance   73108
2  590       Diana Davis   41    Finance   76486
3  918     Charlie Jones   58         HR   36971
4  654  George Rodriguez   26  Marketing  103445

Original DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          1050 non-null   int64 
 1   Name        1026 non-null   object
 2   Age         1050 non-null   int64 
 3   Department  1050 non-null   object
 4   Salary      1050 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 41.1+ KB
None


In [3]:
# 1. Label Encoding
print("\n1. Label Encoding")
label_encoder = LabelEncoder()


1. Label Encoding


In [4]:
# Create a copy of the dataframe
df_label = df.copy()

In [5]:
# Apply label encoding to 'Department' and 'Name'
df_label['Department_encoded'] = label_encoder.fit_transform(df['Department'])
df_label['Name_encoded'] = label_encoder.fit_transform(df['Name'])

In [6]:
print("\nDataFrame after label encoding:")
print(df_label[['Department', 'Department_encoded', 'Name', 'Name_encoded']].head())


DataFrame after label encoding:
  Department  Department_encoded              Name  Name_encoded
0         IT                   2  Charlie Martinez            24
1    Finance                   0       Diana Smith            37
2    Finance                   0       Diana Davis            30
3         HR                   1     Charlie Jones            23
4  Marketing                   3  George Rodriguez            66


In [7]:
# 2. One-Hot Encoding
print("\n2. One-Hot Encoding")
# Initialize the OneHotEncoder
onehot = OneHotEncoder(sparse_output=False)


2. One-Hot Encoding


In [8]:
# Perform one-hot encoding on 'Department'
department_encoded = onehot.fit_transform(df[['Department']])
department_columns = onehot.get_feature_names_out(['Department'])

In [9]:
# Create DataFrame with one-hot encoded columns
df_onehot = pd.concat([
    df,
    pd.DataFrame(department_encoded, columns=department_columns)
], axis=1)

In [10]:
print("\nDataFrame after one-hot encoding:")
print(df_onehot.head())


DataFrame after one-hot encoding:
    ID              Name  Age Department  Salary  Department_Finance  \
0  497  Charlie Martinez   45         IT   39866                 0.0   
1  395       Diana Smith   49    Finance   73108                 1.0   
2  590       Diana Davis   41    Finance   76486                 1.0   
3  918     Charlie Jones   58         HR   36971                 0.0   
4  654  George Rodriguez   26  Marketing  103445                 0.0   

   Department_HR  Department_IT  Department_Marketing  Department_Operations  \
0            0.0            1.0                   0.0                    0.0   
1            0.0            0.0                   0.0                    0.0   
2            0.0            0.0                   0.0                    0.0   
3            1.0            0.0                   0.0                    0.0   
4            0.0            0.0                   1.0                    0.0   

   Department_Sales  
0               0.0  
1      

In [11]:
# 3. Feature Scaling
print("\n3. Feature Scaling")


3. Feature Scaling


In [12]:
# Create copies for different scaling methods
df_standard = df.copy()
df_minmax = df.copy()

In [13]:
# Initialize scalers
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

In [14]:
# Standardization (z-score normalization)
numeric_columns = ['Age', 'Salary']
df_standard[['Age_standardized', 'Salary_standardized']] = standard_scaler.fit_transform(df[numeric_columns])

In [15]:
# Min-Max Normalization
df_minmax[['Age_normalized', 'Salary_normalized']] = minmax_scaler.fit_transform(df[numeric_columns])

In [16]:
print("\nStandardization results:")
print("Original vs Standardized values:")
print(pd.DataFrame({
    'Original_Age': df['Age'],
    'Standardized_Age': df_standard['Age_standardized'],
    'Original_Salary': df['Salary'],
    'Standardized_Salary': df_standard['Salary_standardized']
}).head())


Standardization results:
Original vs Standardized values:
   Original_Age  Standardized_Age  Original_Salary  Standardized_Salary
0            45          0.157312            39866            -1.287893
1            49          0.479454            73108             0.008795
2            41         -0.164829            76486             0.140562
3            58          1.204271            36971            -1.400820
4            26         -1.372859           103445             1.192166


In [17]:
print("\nMin-Max Normalization results:")
print("Original vs Normalized values:")
print(pd.DataFrame({
    'Original_Age': df['Age'],
    'Normalized_Age': df_minmax['Age_normalized'],
    'Original_Salary': df['Salary'],
    'Normalized_Salary': df_minmax['Salary_normalized']
}).head())


Min-Max Normalization results:
Original vs Normalized values:
   Original_Age  Normalized_Age  Original_Salary  Normalized_Salary
0            45        0.547619            39866           0.108160
1            49        0.642857            73108           0.478672
2            41        0.452381            76486           0.516323
3            58        0.857143            36971           0.075893
4            26        0.095238           103445           0.816806


In [18]:
# 4. Combining all features
print("\n4. Creating final feature-engineered dataset")


4. Creating final feature-engineered dataset


In [19]:
# Combine numeric features (standardized) with one-hot encoded categories
final_df = pd.concat([
    df_standard[['Age_standardized', 'Salary_standardized']],
    pd.DataFrame(department_encoded, columns=department_columns),
    df[['ID']]  # Keep the ID column
], axis=1)

In [20]:
print("\nFinal feature-engineered dataset:")
print(final_df.head())


Final feature-engineered dataset:
   Age_standardized  Salary_standardized  Department_Finance  Department_HR  \
0          0.157312            -1.287893                 0.0            0.0   
1          0.479454             0.008795                 1.0            0.0   
2         -0.164829             0.140562                 1.0            0.0   
3          1.204271            -1.400820                 0.0            1.0   
4         -1.372859             1.192166                 0.0            0.0   

   Department_IT  Department_Marketing  Department_Operations  \
0            1.0                   0.0                    0.0   
1            0.0                   0.0                    0.0   
2            0.0                   0.0                    0.0   
3            0.0                   0.0                    0.0   
4            0.0                   1.0                    0.0   

   Department_Sales   ID  
0               0.0  497  
1               0.0  395  
2               0.

In [21]:
# 5. Save processed datasets
df_label.to_csv('label_encoded_data.csv', index=False)
df_onehot.to_csv('onehot_encoded_data.csv', index=False)
df_standard.to_csv('standardized_data.csv', index=False)
df_minmax.to_csv('normalized_data.csv', index=False)
final_df.to_csv('final_engineered_data.csv', index=False)

In [22]:
# 6. Summary statistics
print("\n5. Summary Statistics")
print("\nStandardized features statistics:")
print(final_df[['Age_standardized', 'Salary_standardized']].describe())


5. Summary Statistics

Standardized features statistics:
       Age_standardized  Salary_standardized
count      1.050000e+03         1.050000e+03
mean      -2.706829e-17        -1.454921e-16
std        1.000477e+00         1.000477e+00
min       -1.695000e+00        -1.666422e+00
25%       -8.896468e-01        -8.445922e-01
50%        7.677700e-02        -3.516680e-02
75%        8.821302e-01         8.458276e-01
max        1.687483e+00         1.833293e+00


In [23]:
# Print shapes of all datasets
print("\nDataset Shapes:")
print(f"Original data: {df.shape}")
print(f"Label encoded data: {df_label.shape}")
print(f"One-hot encoded data: {df_onehot.shape}")
print(f"Final engineered data: {final_df.shape}")


Dataset Shapes:
Original data: (1050, 5)
Label encoded data: (1050, 7)
One-hot encoded data: (1050, 11)
Final engineered data: (1050, 9)


In [24]:
# Print memory usage
print("\nMemory Usage of Different Encodings:")
print(f"Original data: {df.memory_usage().sum() / 1024**2:.2f} MB")
print(f"Label encoded data: {df_label.memory_usage().sum() / 1024**2:.2f} MB")
print(f"One-hot encoded data: {df_onehot.memory_usage().sum() / 1024**2:.2f} MB")
print(f"Final engineered data: {final_df.memory_usage().sum() / 1024**2:.2f} MB")


Memory Usage of Different Encodings:
Original data: 0.04 MB
Label encoded data: 0.05 MB
One-hot encoded data: 0.09 MB
Final engineered data: 0.07 MB
