# Get Data

In [167]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Name of the label column in the dataset
target = 'Segmentation'

# Read the training split from disk; the test split stays commented out here
train_split = pd.read_csv('train_split.csv')
# test_split = pd.read_csv('test_split.csv')

In [149]:
# Preview the first row of the loaded frame to inspect its columns
train_split.head(1)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,461859,Female,Yes,46,No,Artist,5.0,Low,5.0,Cat_6,C


# Imputation and Encoding

## Imputation

In [183]:
import pandas as pd

# Drop the ID column. The membership guard keeps this cell re-runnable after
# the column has already been removed.
if 'ID' in train_split.columns:
    train_split = train_split.drop(columns=['ID'])

# Fill value used for each imputed column, keyed by column name
imputation_stats = {}

# NOTE: every fill below assigns the result back to the column instead of
# calling `train_split[column].fillna(..., inplace=True)`. The in-place form
# operates on a temporary column selection: it is deprecated in pandas 2.x
# and leaves the frame unchanged once Copy-on-Write is enabled.

# Mode imputation for categorical columns
mode_imputation_columns = ['Ever_Married', 'Graduated', 'Var_1']
for column in mode_imputation_columns:
    # mode() returns a Series (there may be ties); take its first entry
    imputation_stats[column] = train_split[column].mode()[0]
    train_split[column] = train_split[column].fillna(imputation_stats[column])

# Constant value imputation for Profession
constant_value_imputation_columns = {'Profession': 'Unknown'}
for column, value in constant_value_imputation_columns.items():
    imputation_stats[column] = value
    train_split[column] = train_split[column].fillna(value)

# Median imputation for numerical columns
median_imputation_columns = ['Work_Experience', 'Family_Size']
for column in median_imputation_columns:
    imputation_stats[column] = train_split[column].median()
    train_split[column] = train_split[column].fillna(imputation_stats[column])

# Confirm imputation: count the remaining missing values per column
print(train_split.isnull().sum())

Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
Segmentation       0
dtype: int64


In [185]:
# Persist the per-column fill values as a two-column (Column, Value) table
imputation_stats_df = pd.DataFrame({
    'Column': list(imputation_stats.keys()),
    'Value': list(imputation_stats.values()),
})
imputation_stats_df.to_csv('imputation_stats.csv', index=False)
print('imputation_stats saved')

imputation_stats saved


## Categorical Columns Encoding

In [188]:
# One-hot encode the nominal categorical columns. drop_first=True omits the
# first level of each column, so k categories become k-1 indicator columns.
one_hot_columns = ['Gender', 'Profession', 'Ever_Married', 'Graduated', 'Var_1']
train_split = pd.get_dummies(train_split, columns=one_hot_columns, drop_first=True)

# Ordinal encoding for Spending_Score (High=1, Average=2, Low=3).
# Series.map leaves NaN for any value that is not a key of the mapping.
spending_score_mapping = {'Average': 2, 'High': 1, 'Low': 3}
train_split['Spending_Score'] = train_split['Spending_Score'].map(spending_score_mapping)

# Convert boolean indicator columns to 0 and 1.
# astype replaces the deprecated element-wise DataFrame.applymap(int) and
# yields the same int64 values without the FutureWarning.
boolean_columns = [col for col in train_split.columns if train_split[col].dtype == 'bool']
train_split[boolean_columns] = train_split[boolean_columns].astype('int64')

  train_split[boolean_columns] = train_split[boolean_columns].applymap(int)


In [158]:
# Preview the first rows of the frame after the encoding step
train_split.head()

Unnamed: 0,Age,Work_Experience,Spending_Score,Family_Size,Segmentation,Gender_Male,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,...,Profession_Marketing,Profession_Unknown,Ever_Married_Yes,Graduated_Yes,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7
0,46,5.0,3,5.0,C,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,48,1.0,3,3.0,C,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,52,0.0,2,2.0,C,0,1,0,0,0,...,0,0,1,1,0,0,0,0,1,0
3,23,0.0,3,4.0,C,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,60,1.0,2,5.0,B,1,0,0,1,0,...,0,0,1,1,0,1,0,0,0,0


## Target Encoding

Because "Segmentation" represents multiple classes with no ordinal relationship between them, one-hot encoding is a suitable choice for the target.

In [162]:
# One-hot encode the target column for multi-class classification.
# drop_first=False keeps one indicator column per class.
train_split = pd.get_dummies(train_split, columns=['Segmentation'], drop_first=False)

# Convert boolean indicator columns to 0 and 1.
# astype replaces the deprecated element-wise DataFrame.applymap(int) and
# yields the same int64 values without the FutureWarning.
boolean_columns = [col for col in train_split.columns if train_split[col].dtype == 'bool']
train_split[boolean_columns] = train_split[boolean_columns].astype('int64')

# Verify the one-hot encoded target variable columns
print(train_split[['Segmentation_A', 'Segmentation_B', 'Segmentation_C', 'Segmentation_D']].head())


   Segmentation_A  Segmentation_B  Segmentation_C  Segmentation_D
0               0               0               1               0
1               0               0               1               0
2               0               0               1               0
3               0               0               1               0
4               0               1               0               0


  train_split[boolean_columns] = train_split[boolean_columns].applymap(int)


In [164]:
# Write the processed frame to CSV; index=False leaves the row index out of the file
train_split.to_csv('train_split_processed.csv', index=False)
print('File exported')

File exported
