In [38]:
import pandas as pd
import numpy as np

In [39]:
df = pd.DataFrame({
    'Age': [25, np.nan, 32, 45],
    'Income': [50000, 60000, np.nan, 80000],
    'SpendingScore': [39, 81, 6, np.nan],
    'Gender': ['Male', 'Female', 'Male', 'Female'],
    'MaritalStatus': ['Single', 'Married', 'Single', 'Married']
})
df

Unnamed: 0,Age,Income,SpendingScore,Gender,MaritalStatus
0,25.0,50000.0,39.0,Male,Single
1,,60000.0,81.0,Female,Married
2,32.0,,6.0,Male,Single
3,45.0,80000.0,,Female,Married


### Handling Missing Values (Imputation)

In [40]:
# Impute numerical features with mea
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Income'].fillna(df['Income'].median(), inplace=True)
df['SpendingScore'].fillna(df['SpendingScore'].mode()[0], inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Income'].fillna(df['Income'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Unnamed: 0,Age,Income,SpendingScore,Gender,MaritalStatus
0,25.0,50000.0,39.0,Male,Single
1,34.0,60000.0,81.0,Female,Married
2,32.0,60000.0,6.0,Male,Single
3,45.0,80000.0,6.0,Female,Married


In [41]:
#  Column-wise strategy with mapping
impute_map = {
    'Age': df['Age'].mean(),
    'Income': df['Income'].median(),
    'SpendingScore': df['SpendingScore'].mode()[0]
}
df.fillna(impute_map)
df

Unnamed: 0,Age,Income,SpendingScore,Gender,MaritalStatus
0,25.0,50000.0,39.0,Male,Single
1,34.0,60000.0,81.0,Female,Married
2,32.0,60000.0,6.0,Male,Single
3,45.0,80000.0,6.0,Female,Married


In [42]:
from sklearn.impute import SimpleImputer
# Only numeric columns
num_cols = df.select_dtypes(include=np.number).columns
# Mean imputation
imputer = SimpleImputer(strategy='mean')
df[num_cols] = imputer.fit_transform(df[num_cols].round(2))
df

Unnamed: 0,Age,Income,SpendingScore,Gender,MaritalStatus
0,25.0,50000.0,39.0,Male,Single
1,34.0,60000.0,81.0,Female,Married
2,32.0,60000.0,6.0,Male,Single
3,45.0,80000.0,6.0,Female,Married


###  Feature Scaling (Standardization / Normalization)

In [43]:
df['Income_Normalized'] = (df['Income'] - df['Income'].min()) / (df['Income'].max() - df['Income'].min())
df

Unnamed: 0,Age,Income,SpendingScore,Gender,MaritalStatus,Income_Normalized
0,25.0,50000.0,39.0,Male,Single,0.0
1,34.0,60000.0,81.0,Female,Married,0.333333
2,32.0,60000.0,6.0,Male,Single,0.333333
3,45.0,80000.0,6.0,Female,Married,1.0


In [44]:
df['Age_Standardized'] = (df['Age'] - df['Age'].mean()) / df['Age'].std()
df

Unnamed: 0,Age,Income,SpendingScore,Gender,MaritalStatus,Income_Normalized,Age_Standardized
0,25.0,50000.0,39.0,Male,Single,0.0,-1.086099
1,34.0,60000.0,81.0,Female,Married,0.333333,0.0
2,32.0,60000.0,6.0,Male,Single,0.333333,-0.241355
3,45.0,80000.0,6.0,Female,Married,1.0,1.327455


In [45]:
# from sklearn.preprocessing import MinMaxScaler, StandardScaler

# scaler = StandardScaler()
# min_max  = MinMaxScaler()
# df[['Age', 'Income', 'SpendingScore']] = scaler.fit_transform(df[['Age', 'Income', 'SpendingScore']])
# df[['Age', 'Income', 'SpendingScore']] = min_max.fit_transform(df[['Age', 'Income', 'SpendingScore']])
# df


### Feature Encoding (Categorical Variables)

In [46]:
# Label Encoding
df['Gender_Label'] = df['Gender'].map({'Male': 0, 'Female': 1})
df

Unnamed: 0,Age,Income,SpendingScore,Gender,MaritalStatus,Income_Normalized,Age_Standardized,Gender_Label
0,25.0,50000.0,39.0,Male,Single,0.0,-1.086099,0
1,34.0,60000.0,81.0,Female,Married,0.333333,0.0,1
2,32.0,60000.0,6.0,Male,Single,0.333333,-0.241355,0
3,45.0,80000.0,6.0,Female,Married,1.0,1.327455,1


In [47]:
# One-hot Encoding
df = pd.get_dummies(df, columns=['MaritalStatus'], drop_first=True)
df

Unnamed: 0,Age,Income,SpendingScore,Gender,Income_Normalized,Age_Standardized,Gender_Label,MaritalStatus_Single
0,25.0,50000.0,39.0,Male,0.0,-1.086099,0,True
1,34.0,60000.0,81.0,Female,0.333333,0.0,1,False
2,32.0,60000.0,6.0,Male,0.333333,-0.241355,0,True
3,45.0,80000.0,6.0,Female,1.0,1.327455,1,False


In [51]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer([
    ('onehot', OneHotEncoder(drop='first'), ['Gender']),
], remainder='passthrough')

df_transformed = ct.fit_transform(df)
df

Unnamed: 0,Age,Income,SpendingScore,Gender,Income_Normalized,Age_Standardized,Gender_Label,MaritalStatus_Single
0,25.0,50000.0,39.0,Male,0.0,-1.086099,0,True
1,34.0,60000.0,81.0,Female,0.333333,0.0,1,False
2,32.0,60000.0,6.0,Male,0.333333,-0.241355,0,True
3,45.0,80000.0,6.0,Female,1.0,1.327455,1,False


In [59]:
def preprocess_customer_data(df):
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    import pandas as pd

    # Select features
    num_cols = ['Age', 'Income', 'SpendingScore']
    cat_cols = ['Gender', 'MaritalStatus']

    # Pipelines
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first'))
    ])

    # Full pipeline
    full_pipeline = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ])

    # Fit-transform the data
    processed_data = full_pipeline.fit_transform(df)

    # Get feature names and create DataFrame
    feature_names = full_pipeline.get_feature_names_out()
    processed_df = pd.DataFrame(processed_data, columns=feature_names)

    # Return both processed DataFrame and fitted pipeline
    return processed_df, full_pipeline

In [61]:
# Create example data
df = pd.DataFrame({
    'Age': [25, np.nan, 32, 45],
    'Income': [50000, 60000, np.nan, 80000],
    'SpendingScore': [39, 81, 6, np.nan],
    'Gender': ['Male', 'Female', 'Male', 'Female'],
    'MaritalStatus': ['Single', 'Married', 'Single', 'Married']
})

# Get both processed data and pipeline
processed_df, pipeline = preprocess_customer_data(df)

processed_df


Unnamed: 0,num__Age,num__Income,num__SpendingScore,cat__Gender_Male,cat__MaritalStatus_Single
0,-1.254119,-1.234427,-0.112867,1.0,1.0
1,0.0,-0.308607,1.467265,0.0,0.0
2,-0.278693,0.0,-1.354398,1.0,1.0
3,1.532813,1.543033,0.0,0.0,0.0


In [62]:
new_data = pd.DataFrame({
    'Age': [28],
    'Income': [70000],
    'SpendingScore': [50],
    'Gender': ['Female'],
    'MaritalStatus': ['Single']
})

new_processed = pipeline.transform(new_data)
new_df = pd.DataFrame(new_processed, columns=pipeline.get_feature_names_out())

new_df

Unnamed: 0,num__Age,num__Income,num__SpendingScore,cat__Gender_Male,cat__MaritalStatus_Single
0,-0.83608,0.617213,0.300977,0.0,1.0


In [66]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [67]:
df = pd.read_csv("Mall_Customers.csv")
df.columns = df.columns.str.strip().str.replace(" ", "_")
df

Unnamed: 0,CustomerID,Gender,Age,Annual_Income_(k$),Spending_Score_(1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [69]:
# Simulate missing values
df.loc[2, 'Annual_Income_(k$)'] = np.nan
df.loc[4, 'Spending_Score_(1-100)'] = np.nan
df

Unnamed: 0,CustomerID,Gender,Age,Annual_Income_(k$),Spending_Score_(1-100)
0,1,Male,19,15.0,39.0
1,2,Male,21,15.0,81.0
2,3,Female,20,,6.0
3,4,Female,23,16.0,77.0
4,5,Female,31,17.0,


In [None]:
# Define Column Types
num_cols = ['Age', 'Annual_Income_(k$)', 'Spending_Score_(1-100)']
cat_cols = ['Gender']

In [71]:
# Numerical Pipeline – KNN Imputer + StandardScaler
num_pipeline = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)),
    ('scaler', StandardScaler())
])

# Categorical Pipeline – OneHotEncoding with drop
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])

# Full ColumnTransformer
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Transform the data
processed = full_pipeline.fit_transform(df)

# Final DataFrame (for model input)
processed_df = pd.DataFrame(
    processed,
    columns=[
        'Age_scaled', 'Income_scaled', 'Score_scaled',
        'Gender_Female'
    ]
)

print(processed_df.head())

   Age_scaled  Income_scaled  Score_scaled  Gender_Female
0   -0.882053      -1.069045     -0.526006            1.0
1   -0.417815      -1.069045      0.973467            1.0
2   -0.649934       0.267261     -1.704163            0.0
3    0.046424       0.267261      0.830660            0.0
4    1.903377       1.603567      0.426041            0.0


In [75]:
# # Add Age*SpendingScore interaction
# df['Age_Score_Interaction'] = df['Age'] * df['Spending_Score_(1-100)']

# # Add Income Bins (for decision tree-based models)
# df['Income_Binned'] = pd.qcut(df['Annual_Income_(k$)'], q=4, labels=False)

In [76]:
df = pd.read_csv("customer_data.csv")
df

Unnamed: 0,customer_id,signup_date,age,income,device_type,region,past_7d_clicks,avg_session_time,churned
0,123,2022-01-01,25.0,60000.0,mobile,US,12,5.1,0
1,124,2021-11-20,,72000.0,desktop,CA,5,,1
2,125,2023-03-05,33.0,,mobile,IN,9,4.7,0


In [77]:
# Grouped median imputation based on region
df['income'] = df.groupby('region')['income'].transform(lambda x: x.fillna(x.median()))

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [80]:
# ML-based imputation using IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
imp = IterativeImputer(estimator=BayesianRidge(), max_iter=10)
df[['age', 'income']] = imp.fit_transform(df[['age', 'income']])
df

Unnamed: 0,customer_id,signup_date,age,income,device_type,region,past_7d_clicks,avg_session_time,churned
0,123,2022-01-01,25.0,60000.0,mobile,US,12,5.1,0
1,124,2021-11-20,41.0,72000.0,desktop,CA,5,,1
2,125,2023-03-05,33.0,66000.000132,mobile,IN,9,4.7,0


In [81]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [82]:
num_features = ['age', 'income', 'past_7d_clicks', 'avg_session_time']
num_pipeline = Pipeline([
    ('scaler', RobustScaler())  # More robust than StandardScaler to outliers
])

# Apply scaling only to numerical columns
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features)
])

In [83]:
# Frequency Encoding
for col in ['region', 'device_type']:
    freq = df[col].value_counts(normalize=True)
    df[col + '_freq'] = df[col].map(freq)

In [85]:
df['signup_date'] = pd.to_datetime(df['signup_date'])
df['signup_month'] = df['signup_date'].dt.month
df['account_age_days'] = (pd.Timestamp.now() - df['signup_date']).dt.days

In [86]:
df['income_per_session'] = df['income'] / (df['avg_session_time'] + 1)
df['age_squared'] = df['age'] ** 2
df['log_income'] = np.log1p(df['income'])

In [87]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define columns
num_cols = ['age', 'income', 'past_7d_clicks', 'avg_session_time']
cat_cols = ['device_type', 'region']

# Build pipelines
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine
full_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])

# Final pipeline with RandomForest
from sklearn.ensemble import RandomForestClassifier

full_model_pipeline = Pipeline([
    ('features', full_pipe),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))
])


In [89]:
X = df.drop('churned', axis=1)
y = df['churned']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
full_model_pipeline.fit(X_train, y_train)

# Predict
y_pred = full_model_pipeline.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 1.0
