In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw, tab-delimited export and trim stray whitespace from every
# header so later cells can address columns by their clean names.
df = pd.read_csv(
    "../data/raw_data.csv",
    sep="\t",
)
df.columns = [name.strip() for name in df.columns]

In [3]:
# Convert the Dt_Customer column from text to datetime64. ``dayfirst=True``
# tells the parser to read ambiguous dates as day/month rather than month/day.
df["Dt_Customer"] = pd.to_datetime(
    arg=df["Dt_Customer"],
    dayfirst=True,
)

In [4]:
# Drop rows that are mostly empty: a row is kept only when at least half of
# its columns are non-null. ``thresh`` is documented as an integer count, so
# the half-way point is rounded up with integer arithmetic (this selects
# exactly the same rows as comparing against ``len(df.columns) * 0.5``).
threshold = (len(df.columns) + 1) // 2
df = df.dropna(thresh=threshold)

# Numeric gaps are filled with the column median.
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical (object) gaps are filled with the most frequent value.
cat_cols = df.select_dtypes(include="object").columns
for col in cat_cols:
    modes = df[col].mode()
    if modes.empty:
        # The column has no non-null values at all, so there is no mode to
        # impute with; indexing ``mode()[0]`` here would raise. Leave as-is.
        continue
    df[col] = df[col].fillna(modes.iloc[0])

In [5]:
def detect_outliers_iqr(dataframe, column):
    """Compute the 1.5 * IQR outlier fences for one column.

    Args:
        dataframe: Frame that contains ``column``.
        column: Label of the column to analyse.

    Returns:
        A ``(lower, upper)`` tuple where ``lower`` is the first quartile minus
        1.5 times the inter-quartile range and ``upper`` is the third quartile
        plus 1.5 times the inter-quartile range. The function only returns the
        bounds; it does not flag or modify any rows.
    """
    values = dataframe[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    # Distance from each quartile to its fence.
    reach = 1.5 * (third_quartile - first_quartile)
    return first_quartile - reach, third_quartile + reach

# Cap extreme values of each numeric feature at its 1.5 * IQR fences.
#
# Two kinds of column must NOT be clipped:
#   * "ID" and "Response" - the same columns the scaling cell excludes;
#     presumably the row identifier and the target label (confirm against the
#     data dictionary). Clipping them would alter keys / labels.
#   * Any column whose IQR is zero (Q1 == Q3), e.g. a sparse 0/1 flag. Its
#     fences coincide, so ``clip(lower, upper)`` would overwrite every value
#     with that single number and wipe out the column's information.
clip_exclude = ("ID", "Response")
num_cols = df.select_dtypes(include=["int64", "float64"]).columns

for col in num_cols:
    if col in clip_exclude:
        continue
    lower, upper = detect_outliers_iqr(df, col)
    # ``not upper > lower`` also covers NaN bounds (all-null column).
    if not upper > lower:
        continue
    df[col] = df[col].clip(lower, upper)

In [6]:
# One-hot encode every remaining object-typed column. ``drop_first=True``
# omits the first level of each column so the indicators are not redundant.
cat_cols = df.select_dtypes(include=["object"]).columns
df = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

In [7]:
# Standardise the int64/float64 columns, leaving "ID" and "Response"
# untouched.
exclude_cols = ["ID", "Response"]
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
scale_cols = list(filter(lambda name: name not in exclude_cols, num_cols))

# Fit the scaler on the selected columns and write the scaled values back
# into the same columns.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[scale_cols])
df[scale_cols] = scaled_values

In [8]:
# Sanity-check the cleaned frame: schema, remaining nulls, and a preview.
df.info()
# A bare expression in the middle of a cell is evaluated but never rendered
# (only the last expression is shown), so the null counts are printed
# explicitly to make the check visible.
print(df.isna().sum())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 38 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   ID                       2240 non-null   int64         
 1   Year_Birth               2240 non-null   float64       
 2   Income                   2240 non-null   float64       
 3   Kidhome                  2240 non-null   float64       
 4   Teenhome                 2240 non-null   float64       
 5   Dt_Customer              2240 non-null   datetime64[ns]
 6   Recency                  2240 non-null   float64       
 7   MntWines                 2240 non-null   float64       
 8   MntFruits                2240 non-null   float64       
 9   MntMeatProducts          2240 non-null   float64       
 10  MntFishProducts          2240 non-null   float64       
 11  MntSweetProducts         2240 non-null   float64       
 12  MntGoldProds             2240 non-

Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,...,Education_Graduation,Education_Master,Education_PhD,Marital_Status_Alone,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_YOLO
0,5524,-1.007064,0.299651,-0.825218,-0.929894,2012-09-04,0.307039,1.005751,2.176816,2.210169,...,True,False,False,False,False,False,True,False,False,False
1,2174,-1.261969,-0.263808,1.032559,0.906934,2014-03-08,-0.383664,-0.879539,-0.758828,-0.810358,...,True,False,False,False,False,False,True,False,False,False
2,4141,-0.327318,0.94342,-0.825218,-0.929894,2013-08-21,-0.798086,0.3743,1.002558,-0.133536,...,True,False,False,False,False,False,False,True,False,False
3,6182,1.28708,-1.204881,1.032559,-0.929894,2014-02-10,-0.798086,-0.879539,-0.648741,-0.732048,...,True,False,False,False,False,False,False,True,False,False
4,5324,1.032175,0.307056,1.032559,-0.929894,2014-01-19,1.550305,-0.390089,0.782385,-0.183879,...,False,False,True,False,False,True,False,False,False,False


In [9]:
# Persist the processed frame; ``index=False`` keeps the row index out of
# the output file.
df.to_csv(
    path_or_buf="../data/cleaned_data.csv",
    index=False,
)