In [1]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "CustomerID": np.arange(1, 201),
    "Age": np.random.randint(18, 60, 200),
    "Gender": np.random.choice(["Male", "Female"], 200),
    "PurchaseAmount": np.random.randint(200, 7000, 200),
    "City": np.random.choice(["Delhi", "Mumbai", "Kolkata", "Chennai", "Bangalore"], 200),
    "Membership": np.random.choice(["None", "Silver", "Gold", "Platinum"], 200)
})

df.to_csv("customer_data_raw.csv", index=False)

df.head()


Unnamed: 0,CustomerID,Age,Gender,PurchaseAmount,City,Membership
0,1,59,Male,2689,Chennai,Silver
1,2,53,Female,5558,Chennai,Gold
2,3,19,Male,2663,Delhi,Platinum
3,4,20,Male,755,Chennai,Gold
4,5,27,Female,1799,Bangalore,Gold


In [2]:
import pandas as pd

df = pd.read_csv("customer_data_raw.csv")
df.head()


Unnamed: 0,CustomerID,Age,Gender,PurchaseAmount,City,Membership
0,1,59,Male,2689,Chennai,Silver
1,2,53,Female,5558,Chennai,Gold
2,3,19,Male,2663,Delhi,Platinum
3,4,20,Male,755,Chennai,Gold
4,5,27,Female,1799,Bangalore,Gold


In [3]:
# Remove duplicates
df = df.drop_duplicates()

# Handle missing values
df = df.fillna({
    "Age": df["Age"].median(),
    "Gender": df["Gender"].mode()[0],
    "PurchaseAmount": df["PurchaseAmount"].median(),
    "City": df["City"].mode()[0],
    "Membership": df["Membership"].mode()[0]
})

# Convert types
df["Age"] = df["Age"].astype(int)
df["PurchaseAmount"] = df["PurchaseAmount"].astype(int)

# Summary stats
summary = df.describe(include="all")

df.head(), summary


(   CustomerID  Age  Gender  PurchaseAmount       City Membership
 0           1   59    Male            2689    Chennai     Silver
 1           2   53  Female            5558    Chennai       Gold
 2           3   19    Male            2663      Delhi   Platinum
 3           4   20    Male             755    Chennai       Gold
 4           5   27  Female            1799  Bangalore       Gold,
         CustomerID         Age Gender  PurchaseAmount     City Membership
 count   200.000000  200.000000    200      200.000000      200        200
 unique         NaN         NaN      2             NaN        5          3
 top            NaN         NaN   Male             NaN  Chennai     Silver
 freq           NaN         NaN    104             NaN       47        112
 mean    100.500000   38.945000    NaN     3401.925000      NaN        NaN
 std      57.879185   12.283583    NaN     1887.472112      NaN        NaN
 min       1.000000   18.000000    NaN      210.000000      NaN        NaN
 25

In [4]:
df.to_csv("customer_data_cleaned.csv", index=False)
