# **Importing the libraries**

In [24]:
# Data Handling
import pandas as pd                  # Data manipulation and analysis
import numpy as np                   # Numerical operations

# Data Visualization
import matplotlib.pyplot as plt       # Basic plotting
import seaborn as sns                 # Statistical data visualization
import plotly.express as px           # Interactive plots
import plotly.graph_objects as go     # Custom interactive plots
import polars as pl                   # Fast DataFrame library, alternative to pandas for big data

# Machine Learning - Models
from sklearn.linear_model import LinearRegression, LogisticRegression  # Linear models
from sklearn.tree import DecisionTreeClassifier                        # Decision Trees
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  # Ensembles
from sklearn.cluster import KMeans, DBSCAN                            # Clustering
from sklearn.svm import SVC                                            # Support Vector Classifier
from sklearn.neighbors import KNeighborsClassifier                     # K-Nearest Neighbors
from xgboost import XGBClassifier                                      # Gradient Boosting (XGBoost)
from lightgbm import LGBMClassifier                                    # Gradient Boosting (LightGBM)

# Machine Learning - Preprocessing and Metrics
from sklearn.model_selection import train_test_split, GridSearchCV     # Splitting and model tuning
from sklearn.preprocessing import StandardScaler, LabelEncoder         # Scaling and encoding
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # Metrics
from sklearn.impute import SimpleImputer                               # Handling missing values

# Additional EDA Libraries
from scipy import stats              # Statistical analysis
import missingno as msno             # Visualize missing values


# **Data Preprocessing**

# Step 1 : Reading the data

In [25]:
df = pl.read_csv("/content/customer_shopping_data.csv")
df

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul"""
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""
…,…,…,…,…,…,…,…,…,…
"""I219422""","""C441542""","""Female""",45,"""Souvenir""",5,58.65,"""Credit Card""","""21/09/2022""","""Kanyon"""
"""I325143""","""C569580""","""Male""",27,"""Food & Beverage""",2,10.46,"""Cash""","""22/09/2021""","""Forum Istanbul"""
"""I824010""","""C103292""","""Male""",63,"""Food & Beverage""",2,10.46,"""Debit Card""","""28/03/2021""","""Metrocity"""
"""I702964""","""C800631""","""Male""",56,"""Technology""",4,4200.0,"""Cash""","""16/03/2021""","""Istinye Park"""


# Step 2: Data Profiling

In [26]:
df.describe()

statistic,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,str,f64,str,f64,f64,str,str,str
"""count""","""99457""","""99457""","""99457""",99457.0,"""99457""",99457.0,99457.0,"""99457""","""99457""","""99457"""
"""null_count""","""0""","""0""","""0""",0.0,"""0""",0.0,0.0,"""0""","""0""","""0"""
"""mean""",,,,43.427089,,3.003429,689.256321,,,
"""std""",,,,14.990054,,1.413025,941.184567,,,
"""min""","""I100008""","""C100004""","""Female""",18.0,"""Books""",1.0,5.23,"""Cash""","""1/1/2021""","""Cevahir AVM"""
"""25%""",,,,30.0,,2.0,45.45,,,
"""50%""",,,,43.0,,3.0,203.3,,,
"""75%""",,,,56.0,,4.0,1200.32,,,
"""max""","""I999994""","""C999995""","""Male""",69.0,"""Toys""",5.0,5250.0,"""Debit Card""","""9/9/2022""","""Zorlu Center"""


# Step 3: Data Cleansing

# 1. Remove duplicates if there are any

In [27]:
df = df.unique(subset=['invoice_no'])

# 2. Handle missing values by imputing or removing

In [28]:
# Fill missing categorical columns with mode, numerical columns with median
df = df.with_columns([
    pl.col("gender").fill_null(pl.col("gender").mode()),
    pl.col("age").fill_null(pl.col("age").median()),
    pl.col("price").fill_null(pl.col("price").median()),
    pl.col("payment_method").fill_null("Unknown"),
    pl.col("category").fill_null("Unknown"),
    pl.col("shopping_mall").fill_null("Unknown")
])

In [29]:
df.null_count()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0


# Step 4: Data Transformation

In [30]:
df = df.with_columns(
    pl.col("invoice_date").str.strptime(pl.Date, "%d/%m/%Y")
)

In [31]:
df

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,f64,str,i64,f64,str,date,str
"""I502055""","""C801479""","""Female""",40.0,"""Toys""",3,107.52,"""Cash""",2022-02-28,"""Cevahir AVM"""
"""I518072""","""C217408""","""Male""",19.0,"""Clothing""",5,1500.4,"""Debit Card""",2021-05-04,"""Mall of Istanbul"""
"""I640144""","""C162081""","""Female""",41.0,"""Toys""",5,179.2,"""Credit Card""",2022-06-08,"""Metropol AVM"""
"""I297614""","""C324311""","""Male""",56.0,"""Clothing""",1,300.08,"""Cash""",2023-02-07,"""Mall of Istanbul"""
"""I549751""","""C137405""","""Male""",48.0,"""Souvenir""",3,35.19,"""Cash""",2021-05-27,"""Kanyon"""
…,…,…,…,…,…,…,…,…,…
"""I153981""","""C891871""","""Female""",54.0,"""Clothing""",3,900.24,"""Debit Card""",2022-11-04,"""Viaport Outlet"""
"""I241275""","""C258334""","""Male""",22.0,"""Cosmetics""",4,162.64,"""Cash""",2022-06-18,"""Istinye Park"""
"""I104069""","""C300457""","""Female""",33.0,"""Cosmetics""",1,40.66,"""Credit Card""",2021-12-19,"""Zorlu Center"""
"""I259343""","""C416091""","""Female""",52.0,"""Cosmetics""",4,162.64,"""Credit Card""",2021-03-02,"""Mall of Istanbul"""


# 2. Create new columns if needed for analysis

In [33]:
df = df.with_columns(
    (pl.col("quantity") * pl.col("price")).alias("total_spent")
)