# In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Load the dataset
# The CSV is parsed into the DataFrame `data`
data = pd.read_csv('data/mall_customers.csv')

# Preview the first five rows as a quick sanity check of the parsed columns
print(data.head(5))

# Missing-value audit
# Per-column count of null entries, printed under its heading
print("Missing values per column:", data.isnull().sum(), sep="\n")

# Descriptive statistics
# Output of DataFrame.describe(), printed under its heading
print("\nDescriptive statistics:", data.describe(), sep="\n")

# Data visualization: Histograms
# One 15-bin histogram panel per numerical feature
numeric_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
data[numeric_features].hist(figsize=(12, 6), bins=15)
plt.suptitle('Distribution of Numerical Features', fontsize=16)
plt.show()

# Data visualization: Gender distribution
# Bar per distinct value of the Gender column, showing how many rows carry it
gender_ax = sns.countplot(data=data, x='Gender')
gender_ax.set_title('Gender Distribution', fontsize=14)
plt.show()

# Data visualization: Scatter plot
# Annual income on x, spending score on y, point color keyed to Gender
income_fig, income_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Gender',
    palette='Set1',
    ax=income_ax,
)
income_ax.set_title('Annual Income vs Spending Score by Gender', fontsize=14)
plt.show()

# Prepare data for clustering
# Clustering uses only these two columns of `data`
X = data[['Annual Income (k$)', 'Spending Score (1-100)']]

# KMeans clustering
# Creating the KMeans model with 5 clusters.
# n_init is pinned explicitly: scikit-learn 1.2-1.3 warn when it is omitted,
# and 1.4+ changed the default from 10 to 'auto' (a single k-means++ run),
# so leaving it out makes the resulting clusters depend on the installed
# version even though random_state is fixed.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# Fit the model and attach each row's cluster label to the original dataset
# (fit_predict returns the same labels that fit() stores in kmeans.labels_)
data['Cluster'] = kmeans.fit_predict(X)

# Display first 5 rows with cluster labels
print(data.head())

# Visualize clusters
# Same income-vs-spending scatter, with point color keyed to the Cluster column
cluster_fig, cluster_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette='Set1',
    ax=cluster_ax,
)
cluster_ax.set_title('KMeans Clustering of Customers', fontsize=14)
plt.show()