# In [None]:
# Step 1: Understand Data
import pandas as pd

# Load the customer records; the CSV is read relative to the working directory.
customers_data = pd.read_csv("Mall_Customers.csv")

# Structural overview: first rows, (rows, columns), column labels and dtypes.
print(customers_data.head())
print(customers_data.shape)
print(customers_data.columns)
print(customers_data.dtypes)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
customers_data.info()

# Summary statistics from describe(), then the frequency of each Gender value.
print(customers_data.describe())
print(customers_data['Gender'].value_counts())

# Step 2: Label encode gender
from sklearn.preprocessing import LabelEncoder

# Learn the set of Gender values first, then overwrite the column with the
# integer code assigned to each value. The fitted encoder stays available as
# `le` so the codes can be mapped back later if needed.
le = LabelEncoder().fit(customers_data['Gender'])
customers_data['Gender'] = le.transform(customers_data['Gender'])

# Step 3: Check for variance
# describe() is kept as the overall summary, but it reports the standard
# deviation ('std'), not the variance. The per-column variance is therefore
# computed explicitly and printed so the check is actually visible.
variance_check = customers_data.describe()
print(customers_data.var(numeric_only=True))
# If variance is high for float columns, you need to normalize. Otherwise, ignore

# Step 4: Check skewness
# numeric_only=True keeps the call working even when a non-numeric column is
# still present in the frame.
skewness_check = customers_data.skew(numeric_only=True)
print(skewness_check)
# If skew value is greater than 0.75, then you can perform log transformation on those skew columns.
# The columns exceeding that threshold are listed explicitly:
print(skewness_check[skewness_check > 0.75])

# Step 5: Pair plot
import matplotlib.pyplot as plt
import seaborn as sns

# Build the grid of pairwise plots from the customer frame, then render it.
sns.pairplot(data=customers_data)
plt.show()

# Step 6: Build KMeans
from sklearn.cluster import KMeans

# Cluster on descriptive features only. An identifier column would add
# arbitrary distance between rows, and a 'Cluster' column left over from an
# earlier run of Step 8 would leak previous labels into the fit.
# errors='ignore' makes the drop a no-op for any column that is absent.
# NOTE(review): assumes 'CustomerID' (if present) is a row identifier - confirm.
features = customers_data.drop(columns=['CustomerID', 'Cluster'], errors='ignore')

# random_state makes the result reproducible between runs; n_init is spelled
# out because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)  # You can set clusters to any value
kmeans.fit(features)
print("Labels:", kmeans.labels_)
print("Cluster Centers:", kmeans.cluster_centers_)

# Step 7: Scatter plot
# Points are positioned by Age and Annual Income and coloured by cluster label.
sns.scatterplot(x='Age', y='Annual Income (k$)', data=customers_data, hue=kmeans.labels_)
plt.show()

# Step 8: Cluster Analysis
# Predict with the same feature columns the model was fitted on.
pred = kmeans.predict(features)
# Work on an explicit copy: pd.DataFrame(customers_data) does not copy, and on
# pandas versions without copy-on-write the new 'Cluster' column could then
# appear in customers_data and contaminate every later fit.
frame = customers_data.copy()
frame['Cluster'] = pred
grouped_customers = frame.groupby('Cluster')
# Print summary statistics for the members of each cluster.
for name, group in grouped_customers:
    print(f"Cluster {name}:\n{group.describe()}\n")

# Step 9: Find the best number of clusters
# Fit on descriptive features only: drop an identifier column and any
# 'Cluster' label column that an earlier step may have added to the frame.
# errors='ignore' makes the drop a no-op for any column that is absent.
# NOTE(review): assumes 'CustomerID' (if present) is a row identifier - confirm.
features = customers_data.drop(columns=['CustomerID', 'Cluster'], errors='ignore')

# KMeans cannot form more clusters than there are samples, so cap the sweep.
max_clusters = min(20, len(features))
cluster_counts = range(1, max_clusters + 1)

# Each trial model stays anonymous inside the comprehension, so the 5-cluster
# `kmeans` fitted in Step 6 is not overwritten by the last (largest) trial.
# A fixed random_state and explicit n_init keep the curve reproducible.
inertia_values = [
    KMeans(n_clusters=clusters, init='k-means++', n_init=10, random_state=42)
    .fit(features)
    .inertia_
    for clusters in cluster_counts
]

# Plot a line chart between cluster number and its inertia value
plt.plot(cluster_counts, inertia_values, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia Value')
plt.title('Elbow Method')
plt.show()

# Step 10: Reduce Dimensions using PCA
from sklearn.decomposition import PCA

# Project descriptive features only: drop an identifier column and any
# 'Cluster' label column that an earlier step may have added to the frame.
# errors='ignore' makes the drop a no-op for any column that is absent.
# NOTE(review): assumes 'CustomerID' (if present) is a row identifier - confirm.
features = customers_data.drop(columns=['CustomerID', 'Cluster'], errors='ignore')

# Keep the first two principal components so the result can be plotted in 2-D.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(features)

# Create KMeans model, fit on the reduced dataset
# random_state makes the result reproducible between runs; n_init is spelled
# out because its default differs between scikit-learn releases.
kmeans_pca = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans_pca.fit(reduced_data)
print("Cluster Centers:", kmeans_pca.cluster_centers_)
print("Labels:", kmeans_pca.labels_)

# Step 11: Scatter plot for reduced dimensions
# x/y are the two principal components; colour is the KMeans label.
sns.scatterplot(x=reduced_data[:, 0], y=reduced_data[:, 1], hue=kmeans_pca.labels_)
plt.show()

# Step 12: MeanShift clustering
from sklearn.cluster import MeanShift

# Construct the estimator with its default settings and fit it on the
# PCA-reduced data in one expression; fit() returns the fitted estimator.
meanshift = MeanShift().fit(reduced_data)
print("Cluster Centers:", meanshift.cluster_centers_)
print("Labels:", meanshift.labels_)

# Visualize MeanShift clusters
# First reduced column on x, second on y, coloured by MeanShift label.
sns.scatterplot(
    x=reduced_data[:, 0],
    y=reduced_data[:, 1],
    hue=meanshift.labels_,
)
plt.show()
