In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import kagglehub


In [17]:
try:
    path = kagglehub.dataset_download("nantonio/a-hotels-customers-dataset")
    print("Path to dataset files:", path)
except Exception as e:
    print("Error downloading dataset:", e)


Path to dataset files: /root/.cache/kagglehub/datasets/nantonio/a-hotels-customers-dataset/versions/2


In [19]:
import os

dataset_path = '/root/.cache/kagglehub/datasets/nantonio/a-hotels-customers-dataset/versions/2'
print("Files in the dataset folder:", os.listdir(dataset_path))


Files in the dataset folder: ['HotelCustomersDataset.xlsx']


In [36]:
import pandas as pd
file_path = '/root/.cache/kagglehub/datasets/nantonio/a-hotels-customers-dataset/versions/2/HotelCustomersDataset.xlsx'

data = pd.read_excel(file_path)
print(data.head())


   ID Nationality   Age  DaysSinceCreation  \
0   1         PRT  51.0                150   
1   2         PRT   NaN               1095   
2   3         DEU  31.0               1095   
3   4         FRA  60.0               1095   
4   5         FRA  51.0               1095   

                                            NameHash  \
0  0x8E0A7AF39B633D5EA25C3B7EF4DFC5464B36DB7AF375...   
1  0x21EDE41906B45079E75385B5AA33287CA09DE1AB86DE...   
2  0x31C5E4B74E23231295FDB724AD578C02C4A723F4BA2B...   
3  0xFF534C83C0EF23D1CE516BC80A65D0197003D27937D4...   
4  0x9C1DEF02C9BE242842C1C1ABF2C5AA249A1EEB4763B4...   

                                           DocIDHash  AverageLeadTime  \
0  0x71568459B729F7A7ABBED6C781A84CA4274D571003AC...               45   
1  0x5FA1E0098A31497057C5A6B9FE9D49FD6DD47CCE7C26...               61   
2  0xC7CF344F5B03295037595B1337AC905CA188F1B5B3A5...                0   
3  0xBD3823A9B4EC35D6CAF4B27AE423A677C0200DB61E82...               93   
4  0xE175754CF77247B2

In [40]:
data = data.dropna().reset_index(drop=True)

In [41]:
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
numerical_data = data[numerical_cols]


In [42]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numerical_data)

In [43]:
X_train, X_test = train_test_split(scaled_data, test_size=0.3, random_state=42)

In [44]:
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [45]:
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_train_pca)

In [46]:
train_clusters = kmeans.predict(X_train_pca)
test_clusters = kmeans.predict(X_test_pca)


Evaluation part


In [47]:
silhouette_train = silhouette_score(X_train_pca, train_clusters)
silhouette_test = silhouette_score(X_test_pca, test_clusters)

In [48]:
print("PCA Explained Variance Ratio:", pca.explained_variance_ratio_)
print("Train Silhouette Score:", silhouette_train)
print("Test Silhouette Score:", silhouette_test)


PCA Explained Variance Ratio: [0.19645091 0.09590271 0.05432959 0.04768757 0.04532201]
Train Silhouette Score: 0.24200393601960135
Test Silhouette Score: 0.23953077482916002


In [49]:
with open("clustering_report.txt", "w") as report:
    report.write("PCA Explained Variance Ratio:\n")
    report.write(str(pca.explained_variance_ratio_))
    report.write("\n\nTrain Silhouette Score: " + str(silhouette_train))
    report.write("\nTest Silhouette Score: " + str(silhouette_test))
