<a href="https://colab.research.google.com/github/codeRSH/AAG-DS/blob/main/AAG_Cluster_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cluster Analysis Using Python

## Initial Setup

### Mount Drive to Read files

In [274]:
from google.colab import drive

# Mount Google Drive at /content/drive; the data paths used later in the notebook live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import All Relevant Libraries

In [275]:
import glob
import pandas as pd
import numpy as np

### Make Plots show within the notebook output

In [276]:
import matplotlib as mp
import matplotlib.pyplot as plt
%matplotlib inline

### Ignore Warnings (for clean output)

In [277]:
import warnings
# Suppress every warning for the rest of the session so cell output stays clean.
# NOTE(review): this also hides pandas deprecation warnings; consider a narrower filter.
warnings.filterwarnings('ignore')

### Define File Paths

In [278]:
# Root folder of the capstone project on the mounted Google Drive.
init_input_path = "/content/drive/MyDrive/AAG DS Folder/Capstone/"

# Glob pattern for the cluster input files. The explicit dot restricts the match to
# names with a ".csv" extension; a bare "*csv" would also match names that merely
# end in the letters "csv".
input_cluster_path = init_input_path + "cluster/*.csv"

# Output folder (derived from the input root so the two cannot drift apart) and the
# target CSV path for the final analysis (presumably written later in the notebook).
init_output_path = init_input_path + "output/"
output_cluster_path = init_output_path + "cluster_final_analysis.csv"
# Backward-compatible alias: the misspelled name may still be referenced elsewhere.
ouptut_cluster_path = output_cluster_path

## Data Loading and Wrangling

### Read Cluster Input Data

In [279]:
# Names assigned to the 18 columns selected by `usecols` below, in file order.
cluster_columns = ['Guest_ID', 'Reservation_Count', 'Hotel_Count', 'Gender',
                   'Metro', 'Non_Metro', 'EARLY', 'LATE', 'MP', 'MEAL',
                   'ROOM', 'VAS', 'Country_Key', 'Stay_Days', 'Revenue_Per_Pax',
                   'Guest_Category', 'Lead_Days', 'Cluster']

# Resolve the glob once so the list that is printed is exactly the list that is read;
# sorting keeps the row order reproducible when several files match.
cluster_files = sorted(glob.glob(input_cluster_path))
print(cluster_files)

if not cluster_files:
    # Fail with a clear message instead of a column-length mismatch further down.
    raise FileNotFoundError("No cluster csv files found for pattern: " + input_cluster_path)

# Read every file first and concatenate once. Growing a DataFrame with
# DataFrame.append inside a loop re-copies all accumulated rows on each iteration,
# and append was deprecated in pandas 1.4 and removed in pandas 2.0.
# usecols keeps file columns 1-9 and 11-19 (columns 0 and 10 are skipped).
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0;
# replace it with on_bad_lines="skip" once the runtime uses pandas >= 1.3.
cluster_frames = [
    pd.read_csv(file, parse_dates=True, encoding="latin1",
                error_bad_lines=False, usecols=[*range(1, 10), *range(11, 20)])
    for file in cluster_files
]

# Input Cluster Data Frame
cluster_data = pd.concat(cluster_frames)
cluster_data.columns = cluster_columns

cluster_data

['/content/drive/MyDrive/AAG DS Folder/Capstone/cluster/kmeans_clusters.csv']


Unnamed: 0,Guest_ID,Reservation_Count,Hotel_Count,Gender,Metro,Non_Metro,EARLY,LATE,MP,MEAL,ROOM,VAS,Country_Key,Stay_Days,Revenue_Per_Pax,Guest_Category,Lead_Days,Cluster
0,39853,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,3275.774400,1.0,IN,1,4212.774400,Individual,1.000000,4
1,55125,4,1,Undisclosed,4,0,1.0,1.0,1.0,1.0,16078.568000,1.0,IN,2,9577.284000,Middle Management,2.750000,3
2,60873,3,2,Undisclosed,2,1,1.0,1.0,1.0,1.0,6137.915533,1.0,IN,6,20389.146600,Middle Management,2.000000,3
3,74138,8,1,Undisclosed,8,0,1.0,1.0,1.0,1.0,11308.820000,1.0,IN,1,4566.273333,Middle Management,0.666667,3
4,112568,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,5341.320000,1.0,IN,1,5822.320000,Individual,1.000000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7193,3757436,1,1,Male,1,0,1.0,1.0,1.0,1.0,3457.245000,1.0,IN,2,10890.880000,Family,319.000000,5
7194,3757460,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,3438.410000,1.0,IN,2,10603.005000,Family,319.000000,2
7195,3757466,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,3438.410000,1.0,IN,2,10603.005000,Family,319.000000,2
7196,3757480,1,1,Female,1,0,1.0,1.0,1.0,1.0,3438.410000,1.0,IN,2,10603.005000,Family,319.000000,2


In [280]:
# Lead_Days holds fractional values; round to the nearest whole day and store as int.
rounded_lead_days = cluster_data["Lead_Days"].round()
cluster_data["Lead_Days"] = rounded_lead_days.astype(int)
cluster_data

Unnamed: 0,Guest_ID,Reservation_Count,Hotel_Count,Gender,Metro,Non_Metro,EARLY,LATE,MP,MEAL,ROOM,VAS,Country_Key,Stay_Days,Revenue_Per_Pax,Guest_Category,Lead_Days,Cluster
0,39853,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,3275.774400,1.0,IN,1,4212.774400,Individual,1,4
1,55125,4,1,Undisclosed,4,0,1.0,1.0,1.0,1.0,16078.568000,1.0,IN,2,9577.284000,Middle Management,3,3
2,60873,3,2,Undisclosed,2,1,1.0,1.0,1.0,1.0,6137.915533,1.0,IN,6,20389.146600,Middle Management,2,3
3,74138,8,1,Undisclosed,8,0,1.0,1.0,1.0,1.0,11308.820000,1.0,IN,1,4566.273333,Middle Management,1,3
4,112568,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,5341.320000,1.0,IN,1,5822.320000,Individual,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7193,3757436,1,1,Male,1,0,1.0,1.0,1.0,1.0,3457.245000,1.0,IN,2,10890.880000,Family,319,5
7194,3757460,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,3438.410000,1.0,IN,2,10603.005000,Family,319,2
7195,3757466,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,3438.410000,1.0,IN,2,10603.005000,Family,319,2
7196,3757480,1,1,Female,1,0,1.0,1.0,1.0,1.0,3438.410000,1.0,IN,2,10603.005000,Family,319,2


In [281]:
# Determine the Number of guests
num_of_guests = cluster_data["Guest_ID"].nunique()
num_of_guests

7198

In [282]:
# Labels of the summary series: overall averages plus one slot per distribution bucket.
cluster_analysis_labels = [
    'Reservation_Count',
    'Reserv_Count_1', 'Reserv_Count_2', 'Reserv_Count_3', 'Reserv_Count_4',
    'Hotel_Count',
    'Hotel_Count_1', 'Hotel_Count_2', 'Hotel_Count_3', 'Hotel_Count_4',
    'Gender_M', 'Gender_F', 'Gender_Un',
    'Metro', 'Non_Metro', 'EARLY', 'LATE',
    'MP', 'MEAL', 'ROOM', 'VAS',
    'Country_IN', 'Country_Non_IN',
    'Stay_Days',
    'Stay_Days_0', 'Stay_Days_1', 'Stay_Days_2', 'Stay_Days_3',
    'Stay_Days_4', 'Stay_Days_5', 'Stay_Days_6',
    'Stay_Days_0avg', 'Stay_Days_1avg', 'Stay_Days_2avg', 'Stay_Days_3avg',
    'Stay_Days_4avg', 'Stay_Days_5avg', 'Stay_Days_6avg',
    'Revenue_Per_Pax',
    'Guest_Category_Ind', 'Guest_Category_Fam', 'Guest_Category_MM', 'Guest_Category_JM',
    'Lead_Days',
    'Lead_Days_0', 'Lead_Days_1', 'Lead_Days_2', 'Lead_Days_3', 'Lead_Days_4',
    'Lead_Days_5', 'Lead_Days_6', 'Lead_Days_7', 'Lead_Days_8',
    'Cluster',
]

# Every slot starts as NaN and is filled in by the cells that follow.
# The dtype is pinned to float64: pandas 1.x warns that the implicit default dtype of a
# Series built without data is changing to object, and all values stored here are numeric.
cluster_analysis = pd.Series(index=cluster_analysis_labels, dtype="float64")

In [283]:
# Sanity check: the summary series has one slot per label.
cluster_analysis.shape

(54,)

In [284]:
# Numeric columns for which an overall per-guest average is reported.
avg_columns = ['Reservation_Count', 'Hotel_Count',
               'Metro', 'Non_Metro', 'EARLY', 'LATE', 'MP', 'MEAL',
               'ROOM', 'VAS', 'Stay_Days', 'Revenue_Per_Pax',
               'Lead_Days']

# Column totals divided by the number of distinct guests, rounded to 2 decimals.
column_totals = cluster_data[avg_columns].sum()
avg_of_cluster_data = round(column_totals / num_of_guests, 2)
avg_of_cluster_data

Reservation_Count        1.57
Hotel_Count              1.03
Metro                    1.55
Non_Metro                0.02
EARLY                    1.00
LATE                     1.00
MP                       1.00
MEAL                     1.00
ROOM                  5754.48
VAS                      1.00
Stay_Days                2.29
Revenue_Per_Pax      10916.55
Lead_Days               11.96
dtype: float64

In [285]:
# Copy each overall average into the slot with the same label in the summary series.
for label, value in avg_of_cluster_data.items():
    cluster_analysis[label] = value

In [286]:
def get_freq_dist(col_name, main_df, last_count, num_total_records, analysis_df, init_label):
    """Store the percentage distribution of one column in the analysis series.

    The values of ``main_df[col_name]`` are counted, every value greater than or
    equal to ``last_count`` is folded into one open-ended bucket labelled
    ``last_count``, and each bucket's share of ``num_total_records`` (rounded to
    4 decimals, then scaled to percent) is written to ``analysis_df`` under the
    label ``init_label + str(value)``.

    Parameters
    ----------
    col_name : label of the column in ``main_df`` to analyse.
    main_df : DataFrame holding that column; it is not modified.
    last_count : threshold value; it and everything above it form the last bucket.
        The last bucket is always written, with share 0 when nothing reaches it.
    num_total_records : denominator used to turn counts into shares.
    analysis_df : Series-like target; it is updated in place.
    init_label : prefix prepended to each bucket value to build the target label.

    Returns
    -------
    The same ``analysis_df`` object that was passed in. Values below
    ``last_count`` that never occur in the column get no entry written.
    """
    counts = main_df[col_name].value_counts().sort_index()

    # Split on the threshold by value instead of by label slicing. When no record
    # equals last_count exactly, assigning counts[last_count] appends that label
    # after the larger values, and a subsequent .loc[:last_count] slice then keeps
    # those larger values as separate buckets, counting them twice.
    is_tail = counts.index >= last_count
    dist = counts[~is_tail].copy()
    dist.loc[last_count] = counts[is_tail].sum()

    dist_share = round(dist / num_total_records, 4) * 100

    for value, share in dist_share.items():
        analysis_df[init_label + str(value)] = share

    return analysis_df

In [287]:

# Share of guests per reservation count; counts of 4 and above share the last bucket.
cluster_analysis = get_freq_dist(
    col_name="Reservation_Count",
    main_df=cluster_data,
    last_count=4,
    num_total_records=num_of_guests,
    analysis_df=cluster_analysis,
    init_label='Reserv_Count_',
)

In [288]:
# Share of guests per hotel count; counts of 4 and above share the last bucket.
cluster_analysis = get_freq_dist(
    col_name="Hotel_Count",
    main_df=cluster_data,
    last_count=4,
    num_total_records=num_of_guests,
    analysis_df=cluster_analysis,
    init_label='Hotel_Count_',
)

In [289]:
# Share of guests per stay length in days; 6 days and above share the last bucket.
cluster_analysis = get_freq_dist(
    col_name="Stay_Days",
    main_df=cluster_data,
    last_count=6,
    num_total_records=num_of_guests,
    analysis_df=cluster_analysis,
    init_label='Stay_Days_',
)

In [290]:
# Share of guests per lead time in days; 8 days and above share the last bucket.
cluster_analysis = get_freq_dist(
    col_name="Lead_Days",
    main_df=cluster_data,
    last_count=8,
    num_total_records=num_of_guests,
    analysis_df=cluster_analysis,
    init_label='Lead_Days_',
)

cluster_analysis

Reservation_Count         1.57
Reserv_Count_1           75.90
Reserv_Count_2           12.96
Reserv_Count_3            4.74
Reserv_Count_4            6.40
Hotel_Count               1.03
Hotel_Count_1            97.25
Hotel_Count_2             2.32
Hotel_Count_3             0.31
Hotel_Count_4             0.13
Gender_M                   NaN
Gender_F                   NaN
Gender_Un                  NaN
Metro                     1.55
Non_Metro                 0.02
EARLY                     1.00
LATE                      1.00
MP                        1.00
MEAL                      1.00
ROOM                   5754.48
VAS                       1.00
Country_IN                 NaN
Country_Non_IN             NaN
Stay_Days                 2.29
Stay_Days_0                NaN
Stay_Days_1              47.93
Stay_Days_2              27.87
Stay_Days_3              11.70
Stay_Days_4               4.14
Stay_Days_5               2.20
Stay_Days_6               6.17
Stay_Days_0avg             NaN
Stay_Day

In [291]:
# Number of non-null Lead_Days values (Series.count skips NaN).
cluster_data["Lead_Days"].count()

7198

In [292]:
# Collapse the country code into the two labels used by the summary series.
# Already-relabelled values are accepted too, so re-running this cell is harmless:
# after the first run no value equals "IN", and a plain `x != "IN"` test would
# turn every row into "Country_Non_IN".
is_country_in = cluster_data["Country_Key"].isin(["IN", "Country_IN"])
cluster_data["Country_Key"] = np.where(is_country_in, "Country_IN", "Country_Non_IN")
country_count_data = cluster_data["Country_Key"].value_counts()

print(country_count_data)

# Share of guests per label, in percent (rounded to 4 decimals before scaling).
country_share_data = round(country_count_data / num_of_guests, 4) * 100

for label, share in country_share_data.items():
    cluster_analysis[label] = share

cluster_analysis

Country_IN        6898
Country_Non_IN     300
Name: Country_Key, dtype: int64


Reservation_Count         1.57
Reserv_Count_1           75.90
Reserv_Count_2           12.96
Reserv_Count_3            4.74
Reserv_Count_4            6.40
Hotel_Count               1.03
Hotel_Count_1            97.25
Hotel_Count_2             2.32
Hotel_Count_3             0.31
Hotel_Count_4             0.13
Gender_M                   NaN
Gender_F                   NaN
Gender_Un                  NaN
Metro                     1.55
Non_Metro                 0.02
EARLY                     1.00
LATE                      1.00
MP                        1.00
MEAL                      1.00
ROOM                   5754.48
VAS                       1.00
Country_IN               95.83
Country_Non_IN            4.17
Stay_Days                 2.29
Stay_Days_0                NaN
Stay_Days_1              47.93
Stay_Days_2              27.87
Stay_Days_3              11.70
Stay_Days_4               4.14
Stay_Days_5               2.20
Stay_Days_6               6.17
Stay_Days_0avg             NaN
Stay_Day

In [293]:
# Map the raw gender values onto the labels used by the summary series.
# The labels map to themselves so that re-running this cell keeps the result:
# after the first run no value equals "Male"/"Female", and without the identity
# entries every row would fall through to "Gender_Un".
gender_labels = {
    "Male": "Gender_M",
    "Female": "Gender_F",
    "Gender_M": "Gender_M",
    "Gender_F": "Gender_F",
}
# Every other value (including missing ones) is reported as "Gender_Un".
cluster_data["Gender"] = cluster_data["Gender"].map(gender_labels).fillna("Gender_Un")
gender_count_data = cluster_data["Gender"].value_counts()
print(gender_count_data)

# Share of guests per label, in percent (rounded to 4 decimals before scaling).
gender_share_data = round(gender_count_data / num_of_guests, 4) * 100

for label, share in gender_share_data.items():
    cluster_analysis[label] = share

cluster_analysis

Gender_M     4745
Gender_Un    1825
Gender_F      628
Name: Gender, dtype: int64


Reservation_Count         1.57
Reserv_Count_1           75.90
Reserv_Count_2           12.96
Reserv_Count_3            4.74
Reserv_Count_4            6.40
Hotel_Count               1.03
Hotel_Count_1            97.25
Hotel_Count_2             2.32
Hotel_Count_3             0.31
Hotel_Count_4             0.13
Gender_M                 65.92
Gender_F                  8.72
Gender_Un                25.35
Metro                     1.55
Non_Metro                 0.02
EARLY                     1.00
LATE                      1.00
MP                        1.00
MEAL                      1.00
ROOM                   5754.48
VAS                       1.00
Country_IN               95.83
Country_Non_IN            4.17
Stay_Days                 2.29
Stay_Days_0                NaN
Stay_Days_1              47.93
Stay_Days_2              27.87
Stay_Days_3              11.70
Stay_Days_4               4.14
Stay_Days_5               2.20
Stay_Days_6               6.17
Stay_Days_0avg             NaN
Stay_Day

In [294]:
# Map the raw guest categories onto the labels used by the summary series.
# The labels map to themselves so that re-running this cell keeps the result:
# after the first run no value equals a raw category name, and without the
# identity entries every row would fall through to 'Guest_Category_JM'.
guest_category_labels = {
    "Individual": 'Guest_Category_Ind',
    "Family": 'Guest_Category_Fam',
    "Middle Management": 'Guest_Category_MM',
    'Guest_Category_Ind': 'Guest_Category_Ind',
    'Guest_Category_Fam': 'Guest_Category_Fam',
    'Guest_Category_MM': 'Guest_Category_MM',
    'Guest_Category_JM': 'Guest_Category_JM',
}
# Every other value (including missing ones) is reported as 'Guest_Category_JM'.
# NOTE(review): confirm that this catch-all is intended for unexpected categories.
cluster_data["Guest_Category"] = (
    cluster_data["Guest_Category"].map(guest_category_labels).fillna('Guest_Category_JM')
)
guest_cat_count_data = cluster_data["Guest_Category"].value_counts()
print(guest_cat_count_data)

# Share of guests per label, in percent (rounded to 4 decimals before scaling).
guest_cat_share_data = round(guest_cat_count_data / num_of_guests, 4) * 100

for label, share in guest_cat_share_data.items():
    cluster_analysis[label] = share

cluster_analysis


# cluster_data["Guest_Category"].value_counts()

Guest_Category_MM     3208
Guest_Category_Ind    2182
Guest_Category_Fam    1388
Guest_Category_JM      420
Name: Guest_Category, dtype: int64


Reservation_Count         1.57
Reserv_Count_1           75.90
Reserv_Count_2           12.96
Reserv_Count_3            4.74
Reserv_Count_4            6.40
Hotel_Count               1.03
Hotel_Count_1            97.25
Hotel_Count_2             2.32
Hotel_Count_3             0.31
Hotel_Count_4             0.13
Gender_M                 65.92
Gender_F                  8.72
Gender_Un                25.35
Metro                     1.55
Non_Metro                 0.02
EARLY                     1.00
LATE                      1.00
MP                        1.00
MEAL                      1.00
ROOM                   5754.48
VAS                       1.00
Country_IN               95.83
Country_Non_IN            4.17
Stay_Days                 2.29
Stay_Days_0                NaN
Stay_Days_1              47.93
Stay_Days_2              27.87
Stay_Days_3              11.70
Stay_Days_4               4.14
Stay_Days_5               2.20
Stay_Days_6               6.17
Stay_Days_0avg             NaN
Stay_Day