<a href="https://colab.research.google.com/github/codeRSH/AAG-DS/blob/main/AAG_Cluster_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cluster Analysis Using Python

## Initial Setup

### Mount Drive to Read files

In [295]:
from google.colab import drive

# Mount Google Drive at /content/drive; the file paths defined later in this
# notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import All Relevant Libraries

In [296]:
import glob
import pandas as pd
import numpy as np

### Make Plots show within the notebook output

In [297]:
import matplotlib as mp
import matplotlib.pyplot as plt
%matplotlib inline

### Ignore Warnings (for clean output)

In [298]:
import warnings
# Silence every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides library deprecation warnings —
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Define File Paths

In [299]:
# Root folder on the mounted Drive that holds the capstone inputs and outputs.
init_input_path = "/content/drive/MyDrive/AAG DS Folder/Capstone/"

# Glob pattern for the cluster CSV files. The explicit dot restricts matches to
# the ".csv" extension instead of any file name that merely ends in "csv".
input_cluster_path = init_input_path + "cluster/*.csv"

# Outputs go to the "output" sub-folder of the same root (derived rather than
# repeated so the two locations cannot drift apart).
init_output_path = init_input_path + "output/"
output_cluster_path = init_output_path + "cluster_final_analysis.csv"

# Backward-compatible alias: keep the original (misspelled) name available for
# any cell that still refers to it.
ouptut_cluster_path = output_cluster_path

## Data Loading and Wrangling

### Read Cluster Input Data

In [316]:
# Resolve the input files once; sorting gives a stable row order across runs
# because glob returns matches in arbitrary order.
cluster_files = sorted(glob.glob(input_cluster_path))
print(cluster_files)

if not cluster_files:
    # Fail early with a clear message instead of a column-count mismatch below.
    raise FileNotFoundError(f"No cluster CSV files match {input_cluster_path!r}")

# Read every file into its own frame and concatenate once at the end
# (growing a DataFrame inside the loop copies all previous rows each time,
# and DataFrame.append no longer exists in pandas >= 2.0).
# usecols keeps file columns 1-9 and 11-19, i.e. it skips columns 0 and 10.
# on_bad_lines="warn" skips malformed rows with a warning; it replaces the
# removed error_bad_lines=False option.
cluster_frames = [
    pd.read_csv(
        path,
        parse_dates=True,
        encoding="latin1",
        on_bad_lines="warn",
        usecols=[*range(1, 10), *range(11, 20)],
    )
    for path in cluster_files
]
cluster_data = pd.concat(cluster_frames)

# Rename the 18 selected columns positionally to the names used in the rest
# of the notebook.
cluster_data.columns = [
    'Guest_ID', 'Reservation_Count', 'Hotel_Count', 'Gender',
    'Metro', 'Non_Metro', 'EARLY', 'LATE', 'MP', 'MEAL',
    'ROOM', 'VAS', 'Country_Key', 'Stay_Days', 'Revenue_Per_Pax',
    'Guest_Category', 'Lead_Days', 'Cluster',
]

cluster_data

['/content/drive/MyDrive/AAG DS Folder/Capstone/cluster/Cluster_Data.csv']


Unnamed: 0,Guest_ID,Reservation_Count,Hotel_Count,Gender,Metro,Non_Metro,EARLY,LATE,MP,MEAL,ROOM,VAS,Country_Key,Stay_Days,Revenue_Per_Pax,Guest_Category,Lead_Days,Cluster
0,3,1,1,Undisclosed,0,1,0.0,0.0,0.0,1.0,7.0800,0.0,IN,1,70.0800,Individual,0.0,2
1,26,1,1,Male,0,1,0.0,0.0,0.0,1.0,3113.9020,0.0,IN,14,65789.6280,Middle Management,1.0,5
2,33,2,1,Male,0,2,0.0,0.0,0.0,1.0,0.0000,0.0,IN,1,315.0000,Individual,0.0,1
3,51,1,1,Undisclosed,1,0,0.0,0.0,0.0,1.0,4982.8096,0.0,IN,3,8986.2144,Junior Management,4.0,2
4,133,1,1,Male,0,1,0.0,0.0,0.0,0.0,0.0000,1.0,IN,1,241.9000,Individual,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455818,90450512,1,1,Undisclosed,0,1,0.0,0.0,0.0,0.0,5437.4400,0.0,IN,1,9437.4400,Middle Management,4.0,4
455819,90450817,8,1,Undisclosed,0,8,0.0,0.0,0.0,0.0,27499.2800,0.0,IN,1,3437.4100,Middle Management,0.0,4
455820,90450995,2,1,Undisclosed,0,2,0.0,0.0,0.0,1.0,7770.0000,0.0,IN,1,7022.1500,Individual,1.5,2
455821,90451050,1,1,Undisclosed,0,1,0.0,0.0,0.0,1.0,0.0000,0.0,IN,1,1.0500,Middle Management,0.0,4


In [317]:
# Store lead time as whole days: round to 0 decimals, then cast to integer
# so the values can be used as discrete frequency-distribution labels.
cluster_data["Lead_Days"] = (
    cluster_data["Lead_Days"]
    .round(decimals=0)
    .astype(int)
)
cluster_data

Unnamed: 0,Guest_ID,Reservation_Count,Hotel_Count,Gender,Metro,Non_Metro,EARLY,LATE,MP,MEAL,ROOM,VAS,Country_Key,Stay_Days,Revenue_Per_Pax,Guest_Category,Lead_Days,Cluster
0,3,1,1,Undisclosed,0,1,0.0,0.0,0.0,1.0,7.0800,0.0,IN,1,70.0800,Individual,0,2
1,26,1,1,Male,0,1,0.0,0.0,0.0,1.0,3113.9020,0.0,IN,14,65789.6280,Middle Management,1,5
2,33,2,1,Male,0,2,0.0,0.0,0.0,1.0,0.0000,0.0,IN,1,315.0000,Individual,0,1
3,51,1,1,Undisclosed,1,0,0.0,0.0,0.0,1.0,4982.8096,0.0,IN,3,8986.2144,Junior Management,4,2
4,133,1,1,Male,0,1,0.0,0.0,0.0,0.0,0.0000,1.0,IN,1,241.9000,Individual,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455818,90450512,1,1,Undisclosed,0,1,0.0,0.0,0.0,0.0,5437.4400,0.0,IN,1,9437.4400,Middle Management,4,4
455819,90450817,8,1,Undisclosed,0,8,0.0,0.0,0.0,0.0,27499.2800,0.0,IN,1,3437.4100,Middle Management,0,4
455820,90450995,2,1,Undisclosed,0,2,0.0,0.0,0.0,1.0,7770.0000,0.0,IN,1,7022.1500,Individual,2,2
455821,90451050,1,1,Undisclosed,0,1,0.0,0.0,0.0,1.0,0.0000,0.0,IN,1,1.0500,Middle Management,0,4


In [318]:
# Number of distinct Guest_ID values (missing IDs are not counted); this is
# the denominator used by the averages and percentage shares computed later.
num_of_guests = cluster_data["Guest_ID"].nunique(dropna=True)
num_of_guests

455823

In [319]:
# Result vector for the analysis: one all-NaN slot per reported metric, filled
# in by the cells that follow. The dtype is pinned to float64 because the
# default dtype of a data-less Series differs between pandas versions
# (float64 before 2.0, object from 2.0 on).
cluster_analysis = pd.Series(
    index=[
        # Reservation and hotel counts: overall figure plus per-count buckets
        'Reservation_Count',
        'Reserv_Count_1', 'Reserv_Count_2', 'Reserv_Count_3', 'Reserv_Count_4',
        'Hotel_Count',
        'Hotel_Count_1', 'Hotel_Count_2', 'Hotel_Count_3', 'Hotel_Count_4',
        # Gender labels
        'Gender_M', 'Gender_F', 'Gender_Un',
        'Metro', 'Non_Metro', 'EARLY', 'LATE',
        'MP', 'MEAL', 'ROOM', 'VAS',
        # Country labels
        'Country_IN', 'Country_Non_IN',
        # Stay length: overall figure plus per-day buckets
        'Stay_Days', 'Stay_Days_0',
        'Stay_Days_1', 'Stay_Days_2', 'Stay_Days_3',
        'Stay_Days_4', 'Stay_Days_5', 'Stay_Days_6',
        'Revenue_Per_Pax',
        # Guest category labels
        'Guest_Category_Ind', 'Guest_Category_Fam',
        'Guest_Category_MM', 'Guest_Category_JM',
        # Lead time: overall figure plus per-day buckets
        'Lead_Days', 'Lead_Days_0', 'Lead_Days_1',
        'Lead_Days_2', 'Lead_Days_3', 'Lead_Days_4',
        'Lead_Days_5', 'Lead_Days_6', 'Lead_Days_7',
        'Lead_Days_8',
        'Cluster',
    ],
    dtype="float64",
)

In [320]:
# Sanity check: the Series has one slot per label in its index.
cluster_analysis.shape

(47,)

In [321]:
# Per-guest averages: each column total divided by the number of distinct
# guests, rounded to two decimals.
avg_of_cluster_data = (
    cluster_data[[
        'Reservation_Count',
        'Hotel_Count',
        'ROOM',
        'Stay_Days',
        'Revenue_Per_Pax',
        'Lead_Days',
    ]]
    .sum()
    .div(num_of_guests)
    .round(2)
)
avg_of_cluster_data

Reservation_Count       1.41
Hotel_Count             1.05
Metro                   0.76
Non_Metro               0.65
EARLY                   0.07
LATE                    0.05
MP                      0.05
MEAL                    0.51
ROOM                 4691.51
VAS                     0.13
Stay_Days               2.06
Revenue_Per_Pax      6756.43
Lead_Days               9.16
dtype: float64

In [322]:
# Copy each average into the slot of the same name in the result vector.
for metric_name, metric_value in avg_of_cluster_data.items():
    cluster_analysis[metric_name] = metric_value

In [None]:
# Column totals of EARLY / LATE / MP / MEAL / VAS relative to the number of
# distinct guests, expressed as a percentage (ratio rounded to 4 decimals,
# then scaled by 100). The original line had an unbalanced parenthesis and
# could not be executed.
percentage_of_cluster_data = round(
    cluster_data[['EARLY', 'LATE', 'MP', 'MEAL', 'VAS']].sum() / num_of_guests,
    4,
) * 100
percentage_of_cluster_data

In [None]:
# Copy each percentage into the slot of the same name in the result vector.
for metric_name, metric_value in percentage_of_cluster_data.items():
    cluster_analysis[metric_name] = metric_value

In [323]:
def get_freq_dist(col_name, main_df, last_count, num_total_records, analysis_df, init_label):
    """Write the percentage frequency distribution of one column into ``analysis_df``.

    Distinct values of ``main_df[col_name]`` that are smaller than ``last_count``
    each get their own entry; every value greater than or equal to
    ``last_count`` is pooled into a single ``last_count`` bucket. Each count is
    divided by ``num_total_records``, rounded to 4 decimals and scaled by 100.

    Parameters
    ----------
    col_name : label of the column in ``main_df`` to summarise.
    main_df : DataFrame holding the column; it is not modified.
    last_count : threshold value; it and everything above it form the last bucket.
    num_total_records : denominator used to turn counts into shares.
    analysis_df : Series that receives the results. It is updated in place
        under the labels ``init_label + str(value)``.
    init_label : prefix for the result labels.

    Returns
    -------
    The same ``analysis_df`` object that was passed in.
    """
    # Occurrences per distinct value (missing values are ignored by value_counts),
    # ordered by the value itself.
    counts = main_df[col_name].value_counts().sort_index()

    # Split by comparing index values rather than label-slicing: label slices
    # misbehave when ``last_count`` itself never occurs in the data, because the
    # bucket label would then be appended after larger values.
    below_threshold = counts.index < last_count
    bucketed = counts.loc[below_threshold].copy()
    bucketed.loc[last_count] = counts.loc[~below_threshold].sum()

    shares = round(bucketed / num_total_records, 4) * 100

    for value, share in shares.items():
        analysis_df[init_label + str(value)] = share

    return analysis_df

In [324]:

# Share of guests per reservation count; counts of 4 and above share one bucket.
cluster_analysis = get_freq_dist(
    col_name="Reservation_Count",
    main_df=cluster_data,
    last_count=4,
    num_total_records=num_of_guests,
    analysis_df=cluster_analysis,
    init_label='Reserv_Count_',
)

# cluster_analysis

In [325]:
# Share of guests per hotel count; counts of 4 and above share one bucket.
cluster_analysis = get_freq_dist(
    col_name="Hotel_Count",
    main_df=cluster_data,
    last_count=4,
    num_total_records=num_of_guests,
    analysis_df=cluster_analysis,
    init_label='Hotel_Count_',
)


# cluster_analysis

In [326]:
# Share of guests per Stay_Days value; values of 6 and above share one bucket.
cluster_analysis = get_freq_dist(
    col_name="Stay_Days",
    main_df=cluster_data,
    last_count=6,
    num_total_records=num_of_guests,
    analysis_df=cluster_analysis,
    init_label='Stay_Days_',
)


# cluster_analysis

In [327]:
# Share of guests per Lead_Days value; values of 8 and above share one bucket.
cluster_analysis = get_freq_dist(
    col_name="Lead_Days",
    main_df=cluster_data,
    last_count=8,
    num_total_records=num_of_guests,
    analysis_df=cluster_analysis,
    init_label='Lead_Days_',
)

cluster_analysis

Reservation_Count        1.41
Reserv_Count_1          81.36
Reserv_Count_2          11.58
Reserv_Count_3           3.30
Reserv_Count_4           3.75
Hotel_Count              1.05
Hotel_Count_1           95.86
Hotel_Count_2            3.61
Hotel_Count_3            0.41
Hotel_Count_4            0.12
Gender_M                  NaN
Gender_F                  NaN
Gender_Un                 NaN
Metro                    0.76
Non_Metro                0.65
EARLY                    0.07
LATE                     0.05
MP                       0.05
MEAL                     0.51
ROOM                  4691.51
VAS                      0.13
Country_IN                NaN
Country_Non_IN            NaN
Stay_Days                2.06
Stay_Days_0               NaN
Stay_Days_1             59.82
Stay_Days_2             21.77
Stay_Days_3              8.13
Stay_Days_4              3.51
Stay_Days_5              1.82
Stay_Days_6              4.94
Revenue_Per_Pax       6756.43
Guest_Category_Ind        NaN
Guest_Cate

In [328]:
# Number of non-missing Lead_Days entries (Series.count ignores NaN).
cluster_data["Lead_Days"].count()

455823

In [329]:
# Recode Country_Key in place to the two labels used as cluster_analysis rows:
# "IN" becomes "Country_IN", anything else (including missing values) becomes
# "Country_Non_IN". The already-recoded "Country_IN" label is accepted too, so
# re-running this cell does not turn every row into "Country_Non_IN".
has_in_key = cluster_data["Country_Key"].isin(["IN", "Country_IN"])
cluster_data["Country_Key"] = has_in_key.map({True: "Country_IN", False: "Country_Non_IN"})
country_count_data = cluster_data["Country_Key"].value_counts()

print(country_count_data)

# Share of each label relative to the distinct-guest count, in percent.
country_share_data = round(country_count_data / num_of_guests, 4) * 100

# Store each share under its own label in the result vector.
for country_label, country_share in country_share_data.items():
    cluster_analysis[country_label] = country_share

cluster_analysis

Country_IN        443317
Country_Non_IN     12506
Name: Country_Key, dtype: int64


Reservation_Count        1.41
Reserv_Count_1          81.36
Reserv_Count_2          11.58
Reserv_Count_3           3.30
Reserv_Count_4           3.75
Hotel_Count              1.05
Hotel_Count_1           95.86
Hotel_Count_2            3.61
Hotel_Count_3            0.41
Hotel_Count_4            0.12
Gender_M                  NaN
Gender_F                  NaN
Gender_Un                 NaN
Metro                    0.76
Non_Metro                0.65
EARLY                    0.07
LATE                     0.05
MP                       0.05
MEAL                     0.51
ROOM                  4691.51
VAS                      0.13
Country_IN              97.26
Country_Non_IN           2.74
Stay_Days                2.06
Stay_Days_0               NaN
Stay_Days_1             59.82
Stay_Days_2             21.77
Stay_Days_3              8.13
Stay_Days_4              3.51
Stay_Days_5              1.82
Stay_Days_6              4.94
Revenue_Per_Pax       6756.43
Guest_Category_Ind        NaN
Guest_Cate

In [330]:
# Recode Gender in place to the labels used as cluster_analysis rows. The
# already-recoded labels map to themselves, so re-running this cell does not
# turn every row into "Gender_Un". Any other value, including missing ones,
# is labelled "Gender_Un".
gender_labels = {
    "Male": "Gender_M",
    "Female": "Gender_F",
    "Gender_M": "Gender_M",
    "Gender_F": "Gender_F",
}
cluster_data["Gender"] = cluster_data["Gender"].map(gender_labels).fillna("Gender_Un")
gender_count_data = cluster_data["Gender"].value_counts()
print(gender_count_data)

# Share of each label relative to the distinct-guest count, in percent.
gender_share_data = round(gender_count_data / num_of_guests, 4) * 100

# Store each share under its own label in the result vector.
for gender_label, gender_share in gender_share_data.items():
    cluster_analysis[gender_label] = gender_share

cluster_analysis

Gender_M     251931
Gender_Un    164270
Gender_F      39622
Name: Gender, dtype: int64


Reservation_Count        1.41
Reserv_Count_1          81.36
Reserv_Count_2          11.58
Reserv_Count_3           3.30
Reserv_Count_4           3.75
Hotel_Count              1.05
Hotel_Count_1           95.86
Hotel_Count_2            3.61
Hotel_Count_3            0.41
Hotel_Count_4            0.12
Gender_M                55.27
Gender_F                 8.69
Gender_Un               36.04
Metro                    0.76
Non_Metro                0.65
EARLY                    0.07
LATE                     0.05
MP                       0.05
MEAL                     0.51
ROOM                  4691.51
VAS                      0.13
Country_IN              97.26
Country_Non_IN           2.74
Stay_Days                2.06
Stay_Days_0               NaN
Stay_Days_1             59.82
Stay_Days_2             21.77
Stay_Days_3              8.13
Stay_Days_4              3.51
Stay_Days_5              1.82
Stay_Days_6              4.94
Revenue_Per_Pax       6756.43
Guest_Category_Ind        NaN
Guest_Cate

In [331]:
# Recode Guest_Category in place to the labels used as cluster_analysis rows.
# The already-recoded labels map to themselves, so re-running this cell does
# not turn every row into "Guest_Category_JM". Any other value, including
# missing ones, is labelled "Guest_Category_JM".
guest_category_labels = {
    "Individual": 'Guest_Category_Ind',
    "Family": 'Guest_Category_Fam',
    "Middle Management": 'Guest_Category_MM',
    'Guest_Category_Ind': 'Guest_Category_Ind',
    'Guest_Category_Fam': 'Guest_Category_Fam',
    'Guest_Category_MM': 'Guest_Category_MM',
}
cluster_data["Guest_Category"] = (
    cluster_data["Guest_Category"].map(guest_category_labels).fillna('Guest_Category_JM')
)
guest_cat_count_data = cluster_data["Guest_Category"].value_counts()

# Share of each label relative to the distinct-guest count, in percent.
guest_cat_share_data = round(guest_cat_count_data / num_of_guests, 4) * 100

# Store each share under its own label in the result vector.
for guest_cat_label, guest_cat_share in guest_cat_share_data.items():
    cluster_analysis[guest_cat_label] = guest_cat_share

cluster_analysis


# cluster_data["Guest_Category"].value_counts()

Reservation_Count        1.41
Reserv_Count_1          81.36
Reserv_Count_2          11.58
Reserv_Count_3           3.30
Reserv_Count_4           3.75
Hotel_Count              1.05
Hotel_Count_1           95.86
Hotel_Count_2            3.61
Hotel_Count_3            0.41
Hotel_Count_4            0.12
Gender_M                55.27
Gender_F                 8.69
Gender_Un               36.04
Metro                    0.76
Non_Metro                0.65
EARLY                    0.07
LATE                     0.05
MP                       0.05
MEAL                     0.51
ROOM                  4691.51
VAS                      0.13
Country_IN              97.26
Country_Non_IN           2.74
Stay_Days                2.06
Stay_Days_0               NaN
Stay_Days_1             59.82
Stay_Days_2             21.77
Stay_Days_3              8.13
Stay_Days_4              3.51
Stay_Days_5              1.82
Stay_Days_6              4.94
Revenue_Per_Pax       6756.43
Guest_Category_Ind      26.13
Guest_Cate