<a href="https://colab.research.google.com/github/codeRSH/AAG-DS/blob/main/AAG_Cluster_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cluster Analysis Using Python

## Initial Setup

### Mount Drive to Read files

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


### Import All Relevant Libraries

In [2]:
import glob
import pandas as pd
import numpy as np

### Make Plots show within the notebook output

In [3]:
import matplotlib as mp
import matplotlib.pyplot as plt
%matplotlib inline

### Ignore Warnings (for clean output)

In [4]:
import warnings
warnings.filterwarnings('ignore')

### Define File Paths

In [5]:
init_input_path = "/content/drive/MyDrive/AAG DS Folder/Capstone/"

input_cluster_path = init_input_path + "cluster/*csv"

init_output_path = "/content/drive/MyDrive/AAG DS Folder/Capstone/output/"
ouptut_cluster_path = init_output_path + "cluster_final_analysis.csv"

## Data Loading and Wrangling

### Read Cluster Input Data

In [28]:
# Input Cluster Data Frame
cluster_data = pd.DataFrame()

print(glob.glob(input_cluster_path))

# Read the Cluster data from all the available csv files
for file in glob.glob(input_cluster_path):
    data = pd.read_csv(file, parse_dates=True, encoding="latin1",
                       error_bad_lines=False, usecols = [*range(1, 10), *range(11, 20)] )
    cluster_data = cluster_data.append(data)

cluster_data.columns = ['Guest_ID', 'Reservation_Count', 'Hotel_Count', 'Gender', 
                     'Metro', 'Non_Metro', 'EARLY', 'LATE', 'MP', 'MEAL',	
                     'ROOM', 'VAS', 'Country_Key', 'Stay_Days', 'Revenue_Per_Pax',	
                     'Guest_Category', 'Lead_Days',	'Cluster' ]

cluster_data

['/content/drive/MyDrive/AAG DS Folder/Capstone/cluster/kmeans_clusters.csv']


Unnamed: 0,Guest_ID,Reservation_Count,Hotel_Count,Gender,Metro,Non_Metro,EARLY,LATE,MP,MEAL,ROOM,VAS,Country_Key,Stay_Days,Revenue_Per_Pax,Guest_Category,Lead_Days,Cluster
0,39853,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,3275.774400,1.0,IN,1,4212.774400,Individual,1.000000,4
1,55125,4,1,Undisclosed,4,0,1.0,1.0,1.0,1.0,16078.568000,1.0,IN,2,9577.284000,Middle Management,2.750000,3
2,60873,3,2,Undisclosed,2,1,1.0,1.0,1.0,1.0,6137.915533,1.0,IN,6,20389.146600,Middle Management,2.000000,3
3,74138,8,1,Undisclosed,8,0,1.0,1.0,1.0,1.0,11308.820000,1.0,IN,1,4566.273333,Middle Management,0.666667,3
4,112568,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,5341.320000,1.0,IN,1,5822.320000,Individual,1.000000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7193,3757436,1,1,Male,1,0,1.0,1.0,1.0,1.0,3457.245000,1.0,IN,2,10890.880000,Family,319.000000,5
7194,3757460,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,3438.410000,1.0,IN,2,10603.005000,Family,319.000000,2
7195,3757466,1,1,Undisclosed,1,0,1.0,1.0,1.0,1.0,3438.410000,1.0,IN,2,10603.005000,Family,319.000000,2
7196,3757480,1,1,Female,1,0,1.0,1.0,1.0,1.0,3438.410000,1.0,IN,2,10603.005000,Family,319.000000,2


In [29]:
# Determine the Number of guests
num_of_guests = cluster_data["Guest_ID"].nunique()
num_of_guests

7198

In [30]:
round(cluster_data["Reservation_Count"].sum() / num_of_guests, 2)

1.57

In [44]:
cluster_analysis = pd.Series(index = ['Reservation_Count', 
                                           'Reserv_Count_1', 'Reserv_Count_2',
                                           'Reserv_Count_3', 'Reserv_Count_3plus', 
                                           'Hotel_Count', 
                                           'Hotel_Count_1', 'Hotel_Count_2',
                                           'Hotel_Count_3','Hotel_Count_3plus',
                                           'Gender_M', 'Gender_F', 'Gender_Un',
                                           'Metro', 'Non_Metro', 'EARLY', 'LATE', 
                                           'MP', 'MEAL', 'ROOM', 'VAS', 'Country_IN',
                                           'Country_Non_IN', 'Stay_Days', 'Stay_Days_0',
                                           'Stay_Days_1', 'Stay_Days_2', 'Stay_Days_3',
                                           'Stay_Days_4', 'Stay_Days_5', 'Stay_Days_5plus',
                                           'Stay_Days_0avg', 'Stay_Days_1avg', 'Stay_Days_2avg', 
                                           'Stay_Days_3avg', 'Stay_Days_4avg', 'Stay_Days_5avg', 
                                           'Stay_Days_5plusavg', 'Revenue_Per_Pax',	'Guest_Category_Ind',
                                           'Guest_Category_Fam', 'Guest_Category_Smg', 'Guest_Category_Mmg', 
                                           'Lead_Days',  'Lead_Days_0',  'Lead_Days_1',	
                                           'Lead_Days_2',	 'Lead_Days_3',	 'Lead_Days_4',	
                                           'Lead_Days_5',	 'Lead_Days_6',	 'Lead_Days_7',	
                                           'Lead_Days_7plus',	'Cluster' ])

# pd.Series()

# cluster_analysis.append(pd.Series(), ignore_index=True)
cluster_analysis

Reservation_Count    NaN
Reserv_Count_0       NaN
Reserv_Count_1       NaN
Reserv_Count_2       NaN
Reserv_Count_3       NaN
Reserv_Count_3plus   NaN
Hotel_Count          NaN
Hotel_Count_0        NaN
Hotel_Count_1        NaN
Hotel_Count_2        NaN
Hotel_Count_3        NaN
Hotel_Count_3plus    NaN
Gender_M             NaN
Gender_F             NaN
Gender_Un            NaN
Metro                NaN
Non_Metro            NaN
EARLY                NaN
LATE                 NaN
MP                   NaN
MEAL                 NaN
ROOM                 NaN
VAS                  NaN
Country_IN           NaN
Country_Non_IN       NaN
Stay_Days            NaN
Stay_Days_0          NaN
Stay_Days_1          NaN
Stay_Days_2          NaN
Stay_Days_3          NaN
Stay_Days_4          NaN
Stay_Days_5          NaN
Stay_Days_5plus      NaN
Stay_Days_0avg       NaN
Stay_Days_1avg       NaN
Stay_Days_2avg       NaN
Stay_Days_3avg       NaN
Stay_Days_4avg       NaN
Stay_Days_5avg       NaN
Stay_Days_5plusavg   NaN


In [45]:
cluster_analysis.shape

(56,)

In [46]:
avg_of_cluster_data = round((cluster_data[['Reservation_Count', 'Hotel_Count', 
                     'Metro', 'Non_Metro', 'EARLY', 'LATE', 'MP', 'MEAL',	
                     'ROOM', 'VAS', 'Stay_Days', 'Revenue_Per_Pax',	
                    'Lead_Days',	 ]].sum() / num_of_guests), 2 )
avg_of_cluster_data

Reservation_Count        1.57
Hotel_Count              1.03
Metro                    1.55
Non_Metro                0.02
EARLY                    1.00
LATE                     1.00
MP                       1.00
MEAL                     1.00
ROOM                  5754.48
VAS                      1.00
Stay_Days                2.29
Revenue_Per_Pax      10916.55
Lead_Days               11.96
dtype: float64

In [50]:
for ind in avg_of_cluster_data.index:
  cluster_analysis[ind] = avg_of_cluster_data[ind]

In [73]:
cluster_data_reserv_count = cluster_data["Reservation_Count"].value_counts()

In [74]:
cluster_data_reserv_count[4] = cluster_data_reserv_count.loc[4:].sum()
cluster_data_reserv_count.loc[4]

461

In [75]:
cluster_data_reserv_count = cluster_data_reserv_count[:4]
cluster_data_reserv_count

1    5463
2     933
3     341
4     461
Name: Reservation_Count, dtype: int64

In [76]:
cluster_data_reserv_share = round(cluster_data_reserv_count / num_of_guests, 4) * 100
cluster_data_reserv_share

1    75.90
2    12.96
3     4.74
4     6.40
Name: Reservation_Count, dtype: float64

In [78]:
cluster_data_reserv_share.index = ['Reserv_Count_1', 'Reserv_Count_2',
                                   'Reserv_Count_3', 'Reserv_Count_3plus']
cluster_data_reserv_share

Reserv_Count_1        75.90
Reserv_Count_2        12.96
Reserv_Count_3         4.74
Reserv_Count_3plus     6.40
Name: Reservation_Count, dtype: float64

In [80]:
for ind in cluster_data_reserv_share.index:
  cluster_analysis[ind] = cluster_data_reserv_share[ind]

In [81]:
cluster_analysis

Reservation_Count         1.57
Reserv_Count_0             NaN
Reserv_Count_1           75.90
Reserv_Count_2           12.96
Reserv_Count_3            4.74
Reserv_Count_3plus        6.40
Hotel_Count               1.03
Hotel_Count_0              NaN
Hotel_Count_1              NaN
Hotel_Count_2              NaN
Hotel_Count_3              NaN
Hotel_Count_3plus          NaN
Gender_M                   NaN
Gender_F                   NaN
Gender_Un                  NaN
Metro                     1.55
Non_Metro                 0.02
EARLY                     1.00
LATE                      1.00
MP                        1.00
MEAL                      1.00
ROOM                   5754.48
VAS                       1.00
Country_IN                 NaN
Country_Non_IN             NaN
Stay_Days                 2.29
Stay_Days_0                NaN
Stay_Days_1                NaN
Stay_Days_2                NaN
Stay_Days_3                NaN
Stay_Days_4                NaN
Stay_Days_5                NaN
Stay_Day

In [99]:
def get_freq_dist(col_name, main_df, last_count, num_total_records, index_list, analysis_df):
    dist_df = main_df[col_name].value_counts()
    print(dist_df)
    first_index = dist_df.first_valid_index()
    print(first_index)
    dist_df[last_count] = dist_df.loc[last_count:].sum()
    print(dist_df.loc[last_count:])
    dist_df = dist_df[:last_count]
    dist_df_share = round(dist_df / num_total_records, 4) * 100
    dist_df_share.index = index_list[first_index:]

    for ind in dist_df_share.index:
        analysis_df[ind] = dist_df_share[ind]

    return analysis_df

In [89]:
cluster_analysis = get_freq_dist(col_name = "Hotel_Count", main_df = cluster_data, 
                                 last_count = 4, num_total_records = num_of_guests, 
                                 index_list = ['Hotel_Count_1', 'Hotel_Count_2', 'Hotel_Count_3','Hotel_Count_3plus'],
                                 analysis_df = cluster_analysis
                                )


cluster_analysis

Reservation_Count         1.57
Reserv_Count_0             NaN
Reserv_Count_1           75.90
Reserv_Count_2           12.96
Reserv_Count_3            4.74
Reserv_Count_3plus        6.40
Hotel_Count               1.03
Hotel_Count_0              NaN
Hotel_Count_1            97.25
Hotel_Count_2             2.32
Hotel_Count_3             0.31
Hotel_Count_3plus         0.13
Gender_M                   NaN
Gender_F                   NaN
Gender_Un                  NaN
Metro                     1.55
Non_Metro                 0.02
EARLY                     1.00
LATE                      1.00
MP                        1.00
MEAL                      1.00
ROOM                   5754.48
VAS                       1.00
Country_IN                 NaN
Country_Non_IN             NaN
Stay_Days                 2.29
Stay_Days_0                NaN
Stay_Days_1                NaN
Stay_Days_2                NaN
Stay_Days_3                NaN
Stay_Days_4                NaN
Stay_Days_5                NaN
Stay_Day

In [100]:
cluster_analysis1 = get_freq_dist(col_name = "Stay_Days", main_df = cluster_data, 
                                 last_count = 7, num_total_records = num_of_guests, 
                                 index_list = ['Stay_Days_0', 'Stay_Days_1', 'Stay_Days_2', 
                                               'Stay_Days_3', 'Stay_Days_4', 'Stay_Days_5', 
                                               'Stay_Days_5plus'],
                                 analysis_df = cluster_analysis
                                )


cluster_analysis1

1     3450
2     2006
3      842
4      298
5      158
7      117
6       89
15      42
10      40
8       36
9       34
14      19
11      10
13      10
12       8
18       5
16       5
19       4
20       3
27       3
17       3
24       2
43       2
33       2
30       2
21       1
36       1
32       1
35       1
22       1
34       1
42       1
25       1
Name: Stay_Days, dtype: int64
1
7     444
6      89
15     42
10     40
8      36
9      34
14     19
11     10
13     10
12      8
18      5
16      5
19      4
20      3
27      3
17      3
24      2
43      2
33      2
30      2
21      1
36      1
32      1
35      1
22      1
34      1
42      1
25      1
Name: Stay_Days, dtype: int64


ValueError: ignored