In [103]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import lines, patches
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, MinMaxScaler
import category_encoders as ce # BinaryEncoder
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score

# my own function
# import function as fnk

import warnings
warnings.filterwarnings("ignore")

In [104]:
data = pd.read_csv("../data/raw/data_hotel_booking_demand.csv")
data.sample(10)

Unnamed: 0,country,market_segment,previous_cancellations,booking_changes,deposit_type,days_in_waiting_list,customer_type,reserved_room_type,required_car_parking_spaces,total_of_special_requests,is_canceled
54031,FRA,Groups,0,0,No Deposit,0,Transient-Party,A,0,0,0
18377,PRT,Online TA,0,0,No Deposit,0,Transient,A,0,1,0
61341,PRT,Corporate,0,0,No Deposit,0,Transient-Party,A,0,0,0
15931,PRT,Groups,0,0,No Deposit,0,Transient-Party,A,0,0,0
45991,PRT,Offline TA/TO,0,0,Non Refund,0,Transient-Party,A,0,0,1
63895,ESP,Direct,0,0,No Deposit,0,Transient,A,0,1,0
54346,ESP,Online TA,0,0,No Deposit,0,Transient,D,1,3,0
3131,FRA,Online TA,0,0,No Deposit,0,Transient,A,0,1,0
80355,PRT,Offline TA/TO,0,0,No Deposit,0,Transient,D,0,0,1
79017,ITA,Online TA,0,1,No Deposit,0,Transient-Party,A,0,2,0


In [105]:
data_full = pd.read_csv("../data/raw/hotel_bookings.csv")
data_full = data_full.drop_duplicates(subset=list(data.columns))
data_full.shape

(12305, 32)

In [106]:
data = pd.merge(data, data_full, on=list(data.columns), how='left')
data = data[['country', 'market_segment', 'previous_cancellations', 'previous_bookings_not_canceled','booking_changes', 'deposit_type', 'days_in_waiting_list', 'customer_type', 'reserved_room_type', 'required_car_parking_spaces', 'total_of_special_requests', 'lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights', 'is_canceled']]
# data = data.drop_duplicates(keep='last', ignore_index=True)
data.shape

(83573, 15)

In [107]:
# prev_cancel_ratio
def cancel_ratio(data):
    if data['previous_cancellations'] != 0 and data['previous_bookings_not_canceled'] != 0:
        return data['previous_cancellations'] / (data['previous_cancellations'] + data['previous_bookings_not_canceled'])
    else: 
        return 0

data['prev_cancel_ratio'] = data.apply(cancel_ratio, axis=1)

## Drop Columns

In [108]:
data = data.drop(columns=['country', 'booking_changes', 'previous_cancellations', 'previous_bookings_not_canceled'])

## Diskrititasi

In [109]:
# Diskritisasi Kolom Numerical
# data['previous_cancellations'] = data['previous_cancellations'].apply(lambda x: 1 if x >= 1 else x)
# data['booking_changes'] = data['booking_changes'].apply(lambda x: 1 if x>=3 else x)
data['total_of_special_requests'] = data['total_of_special_requests'].apply(lambda x: 1 if x >= 1 else x)
data['days_in_waiting_list'] = data['days_in_waiting_list'].apply(lambda x: 1 if x>=1 else x)
data['required_car_parking_spaces'] = data['required_car_parking_spaces'].apply(lambda x: 1 if x>=1 else x)

In [110]:
# Diskritisasi Kolom Categorical
data['market_segment'] = data['market_segment'].apply(lambda x: 'Others' if x in ['Complementary', 'Aviation', 'Undefined'] and x not in ['Online TA', 'Direct', 'Offline TA/TO', 'Groups', 'Corporate'] else x)
data['reserved_room_type'] = data['reserved_room_type'].apply(lambda x: 'Others' if x in ['B', 'C', 'H', 'P', 'L'] and x not in ['A', 'D', 'E', 'F', 'G'] else x)
data['deposit_type'] = data['deposit_type'].apply(lambda x: 1 if x in ['Refundable', 'Non Refund'] else 0)
# data['customer_type'] = data['customer_type'].apply(lambda x: 'Others' if x in ['Contract', 'Group'] and x not in ['Transient', 'Transient-Party'] else x)

## Drop duplikat

In [111]:
data = data.drop_duplicates(keep='last', ignore_index=True)
data.shape

(9095, 12)

## Rearange Columns

In [112]:
data = data[['market_segment', 'prev_cancel_ratio', 'deposit_type', 'days_in_waiting_list',
       'customer_type', 'reserved_room_type', 'required_car_parking_spaces',
       'total_of_special_requests', 'lead_time', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'is_canceled',
       ]]
data.sample(10)

Unnamed: 0,market_segment,prev_cancel_ratio,deposit_type,days_in_waiting_list,customer_type,reserved_room_type,required_car_parking_spaces,total_of_special_requests,lead_time,stays_in_weekend_nights,stays_in_week_nights,is_canceled
4879,Online TA,0.0,0,0,Transient-Party,Others,1,0,1,0,3,0
2104,Online TA,0.0,0,0,Transient,E,1,0,32,1,2,0
5644,Offline TA/TO,0.0,0,0,Transient-Party,A,0,1,120,2,3,0
9029,Online TA,0.0,0,0,Transient,F,0,1,132,1,5,0
243,Offline TA/TO,0.0,0,0,Transient,E,0,0,32,2,5,0
7749,Offline TA/TO,0.0,0,0,Contract,E,0,1,272,2,5,0
2335,Direct,0.0,0,0,Transient,A,0,1,1,1,0,0
8688,Online TA,0.0,0,0,Transient,Others,0,0,43,1,2,1
4974,Online TA,0.0,0,0,Transient,D,0,0,87,1,1,1
729,Groups,0.0,0,1,Transient-Party,A,0,0,91,0,2,0


## Searching Prototype

In [113]:
from pycaret.internal.preprocess.transformers import TransformerWrapper

customePipeline =[
    # ('polynomial', TransformerWrapper(include=['previous_cancellations'], transformer=PolynomialFeatures(degree=3))),
    # ('scaleler', TransformerWrapper(include=['previous_cancellations','booking_changes','days_in_waiting_list','required_car_parking_spaces','total_of_special_requests'], transformer=MinMaxScaler((0,1)))),
    ('scaller', TransformerWrapper(include=['lead_time', 'prev_cancel_ratio'], transformer=RobustScaler())),
    ('onehot', TransformerWrapper(include=['customer_type'], transformer=OneHotEncoder())),
    ('targetencod', TransformerWrapper(include=['market_segment', 'reserved_room_type'], transformer=ce.BinaryEncoder()))
]

In [114]:
from pycaret.anomaly import *

anomaly = setup(
    session_id=2020,
    data=data,

    preprocess=False,
    custom_pipeline=customePipeline,
    custom_pipeline_position=0
)

Unnamed: 0,Description,Value
0,Session id,2020
1,Original data shape,"(9095, 12)"
2,Transformed data shape,"(9095, 19)"
3,Numeric features,9
4,Categorical features,3


In [115]:
# use knn algorithm to find the outlier
knn = create_model('knn', fraction=0.1)

In [116]:
evaluate_model(knn)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [117]:
out_df = assign_model(knn)

In [118]:
data = out_df.query("~(Anomaly == 1)").drop(columns=['Anomaly', 'Anomaly_Score'])

In [119]:
data[['market_segment', 'customer_type', 'reserved_room_type']] = data[['market_segment', 'customer_type', 'reserved_room_type']].astype(str)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8185 entries, 0 to 9094
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   market_segment               8185 non-null   object 
 1   prev_cancel_ratio            8185 non-null   float32
 2   deposit_type                 8185 non-null   int8   
 3   days_in_waiting_list         8185 non-null   int8   
 4   customer_type                8185 non-null   object 
 5   reserved_room_type           8185 non-null   object 
 6   required_car_parking_spaces  8185 non-null   int8   
 7   total_of_special_requests    8185 non-null   int8   
 8   lead_time                    8185 non-null   int32  
 9   stays_in_weekend_nights      8185 non-null   int8   
 10  stays_in_week_nights         8185 non-null   int8   
 11  is_canceled                  8185 non-null   int8   
dtypes: float32(1), int32(1), int8(7), object(3)
memory usage: 375.7+ KB


In [120]:
# split train dan test
dfTrain, dfTest = train_test_split(data, test_size=0.2, random_state=22, stratify=data['is_canceled'])
display(dfTrain.shape, dfTest.shape)

(6548, 12)

(1637, 12)

In [121]:
from pycaret.internal.preprocess.transformers import TransformerWrapper

customePipeline1 =[
    # ('polynomial', TransformerWrapper(include=['previous_cancellations'], transformer=PolynomialFeatures(degree=3))),
    # ('scaleler', TransformerWrapper(include=['previous_cancellations','booking_changes','days_in_waiting_list','required_car_parking_spaces','total_of_special_requests'], transformer=MinMaxScaler((0,1)))),
    ('scaller', TransformerWrapper(include=['lead_time', 'prev_cancel_ratio'], transformer=StandardScaler())),
    # ('onehot', TransformerWrapper(include=['customer_type'], transformer=OneHotEncoder())),
    ('target', TransformerWrapper(include=['market_segment', 'customer_type', 'reserved_room_type'], transformer=ce.target_encoder.TargetEncoder()))
]

In [122]:
'required_car_parking_spaces', 'customer_type', 'market_segment', 'deposit_type', 'lead_time', 'total_of_special_requests'

('required_car_parking_spaces',
 'customer_type',
 'market_segment',
 'deposit_type',
 'lead_time',
 'total_of_special_requests')

In [141]:
from pycaret.classification import *

cls = setup(
    # random_state
    session_id=2020,

    # define numerical variable dan categorical variable
    numeric_features=['prev_cancel_ratio', 'days_in_waiting_list', 'deposit_type', 'required_car_parking_spaces', 'total_of_special_requests' ,'lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights'],
    categorical_features=['customer_type', 'market_segment', 'reserved_room_type'],

    # data yang digunakan untuk membangun model, split data validation, dan target
    data=dfTrain,
    train_size=0.8,
    target='is_canceled',

    # handling multicollinearity
    # remove_multicollinearity=True,
    # multicollinearity_threshold=0.8,
    # ignore_features=['prev_cancel_ratio', 'days_in_waiting_list', 'reserved_room_type'],
    # ignore_features=['reserved_room_type','total_of_special_requests','is_family', 'prev_cancel_ratio', 'days_in_waiting_list', 'meal','previous_cancellations', 'previous_bookings_not_canceled','booking_changes', 'country', 'adults', 'children', 'babies', 'total_childern'],

    # re-scaling data
    # normalize=True,
    # normalize_method='zscore',
    # max_encoding_ohe=0,
    # encoding_method=ce.cat_boost.CatBoostEncoder(),

    # feature creation
    # polynomial_features=True,
    # polynomial_degree=2
    # fix_imbalance=True,
    # fix_imbalance_method='SMOTE',
    preprocess=False,
    custom_pipeline=customePipeline1,
    custom_pipeline_position=0,

    fix_imbalance=True,
    fix_imbalance_method='SMOTE'
)

Unnamed: 0,Description,Value
0,Session id,2020
1,Target,is_canceled
2,Target type,Binary
3,Original data shape,"(6548, 12)"
4,Transformed data shape,"(6548, 12)"
5,Transformed train set shape,"(5238, 12)"
6,Transformed test set shape,"(1310, 12)"
7,Numeric features,8
8,Categorical features,3


In [142]:
# add balanced_accucary metric
from sklearn.metrics import balanced_accuracy_score, fbeta_score

add_metric(
    'balanced', 'Bal. Acc', balanced_accuracy_score, greater_is_better = True
           )

add_metric(
    'f2', 'F2', fbeta_score, greater_is_better = True, beta=2
           )

Name                                                           F2
Display Name                                                   F2
Score Function       <function fbeta_score at 0x00000208EF003B80>
Scorer                           make_scorer(fbeta_score, beta=2)
Target                                                       pred
Args                                                  {'beta': 2}
Greater is Better                                            True
Multiclass                                                   True
Custom                                                       True
Name: f2, dtype: object

In [143]:
get_config('X_train_transformed')

Unnamed: 0,market_segment,prev_cancel_ratio,deposit_type,days_in_waiting_list,customer_type,reserved_room_type,required_car_parking_spaces,total_of_special_requests,lead_time,stays_in_weekend_nights,stays_in_week_nights
7548,0.313502,-0.120125,0,0,0.296472,0.229765,0,0,-0.705373,2,0
5208,0.197471,-0.120125,0,0,0.296472,0.235960,0,1,-0.454129,2,2
5708,0.197471,-0.120125,0,0,0.296472,0.229765,0,0,-0.918930,0,3
1191,0.313502,-0.120125,0,0,0.296472,0.254955,0,0,-0.604875,1,2
995,0.313502,-0.120125,0,0,0.123468,0.302974,0,0,0.575971,0,2
...,...,...,...,...,...,...,...,...,...,...,...
5252,0.197471,-0.120125,0,0,0.296472,0.254955,0,1,1.091021,1,3
5193,0.313502,-0.120125,0,0,0.296472,0.235960,1,1,-0.039576,2,3
6441,0.197471,-0.120125,0,0,0.296472,0.210884,0,0,-0.617437,1,2
7776,0.139403,-0.120125,0,1,0.123468,0.235960,0,0,1.141270,2,2


In [144]:
best = compare_models(sort='MCC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Bal. Acc,F2,TT (Sec)
gbc,Gradient Boosting Classifier,0.7845,0.8135,0.3145,0.632,0.4189,0.3043,0.3329,0.6266,0.3492,1.065
lightgbm,Light Gradient Boosting Machine,0.7688,0.8003,0.3717,0.5489,0.4418,0.3031,0.3129,0.6354,0.3967,1.085
ada,Ada Boost Classifier,0.7778,0.8074,0.3068,0.6014,0.4055,0.286,0.311,0.6195,0.3398,1.061
xgboost,Extreme Gradient Boosting,0.7608,0.7906,0.3825,0.5232,0.4408,0.2935,0.2999,0.6337,0.4037,1.201
lda,Linear Discriminant Analysis,0.7741,0.7933,0.2187,0.6261,0.3215,0.223,0.2683,0.5875,0.2506,1.057
lr,Logistic Regression,0.7762,0.7977,0.1885,0.6704,0.2921,0.2068,0.2666,0.5788,0.2196,1.19
nb,Naive Bayes,0.4255,0.7786,0.9985,0.3007,0.4621,0.1328,0.2654,0.618,0.6818,1.057
svm,SVM - Linear Kernel,0.7616,0.0,0.2337,0.6575,0.298,0.2021,0.2604,0.5843,0.2499,1.055
et,Extra Trees Classifier,0.7302,0.7055,0.398,0.4488,0.4211,0.2464,0.2474,0.6186,0.4068,1.084
rf,Random Forest Classifier,0.7365,0.7475,0.3709,0.4601,0.4095,0.2429,0.2457,0.6137,0.3852,1.09


In [145]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [146]:
gbc = create_model('gbc', return_train_score=True, cross_validation=False,probability_threshold=0.34)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Bal. Acc,F2
Test,0.7366,0.7881,0.6327,0.4756,0.543,0.3632,0.3705,0.7018,0.5935
Train,0.7776,0.8436,0.7141,0.5375,0.6133,0.4616,0.4707,0.7562,0.6701


In [140]:
evaluate_model(gbc)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [151]:
parameters = {
    "n_estimators":[150,300,500,700,1000],
    "max_depth":[1,3,5,7,9],
    # "learning_rate":[round(i,2) for i in np.arange(0.05, 0.2, 0.01)],
    # 'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
    # 'min_samples_leaf' : np.linspace(0.1, 0.5, 5, endpoint=True),
    # 'max_features' : list(range(1,get_config('X_train_transformed').shape[1]))
}

tuned_model = tune_model(gbc, 
        optimize='Bal. Acc',
        choose_better=True, 
        n_iter=100, 
        search_algorithm='random', 
        search_library='scikit-learn', 
        custom_grid=parameters, 
        return_train_score=True
 )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Bal. Acc,F2
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.7424,0.7789,0.6512,0.4828,0.5545,0.3788,0.3872,0.7117,0.6087
1,0.7595,0.8165,0.6899,0.5086,0.5855,0.4216,0.4313,0.7361,0.644
2,0.7634,0.819,0.6977,0.5143,0.5921,0.4308,0.4407,0.7412,0.6512
3,0.7557,0.8145,0.6822,0.5029,0.5789,0.4124,0.4219,0.731,0.6368
4,0.7615,0.8106,0.7385,0.5134,0.6057,0.4425,0.4575,0.7537,0.6789
5,0.7748,0.8288,0.6615,0.5375,0.5931,0.4397,0.4443,0.7369,0.6324
6,0.7385,0.8145,0.7692,0.4831,0.5935,0.4153,0.4397,0.7488,0.6878
7,0.7462,0.8241,0.7385,0.4923,0.5908,0.4173,0.4353,0.7436,0.6713
8,0.7342,0.8035,0.6744,0.4728,0.5559,0.3745,0.3866,0.7141,0.6214
9,0.7514,0.7978,0.6822,0.4972,0.5752,0.4055,0.4157,0.7281,0.6349


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 25 candidates, totalling 250 fits


1. Kprototype Cost function as metric evaluation (based on Elbow Method)

In [130]:
from pycaret.internal.preprocess.transformers import TransformerWrapper

# rescalling lead_time and prev_cancel_ratio
rs = RobustScaler()
data[['lead_time', 'prev_cancel_ratio']] = rs.fit_transform(data[['lead_time', 'prev_cancel_ratio']])

# encoder categorical column
# be = ce.BinaryEncoder(cols=['market_segment', 'reserved_room_type'])
# be.fit(data)
# data_encoded = be.transform(data)

# onehot = TransformerWrapper(transformer=OneHotEncoder())
# onehot.fit(data_encoded[['customer_type']])
# data_encoded = onehot.fit_transform(data_encoded)

In [131]:
from kmodes.kprototypes import KPrototypes
from tqdm import tqdm
# #Takes a while
# costs = []
# n_clusters = []
# clusters_assigned = []

# cat_cols = [0, 4, 5]

# for i in tqdm(range(2, 10)):
#     try:
#         kproto = KPrototypes(n_clusters=i, init='Huang', verbose=2)
#         clusters = kproto.fit_predict(data, categorical=cat_cols)
#         costs.append(kproto.cost_)
#         n_clusters.append(i)
#         clusters_assigned.append(clusters)
#     except:
#         print(f"Can't cluster with {i} clusters")

In [132]:
# from plotly import graph_objects as go
# display(pd.DataFrame({
#     'costs' : costs,
#     'n_clusters' : n_clusters,
#     'clusters_assigned' : clusters_assigned
# }))

# fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs))
# fig.show()

insight:
1. 7 clusters is the best number of clusters

### Finding the Criticism by using threshold 75th percentiles

In [133]:
data.sample(10)

Unnamed: 0,market_segment,prev_cancel_ratio,deposit_type,days_in_waiting_list,customer_type,reserved_room_type,required_car_parking_spaces,total_of_special_requests,lead_time,stays_in_weekend_nights,stays_in_week_nights,is_canceled
9074,Online TA,0.0,0,0,Transient,D,0,1,-0.411215,0,4,0
1313,Offline TA/TO,0.0,0,0,Transient,E,0,0,0.906542,0,3,0
6742,Direct,0.0,0,0,Transient,E,0,1,-0.448598,0,1,0
585,Online TA,0.0,0,0,Transient,E,0,1,1.205607,2,4,0
8376,Online TA,0.0,0,0,Transient,A,1,1,0.046729,1,2,0
7374,Online TA,0.0,0,0,Transient,Others,0,0,-0.102804,2,5,1
1806,Others,0.0,0,0,Transient-Party,A,0,0,-0.411215,1,3,0
270,Online TA,0.0,0,0,Transient,A,0,1,0.252336,2,2,0
6094,Corporate,0.0,0,0,Transient-Party,D,0,0,-0.35514,1,0,0
7139,Online TA,0.0,0,0,Transient,E,0,1,-0.121495,1,5,1


In [134]:
from kmodes.kprototypes import KPrototypes
from kmodes.util.dissim import euclidean_dissim, matching_dissim
from tqdm import tqdm

# set 7 for n_clusters parameter
kproto = KPrototypes(n_clusters=7, init='Huang', verbose=2, num_dissim=euclidean_dissim, cat_dissim=matching_dissim)
clusters = kproto.fit_predict(data, categorical=[0,4,5])

# Get cluster centroids of each cluster
cluster_centroids = kproto.cluster_centroids_

# Calculate distance of each point to its cluster centroid
distances = np.zeros(len(data))
# for i, cluster_idx in enumerate(clusters):
#     distances[i] = np.linalg.norm(data.to_numpy()[i] - cluster_centroids[cluster_idx])


Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 1981, ncost: 17335.366870214133
Run: 1, iteration: 2/100, moves: 919, ncost: 16180.639680005444
Run: 1, iteration: 3/100, moves: 318, ncost: 15847.69417243036
Run: 1, iteration: 4/100, moves: 91, ncost: 15835.687195994202
Run: 1, iteration: 5/100, moves: 34, ncost: 15833.835151813024
Run: 1, iteration: 6/100, moves: 26, ncost: 15827.05859802105
Run: 1, iteration: 7/100, moves: 40, ncost: 15814.111668327825
Run: 1, iteration: 8/100, moves: 11, ncost: 15814.018113456053
Run: 1, iteration: 9/100, moves: 5, ncost: 15813.986758039044
Run: 1, iteration: 10/100, moves: 0, ncost: 15813.986758039044
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 2455, ncost: 16262.864554194945
Run: 2, iteration: 2/100, moves: 888, ncost: 15934.992159503603
Run: 2, iteration: 3/100, moves: 391, ncost: 15817.799613751678
Run: 2, iteratio

In [135]:
kproto.cluster_centroids_[1]

array(['0.003980501635391035', '0.10802469135802469',
       '0.19907407407407407', '0.0787037037037037', '0.5555555555555556',
       '1.7320439598477004', '0.6975308641975309', '2.4290123456790123',
       '0.44753086419753085', 'Online TA', 'Transient', 'A'], dtype='<U32')

In [136]:
cluster_centroids[0]

array(['0.006857724774926142', '0.016129032258064516',
       '0.019524617996604415', '0.21137521222410866',
       '0.5704584040747029', '-0.004442822461641342',
       '1.4202037351443124', '0.6392190152801358', '0.22156196943972836',
       'Online TA', 'Transient', 'A'], dtype='<U32')

In [137]:
from kmodes.util.dissim import euclidean_dissim, matching_dissim
matching_dissim(data.to_numpy()[0][6], cluster_centroids[0][0])

AxisError: axis 1 is out of bounds for array of dimension 0

In [None]:
from kmodes.kprototypes import KPrototypes
dir(KPrototypes)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_feature_names',
 '_check_n_features',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 '_validate_params',
 'cluster_centroids_',
 'fit',
 'fit_predict',
 'get_params',
 'predict',
 'set_params']

In [None]:
# import numpy as np
# from kmodes.kprototypes import KPrototypes
# from sklearn.preprocessing import StandardScaler

# # Generate example dataset with numerical and categorical features
# np.random.seed(42)
# numerical_data = np.random.rand(100, 2)  # Numerical features
# categorical_data = np.random.choice(['A', 'B', 'C'], size=(100, 1))  # Categorical feature

# # Combine numerical and categorical features
# combined_data = np.hstack((numerical_data, categorical_data))

# # Standardize numerical features
# numerical_data_scaled = StandardScaler().fit_transform(numerical_data)

# # Specify the number of clusters (prototype points)
# num_clusters = 3

# # Create K-Prototype instance
# kproto = KPrototypes(n_clusters=num_clusters, verbose=2, init='Huang')
# clusters = kproto.fit_predict(combined_data, categorical=[2])  # Index of the categorical feature

# # Get cluster centroids
# cluster_centroids_numerical = kproto.cluster_centroids_[0]  # Numerical centroids
# cluster_centroids_categorical = kproto.cluster_centroids_[1]  # Categorical centroids

# # Calculate distance of each point to its cluster centroid
# distances = np.zeros(len(combined_data))
# for i, cluster_idx in enumerate(clusters):
#     # Calculate the Euclidean distance for numerical features
#     dist_numerical = np.linalg.norm(numerical_data_scaled[i] - cluster_centroids_numerical[cluster_idx])
    
#     # Calculate the distance for categorical features (you can use appropriate distance measures)
#     dist_categorical = 0 if categorical_data[i] == cluster_centroids_categorical[cluster_idx] else 1
    
#     # Combine distances
#     distances[i] = dist_numerical + dist_categorical

# # Set a threshold to identify prototype and criticism points
# threshold = np.percentile(distances, 75)  # You can adjust this threshold based on your data

# # Separate prototype and criticism points
# prototype_indices = np.where(distances <= threshold)[0]
# criticism_indices = np.where(distances > threshold)[0]

# prototype_points = combined_data[prototype_indices]
# criticism_points = combined_data[criticism_indices]

# print("Number of prototype points:", len(prototype_points))
# print("Number of criticism points:", len(criticism_points))


In [None]:
# from pycaret.internal.preprocess.transformers import TransformerWrapper

# # encoder categorical column
# be = ce.BinaryEncoder()
# market = be.fit_transform(data[['market_segment']])
# room = be.fit_transform(data[['reserved_room_type']])

# trans = TransformerWrapper(transformer=OneHotEncoder())
# trans.fit(data[['customer_type']])
# cust = trans.transform(data[['customer_type']])

In [None]:
# data = a[['market_segment_0', 'market_segment_1', \
#        'market_segment_2', 'prev_cancel_ratio', 'deposit_type', 'days_in_waiting_list', 'customer_type_Contract', 'customer_type_Group', \
#        'customer_type_Transient', 'customer_type_Transient-Party', 'reserved_room_type_0', 'reserved_room_type_1',\
#        'reserved_room_type_2', 'required_car_parking_spaces', 'total_of_special_requests', 'lead_time',\
#        'is_canceled']]

In [None]:
import torch
def select_prototypes(K:torch.Tensor, num_prototypes:int):
    sample_indices = torch.arange(0, K.shape[0])
    num_samples = sample_indices.shape[0]

    colsum = 2 * K.sum(0) / num_samples
    is_selected = torch.zeros_like(sample_indices)
    selected = sample_indices[is_selected > 0]

    for i in range(num_prototypes):
        candidate_indices = sample_indices[is_selected == 0]
        s1 = colsum[candidate_indices]

        if selected.shape[0] == 0:
            s1 -= K.diagonal()[candidate_indices].abs()
        else:
            temp = K[selected, :][:, candidate_indices]
            s2 = temp.sum(0) * 2 + K.diagonal()[candidate_indices]
            s2 /= (selected.shape[0] + 1)
            s1 -= s2

        best_sample_index = candidate_indices[s1.argmax()]
        is_selected[best_sample_index] = i + 1
        selected = sample_indices[is_selected > 0]

    selected_in_order = selected[is_selected[is_selected > 0].argsort()]
    return selected_in_order

In [None]:
import numpy as np
from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import StandardScaler

# Generate example dataset with numerical and categorical features
np.random.seed(42)
numerical_data = np.random.rand(100, 2)  # Numerical features
categorical_data = np.random.choice(['A', 'B', 'C'], size=(100, 1))  # Categorical feature

# Combine numerical and categorical features
combined_data = np.hstack((numerical_data, categorical_data))

# Standardize numerical features
numerical_data_scaled = StandardScaler().fit_transform(numerical_data)

# Specify the number of clusters (prototype points)
num_clusters = 3

# Create K-Prototype instance
kproto = KPrototypes(n_clusters=num_clusters, verbose=2, init='Huang')
clusters = kproto.fit_predict(combined_data, categorical=[2])  # Index of the categorical feature

# Get cluster centroids
cluster_centroids = kproto.cluster_centroids_

# Calculate distance of each point to its cluster centroid
distances = np.zeros(len(combined_data))
# for i, cluster_idx in enumerate(clusters):
#     distances[i] = np.linalg.norm(combined_data[i] - cluster_centroids[cluster_idx])

# # Set a threshold to identify prototype and criticism points
# threshold = np.percentile(distances, 75)  # You can adjust this threshold based on your data

# # Separate prototype and criticism points
# prototype_indices = np.where(distances <= threshold)[0]
# criticism_indices = np.where(distances > threshold)[0]

# prototype_points = combined_data[prototype_indices]
# criticism_points = combined_data[criticism_indices]

# print("Number of prototype points:", len(prototype_points))
# print("Number of criticism points:", len(criticism_points))


Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 11, ncost: 16.002032513075548
Run: 1, iteration: 2/100, moves: 8, ncost: 15.509421117050662
Run: 1, iteration: 3/100, moves: 6, ncost: 15.27557201586817
Run: 1, iteration: 4/100, moves: 1, ncost: 15.265664024173809
Run: 1, iteration: 5/100, moves: 2, ncost: 15.24093834505537
Run: 1, iteration: 6/100, moves: 6, ncost: 15.038697129417722
Run: 1, iteration: 7/100, moves: 6, ncost: 14.824167671479957
Run: 1, iteration: 8/100, moves: 1, ncost: 14.815416286491109
Run: 1, iteration: 9/100, moves: 0, ncost: 14.815416286491109
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 21, ncost: 15.939802820231174
Run: 2, iteration: 2/100, moves: 13, ncost: 15.12923423525456
Run: 2, iteration: 3/100, moves: 5, ncost: 14.93178156740369
Run: 2, iteration: 4/100, moves: 3, ncost: 14.905617401033235
Run: 2, iteration: 5/100, moves: 5,

In [None]:
# combined_data
np.linalg.norm(combined_data[0])

ValueError: could not convert string to float: 'C'