# Descrição do Problema

Neste arquivo serão implementadas soluções para o estudo de caso [Hotel Reservations Dataset](https://www.kaggle.com/datasets/ahsan81/hotel-reservations-classification-dataset?resource=download), que tentam prever, com base nos dados de reservas de um hotel, se um cliente irá cancelar sua reserva ou não.


## Análise dos Dados

Pré-processamento e correlações.

In [85]:
# importando bibliotecas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier

from imblearn.over_sampling import RandomOverSampler

In [86]:
# Load the hotel reservations CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='./hotel_reservations.csv')
df.head(5)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


In [87]:
# Print a summary of the DataFrame: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Booking_ID                            36275 non-null  object 
 1   no_of_adults                          36275 non-null  int64  
 2   no_of_children                        36275 non-null  int64  
 3   no_of_weekend_nights                  36275 non-null  int64  
 4   no_of_week_nights                     36275 non-null  int64  
 5   type_of_meal_plan                     36275 non-null  object 
 6   required_car_parking_space            36275 non-null  int64  
 7   room_type_reserved                    36275 non-null  object 
 8   lead_time                             36275 non-null  int64  
 9   arrival_year                          36275 non-null  int64  
 10  arrival_month                         36275 non-null  int64  
 11  arrival_date   

In [88]:
# Data preparation: replace every categorical text column (and the target
# column `booking_status`) with integer codes, using a separate LabelEncoder
# for each column.
encoders_by_column = {
    'type_of_meal_plan': LabelEncoder(),
    'room_type_reserved': LabelEncoder(),
    'market_segment_type': LabelEncoder(),
    'booking_status': LabelEncoder(),
}

for column_name, column_encoder in encoders_by_column.items():
    df[column_name] = column_encoder.fit_transform(df[column_name])

# Expose each fitted encoder under its own module-level name.
label_encoder_type_of_meal_plan = encoders_by_column['type_of_meal_plan']
label_encoder_room_type_reserved = encoders_by_column['room_type_reserved']
label_encoder_market_segment_type = encoders_by_column['market_segment_type']
label_encoder_booking_status = encoders_by_column['booking_status']

df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,0,0,0,224,2017,10,2,3,0,0,0,65.0,0,1
1,INN00002,2,0,2,3,3,0,0,5,2018,11,6,4,0,0,0,106.68,1,1
2,INN00003,1,0,2,1,0,0,0,1,2018,2,28,4,0,0,0,60.0,0,0
3,INN00004,2,0,0,2,0,0,0,211,2018,5,20,4,0,0,0,100.0,0,0
4,INN00005,2,0,1,1,3,0,0,48,2018,4,11,4,0,0,0,94.5,0,0


## Técnicas Utilizadas

Explicar os algoritmos utilizados e implementar cada um deles.

### KNN

In [89]:
# Build the feature matrix (every column except the booking identifier and
# the target) and the encoded target vector.
X = df.drop(['Booking_ID', 'booking_status'], axis=1)
Y = df['booking_status'].values

# Balance the classes by random oversampling.
# NOTE(review): the resampling runs before the folds are built, so copies of
# the same row can land in both the training and the test split of a fold and
# inflate the scores (most visibly for 1-NN). Resampling inside each training
# fold (e.g. with an imblearn Pipeline) would avoid this, but X and Y are
# reused by the cells below, so that change has to be made for all models at
# once.
ros = RandomOverSampler(random_state=0)
X, Y = ros.fit_resample(X, Y)

# Cross-validation setup. A fixed random_state makes the shuffled folds
# reproducible and guarantees that every cross_val_score call that uses `kf`
# is evaluated on the same splits, so the models are compared on equal terms.
kFolds = 30
kf = StratifiedKFold(n_splits=kFolds, shuffle=True, random_state=0)

# Evaluate k-NN for several neighbourhood sizes; the folds are scored in
# parallel (n_jobs=-1) for every configuration.
for n_neighbors in (1, 3, 5, 11):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    scores = cross_val_score(knn, X, Y, cv=kf, n_jobs=-1)
    print('Acurácia com %d K-NN: %0.4f +/- %0.4f' % (n_neighbors, scores.mean(), scores.std()))

Acurácia com 1 K-NN: 0.9008 +/- 0.0078
Acurácia com 3 K-NN: 0.8453 +/- 0.0100
Acurácia com 5 K-NN: 0.8218 +/- 0.0092
Acurácia com 11 K-NN: 0.7932 +/- 0.0086


### Naive-Bayes

In [90]:
# Cross-validated accuracy of a Gaussian Naive Bayes classifier on X / Y,
# using the fold splitter `kf`.
nb = GaussianNB()
scores = cross_val_score(estimator=nb, X=X, y=Y, cv=kf)
print('Acurácia com Naive-Bayes: %0.4f +/- %0.4f' % (np.mean(scores), np.std(scores)))

# TODO: try again using only the continuous variables
# (lead_time, number of nights, price).

Acurácia com Naive-Bayes: 0.5673 +/- 0.0064


### Regressão Logística

In [91]:
# Cross-validated accuracy of a logistic regression model (iteration cap
# raised to 10000); the folds from `kf` are scored in parallel.
rlog = LogisticRegression(max_iter=10000)
scores = cross_val_score(estimator=rlog, X=X, y=Y, cv=kf, n_jobs=-1)
accuracy_summary = (np.mean(scores), np.std(scores))
print('Acurácia com Regressão Logística: %0.4f +/- %0.4f' % accuracy_summary)
# TODO: select the feature columns more carefully.

Acurácia com Regressão Logística: 0.7791 +/- 0.0102


### SVM

In [None]:
# Cross-validated accuracy of three SVM variants (linear, RBF and degree-3
# polynomial kernels), evaluated one after another on the folds from `kf`.
# Each entry pairs the report line with the classifier it describes.
svm_variants = [
    ('Acurácia com SVM Linear: %0.4f +/- %0.4f', SVC(kernel='linear')),
    ('Acurácia com SVM RBF: %0.4f +/- %0.4f', SVC(kernel='rbf')),
    ('Acurácia com SVM Poly: %0.4f +/- %0.4f', SVC(kernel='poly', degree=3)),
]

for report_format, svm in svm_variants:
    scores = cross_val_score(svm, X, Y, cv=kf)
    print(report_format % (scores.mean(), scores.std()))
# TODO: run this cell on Colab.

### Árvore de Decisão

### Ensembles

## Análise dos Resultados

Ver os resultados de cada algoritmo e compará-los entre si.

## Conclusão

conclusões obtidas sobre cada algoritmo e o estudo de caso

## Apêndice

resumo dos seminários apresentados em aula