In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import pickle
import joblib

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.naive_bayes import GaussianNB

from datetime import datetime


In [None]:
#Set parameters to see all data
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [None]:
# Read dataset

%%time
resort_file_path = 'H1.csv'

try:
    resort_hotel_df = pd.read_csv(resort_file_path)
except pd.errors.ParserError as e:
    print(f'Error while parsing CSV file: {e}')



resort_hotel_df.shape

CPU times: user 231 ms, sys: 81.2 ms, total: 313 ms
Wall time: 867 ms


(40060, 31)

In [None]:
# Read dataset

%%time
city_file_path = 'H2.csv'

try:
    city_hotel_df = pd.read_csv(city_file_path)
except pd.errors.ParserError as e:
    print(f'Error while parsing CSV file: {e}')



city_hotel_df.shape

CPU times: user 402 ms, sys: 165 ms, total: 566 ms
Wall time: 1.09 s


(79330, 31)

In [None]:
resort_hotel_df.shape[0] + city_hotel_df.shape[0]

119390

In [None]:
resort_hotel_df.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,0,737,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,0,7,2015,July,27,1,0,1,1,0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,0,13,2015,July,27,1,0,1,1,0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,0,14,2015,July,27,1,0,2,2,0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [None]:
resort_hotel_df['Hoteltype'] = 'resort'
city_hotel_df['Hoteltype'] = 'city'

In [None]:
# Combining dataframes
hotel_df = pd.concat([resort_hotel_df, city_hotel_df], ignore_index=True)
# Preview the first and last five rows. DataFrame.append is deprecated (and
# removed in pandas 2.0), so the preview is built with pd.concat instead.
pd.concat([hotel_df.head(5), hotel_df.tail(5)])

  hotel_df.head(5).append(hotel_df.tail(5))


Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,Meal,Country,MarketSegment,DistributionChannel,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,ReservedRoomType,AssignedRoomType,BookingChanges,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,Hoteltype
0,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,resort
1,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,resort
2,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02,resort
3,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02,resort
4,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03,resort
119385,0,23,2017,August,35,30,2,5,2,0.0,0,BB,BEL,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06,city
119386,0,102,2017,August,35,31,2,5,3,0.0,0,BB,FRA,Online TA,TA/TO,0,0,0,E,E,0,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07,city
119387,0,34,2017,August,35,31,2,5,2,0.0,0,BB,DEU,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07,city
119388,0,109,2017,August,35,31,2,5,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,89.0,,0,Transient,104.4,0,0,Check-Out,2017-09-07,city
119389,0,205,2017,August,35,29,2,7,2,0.0,0,HB,DEU,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,151.2,0,2,Check-Out,2017-09-07,city


In [None]:
# Method for reducing the memory usage
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to smaller dtypes to save memory.

    The frame is modified in place and also returned.

    Integer columns (int16/int32/int64) are cast to the smallest signed
    integer type whose range contains the column's min and max; the range
    check is inclusive, so a column whose max is exactly 127 still fits int8.
    float64 columns are cast to float32 when their values fit. Columns are
    never cast to float16: its precision of roughly three significant digits
    silently alters values such as 96.14.

    Parameters:
    - df: pandas DataFrame to shrink
    - verbose: bool, print the resulting memory usage and the reduction

    Returns:
    - df: the same DataFrame object with downcast columns
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # First (smallest) integer type whose inclusive range holds the data.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        elif col_type == 'float64':
            # An all-NaN column yields NaN min/max; both comparisons are then
            # False and the column is left untouched.
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


In [None]:
hotel_df = reduce_mem_usage(hotel_df)

Mem. usage decreased to 15.37 Mb (47.3% reduction)


In [None]:
hotel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   IsCanceled                   119390 non-null  int8   
 1   LeadTime                     119390 non-null  int16  
 2   ArrivalDateYear              119390 non-null  int16  
 3   ArrivalDateMonth             119390 non-null  object 
 4   ArrivalDateWeekNumber        119390 non-null  int8   
 5   ArrivalDateDayOfMonth        119390 non-null  int8   
 6   StaysInWeekendNights         119390 non-null  int8   
 7   StaysInWeekNights            119390 non-null  int8   
 8   Adults                       119390 non-null  int8   
 9   Children                     119386 non-null  float16
 10  Babies                       119390 non-null  int8   
 11  Meal                         119390 non-null  object 
 12  Country                      118902 non-null  object 
 13 

In [None]:
hotel_df.columns

Index(['IsCanceled', 'LeadTime', 'ArrivalDateYear', 'ArrivalDateMonth', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth', 'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children', 'Babies', 'Meal', 'Country', 'MarketSegment', 'DistributionChannel', 'IsRepeatedGuest', 'PreviousCancellations', 'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType', 'BookingChanges', 'DepositType', 'Agent', 'Company', 'DaysInWaitingList', 'CustomerType', 'ADR', 'RequiredCarParkingSpaces', 'TotalOfSpecialRequests', 'ReservationStatus', 'ReservationStatusDate', 'Hoteltype'], dtype='object')

In [None]:
# Load the pre-selected feature names, one column name per line
with open('best_accurate_columns.txt', 'r') as f:
    # Strip surrounding whitespace and skip blank lines, so that an empty or
    # trailing line cannot turn into an empty column name.
    lines = [name for name in (raw.strip() for raw in f) if name]

# Print the list of lines
print(lines)


['ReservedRoomType', 'Children', 'AssignedRoomType', 'MarketSegment', 'Adults', 'TotalOfSpecialRequests', 'DistributionChannel', 'StaysInWeekNights', 'Meal', 'RequiredCarParkingSpaces', 'LeadTime', 'PreviousCancellations', 'PreviousBookingsNotCanceled', 'CustomerType', 'DepositType', 'Hoteltype', 'Country', 'IsRepeatedGuest']


In [None]:
# Get the difference between sets
difference = set(hotel_df.columns) - set(lines)
print(difference)

{'StaysInWeekendNights', 'BookingChanges', 'ReservationStatusDate', 'Agent', 'ReservationStatus', 'DaysInWaitingList', 'Company', 'ArrivalDateYear', 'ArrivalDateDayOfMonth', 'ADR', 'ArrivalDateMonth', 'IsCanceled', 'Babies', 'ArrivalDateWeekNumber'}


In [None]:
len(lines)

18

In [None]:
# Combining features
append_col_list = ['ArrivalDateDayOfMonth', 'ArrivalDateMonth', 'ArrivalDateYear',
                   'Company','Agent',
                   'StaysInWeekendNights',
                   'ReservationStatusDate',
                   'ADR']
# Only add names that are not in the list yet, so that re-running this cell
# does not put duplicate column names into `lines`.
lines.extend(col for col in append_col_list if col not in lines)
print(len(lines))

26


In [None]:
print(hotel_df.shape)
df = hotel_df[lines].copy()
print(df.shape)

(119390, 32)
(119390, 26)


In [None]:
# Standardization of column names
df.columns = df.columns.str.lower()


In [None]:
df['company'].max(), df['agent'].max()

('       NULL', '       NULL')

In [None]:
df['company'].value_counts()

       NULL    112593
         40       927
        223       784
         67       267
         45       250
                ...  
         32         1
         11         1
        487         1
        101         1
        376         1
Name: company, Length: 353, dtype: int64

In [None]:
df.agent.value_counts()

          9    31961
       NULL    16340
        240    13922
          1     7191
         14     3640
               ...  
        280        1
        285        1
        289        1
        265        1
        497        1
Name: agent, Length: 334, dtype: int64

In [None]:
df.columns

Index(['reservedroomtype', 'children', 'assignedroomtype', 'marketsegment', 'adults', 'totalofspecialrequests', 'distributionchannel', 'staysinweeknights', 'meal', 'requiredcarparkingspaces', 'leadtime', 'previouscancellations', 'previousbookingsnotcanceled', 'customertype', 'deposittype', 'hoteltype', 'country', 'isrepeatedguest', 'arrivaldatedayofmonth', 'arrivaldatemonth', 'arrivaldateyear', 'company', 'agent', 'staysinweekendnights', 'reservationstatusdate', 'adr'], dtype='object')

In [None]:
hotel_df.columns

Index(['IsCanceled', 'LeadTime', 'ArrivalDateYear', 'ArrivalDateMonth', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth', 'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children', 'Babies', 'Meal', 'Country', 'MarketSegment', 'DistributionChannel', 'IsRepeatedGuest', 'PreviousCancellations', 'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType', 'BookingChanges', 'DepositType', 'Agent', 'Company', 'DaysInWaitingList', 'CustomerType', 'ADR', 'RequiredCarParkingSpaces', 'TotalOfSpecialRequests', 'ReservationStatus', 'ReservationStatusDate', 'Hoteltype'], dtype='object')

In [None]:
hotel_df['IsCanceled'].mean()

0.37041628277075134

In [None]:
hotel_df['IsCanceled'].value_counts()

0    75166
1    44224
Name: IsCanceled, dtype: int64

In [None]:
hotel_df[hotel_df['IsCanceled'] == 1].ReservationStatus.value_counts()

Canceled    43017
No-Show      1207
Name: ReservationStatus, dtype: int64

In [None]:
hotel_df[hotel_df['IsCanceled'] == 0].ReservationStatus.value_counts()

Check-Out    75166
Name: ReservationStatus, dtype: int64

In [None]:
# Missing data imputation
def fill_null_values(df):
    """Impute missing values column by column.

    Object (string) columns get the placeholder 'unknown'; every numeric
    column, whatever its width (int8 ... float64), gets 0. Columns of other
    dtypes (e.g. datetimes) are left untouched. The frame is modified in
    place and also returned.

    Parameters:
    - df: pandas DataFrame to impute

    Returns:
    - df: the same DataFrame object with missing values filled
    """
    for column in df.columns:
        col_dtype = df[column].dtype
        if col_dtype == 'object':
            # Assign back instead of a chained inplace fillna, which may act
            # on a temporary copy.
            df[column] = df[column].fillna('unknown')
        elif pd.api.types.is_numeric_dtype(col_dtype):
            # Covers downcast dtypes such as float16/float32 as well, and only
            # ever touches the frame that was passed in.
            df[column] = df[column].fillna(0)

    return df

In [None]:
#Merging date information as a date column
def merge_date_columns(df):
    """Build an 'arrivaldate' datetime column from the year/month/day columns.

    'arrivaldatemonth' is converted from month names to month numbers in
    place, and a new 'arrivaldate' column is added. Combinations that do not
    form a valid calendar date become NaT. The frame is modified in place and
    also returned.

    Parameters:
    - df: pandas DataFrame with 'arrivaldateyear', 'arrivaldatemonth' and
      'arrivaldatedayofmonth' columns

    Returns:
    - df: the same DataFrame object with the added 'arrivaldate' column
    """
    # Convert 'arrivaldatemonth' to a numerical representation
    month_mapping = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
                     'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}

    # Map only while the column still holds month names; mapping an already
    # numeric column again would turn every month into NaN on a re-run.
    if not pd.api.types.is_numeric_dtype(df['arrivaldatemonth']):
        df['arrivaldatemonth'] = df['arrivaldatemonth'].map(month_mapping)

    date_parts = pd.DataFrame({
        'year': df['arrivaldateyear'],
        'month': df['arrivaldatemonth'],
        'day': df['arrivaldatedayofmonth'],
    })
    # Widen small integer dtypes (int8/int16) so the date assembly cannot overflow.
    for part in date_parts.columns:
        if pd.api.types.is_integer_dtype(date_parts[part]):
            date_parts[part] = date_parts[part].astype('int64')

    # Assemble all dates in one vectorized call instead of parsing row by row.
    df['arrivaldate'] = pd.to_datetime(date_parts, errors='coerce')

    return df

In [None]:
%%time
df2 = merge_date_columns(df)
df2.head()

CPU times: user 14.5 s, sys: 116 ms, total: 14.6 s
Wall time: 15 s


Unnamed: 0,reservedroomtype,children,assignedroomtype,marketsegment,adults,totalofspecialrequests,distributionchannel,staysinweeknights,meal,requiredcarparkingspaces,leadtime,previouscancellations,previousbookingsnotcanceled,customertype,deposittype,hoteltype,country,isrepeatedguest,arrivaldatedayofmonth,arrivaldatemonth,arrivaldateyear,company,agent,staysinweekendnights,reservationstatusdate,adr,arrivaldate
0,C,0.0,C,Direct,2,0,Direct,0,BB,0,342,0,0,Transient,No Deposit,resort,PRT,0,1,7,2015,,,0,2015-07-01,0.0,2015-07-01
1,C,0.0,C,Direct,2,0,Direct,0,BB,0,737,0,0,Transient,No Deposit,resort,PRT,0,1,7,2015,,,0,2015-07-01,0.0,2015-07-01
2,A,0.0,C,Direct,1,0,Direct,1,BB,0,7,0,0,Transient,No Deposit,resort,GBR,0,1,7,2015,,,0,2015-07-02,75.0,2015-07-01
3,A,0.0,A,Corporate,1,0,Corporate,1,BB,0,13,0,0,Transient,No Deposit,resort,GBR,0,1,7,2015,,304.0,0,2015-07-02,75.0,2015-07-01
4,A,0.0,A,Online TA,2,1,TA/TO,2,BB,0,14,0,0,Transient,No Deposit,resort,GBR,0,1,7,2015,,240.0,0,2015-07-03,98.0,2015-07-01


In [None]:
df2.arrivaldate.min(), df2.arrivaldate.max()

(Timestamp('2015-07-01 00:00:00'), Timestamp('2017-08-31 00:00:00'))

In [None]:
df2[df2.arrivaldate < '2017-08-01'].shape, df2[df2.arrivaldate >= '2017-08-01'].shape

((114465, 27), (4925, 27))

In [None]:
print(df2.shape)
drop_cols = ['arrivaldateyear', 'arrivaldatemonth',
             'arrivaldatedayofmonth']
df2 = df2.drop(columns=drop_cols)
print(df2.shape)


(119390, 27)
(119390, 24)


In [None]:
df2 = fill_null_values(df2)

In [None]:
df2.customertype.value_counts()

Transient          89613
Transient-Party    25124
Contract            4076
Group                577
Name: customertype, dtype: int64

In [None]:
df2.head()

Unnamed: 0,reservedroomtype,children,assignedroomtype,marketsegment,adults,totalofspecialrequests,distributionchannel,staysinweeknights,meal,requiredcarparkingspaces,leadtime,previouscancellations,previousbookingsnotcanceled,customertype,deposittype,hoteltype,country,isrepeatedguest,company,agent,staysinweekendnights,reservationstatusdate,adr,arrivaldate
0,C,0.0,C,Direct,2,0,Direct,0,BB,0,342,0,0,Transient,No Deposit,resort,PRT,0,,,0,2015-07-01,0.0,2015-07-01
1,C,0.0,C,Direct,2,0,Direct,0,BB,0,737,0,0,Transient,No Deposit,resort,PRT,0,,,0,2015-07-01,0.0,2015-07-01
2,A,0.0,C,Direct,1,0,Direct,1,BB,0,7,0,0,Transient,No Deposit,resort,GBR,0,,,0,2015-07-02,75.0,2015-07-01
3,A,0.0,A,Corporate,1,0,Corporate,1,BB,0,13,0,0,Transient,No Deposit,resort,GBR,0,,304.0,0,2015-07-02,75.0,2015-07-01
4,A,0.0,A,Online TA,2,1,TA/TO,2,BB,0,14,0,0,Transient,No Deposit,resort,GBR,0,,240.0,0,2015-07-03,98.0,2015-07-01


In [None]:
# Rows with a positive rate vs. rows with a zero or negative rate.
# The second condition is the exact complement of the first (`adr < 1` overlapped with `adr > 0`).
df2[df2.adr > 0].shape, df2[df2.adr <= 0].shape

((117430, 24), (1962, 24))

In [None]:
df3 = df2[df2.adr > 0].copy()

In [None]:
df3.country.nunique()

178

In [None]:
df3.head()

Unnamed: 0,reservedroomtype,children,assignedroomtype,marketsegment,adults,totalofspecialrequests,distributionchannel,staysinweeknights,meal,requiredcarparkingspaces,leadtime,previouscancellations,previousbookingsnotcanceled,customertype,deposittype,hoteltype,country,isrepeatedguest,company,agent,staysinweekendnights,reservationstatusdate,adr,arrivaldate
2,A,0.0,C,Direct,1,0,Direct,1,BB,0,7,0,0,Transient,No Deposit,resort,GBR,0,,,0,2015-07-02,75.0,2015-07-01
3,A,0.0,A,Corporate,1,0,Corporate,1,BB,0,13,0,0,Transient,No Deposit,resort,GBR,0,,304.0,0,2015-07-02,75.0,2015-07-01
4,A,0.0,A,Online TA,2,1,TA/TO,2,BB,0,14,0,0,Transient,No Deposit,resort,GBR,0,,240.0,0,2015-07-03,98.0,2015-07-01
5,A,0.0,A,Online TA,2,1,TA/TO,2,BB,0,14,0,0,Transient,No Deposit,resort,GBR,0,,240.0,0,2015-07-03,98.0,2015-07-01
6,C,0.0,C,Direct,2,0,Direct,2,BB,0,0,0,0,Transient,No Deposit,resort,PRT,0,,,0,2015-07-03,107.0,2015-07-01


In [None]:
df3.shape

(117430, 24)

In [None]:
get_dummy_cols = ['isrepeatedguest', 'deposittype', 'hoteltype', 'customertype', 'marketsegment', 'distributionchannel']
normalized_cols = ['previouscancellations', 'previousbookingsnotcanceled',
                   'requiredcarparkingspaces', 'totalofspecialrequests']
bin_cols = ['leadtime', 'staysinweekendnights', 'staysinweeknights', 'children', 'adults']
cat_conversion_cols = ['meal', 'reservedroomtype', 'assignedroomtype']
day_diff_cols = ['arrivaldate']
embeding_cols = ['agent', 'company']


In [None]:
df3.shape

(117430, 24)

In [None]:
# One-hot encode each categorical column: its indicator columns are appended
# at the end of the frame and the original column is removed.
for col in get_dummy_cols:
    print(col)
    indicator_cols = pd.get_dummies(df3[col], prefix=col + "_cat")
    df3 = pd.concat([df3, indicator_cols], axis=1).drop(columns=[col])
df3.shape

isrepeatedguest
deposittype
hoteltype
customertype
marketsegment
distributionchannel


(117430, 42)

In [None]:
# Normalizing with MinMaxScaler
def normalize_data(dataframe, columns):
    """Return a copy of ``dataframe`` with the given columns min-max scaled.

    Names in ``columns`` that are not present in the frame are ignored. The
    input frame itself is not modified.

    Parameters:
    - dataframe: pandas DataFrame holding the data
    - columns: iterable of column names to scale

    Returns:
    - pandas DataFrame, a scaled copy of the input
    """
    scaled = dataframe.copy()

    # Keep only the requested columns that actually exist in the frame
    present = []
    for name in columns:
        if name in scaled.columns:
            present.append(name)

    if not present:
        return scaled

    # Fit a fresh MinMaxScaler on the selected columns and overwrite them
    scaler = MinMaxScaler()
    scaled[present] = scaler.fit_transform(scaled[present])
    return scaled

In [None]:
df_normalized = normalize_data(df3, normalized_cols)
df_normalized.shape

(117430, 42)

In [None]:
# Using bins instead of numerics
def replace_columns_with_bins(dataframe, column_list):
    """Return a copy of ``dataframe`` with numeric columns replaced by bin codes.

    Each listed numeric column is cut at [min, mean, mean + std, max] and
    replaced by the integer code of the bin each value falls into (-1 for
    values that fall into no bin, e.g. NaN). Names that are missing from the
    frame or refer to non-numeric columns are skipped. Every name is printed
    as it is processed. The input frame is not modified.

    Parameters:
    - dataframe: pandas DataFrame holding the data
    - column_list: iterable of column names to bin

    Returns:
    - pandas DataFrame, a binned copy of the input
    """
    df_copy = dataframe.copy()

    for column_name in column_list:
        print(column_name)
        if column_name not in df_copy.columns:
            continue
        values = df_copy[column_name]
        if not pd.api.types.is_numeric_dtype(values):
            continue

        col_min, col_mean, col_max = values.min(), values.mean(), values.max()
        # mean + std can exceed the maximum for skewed or tiny columns; cap it
        # so the edges stay monotonic and pd.cut does not raise. The capped
        # edge then duplicates the max and is removed by duplicates='drop'.
        upper_mid = np.fmin(col_mean + values.std(), col_max)
        bin_ranges = [col_min, col_mean, upper_mid, col_max]

        df_copy[column_name] = pd.cut(values,
                                      bins=bin_ranges,
                                      include_lowest=True,
                                      duplicates='drop').cat.codes
    return df_copy

In [None]:
df_normalized.head()

Unnamed: 0,reservedroomtype,children,assignedroomtype,adults,totalofspecialrequests,staysinweeknights,meal,requiredcarparkingspaces,leadtime,previouscancellations,previousbookingsnotcanceled,country,company,agent,staysinweekendnights,reservationstatusdate,adr,arrivaldate,isrepeatedguest_cat_0,isrepeatedguest_cat_1,deposittype_cat_No Deposit,deposittype_cat_Non Refund,deposittype_cat_Refundable,hoteltype_cat_city,hoteltype_cat_resort,customertype_cat_Contract,customertype_cat_Group,customertype_cat_Transient,customertype_cat_Transient-Party,marketsegment_cat_Aviation,marketsegment_cat_Complementary,marketsegment_cat_Corporate,marketsegment_cat_Direct,marketsegment_cat_Groups,marketsegment_cat_Offline TA/TO,marketsegment_cat_Online TA,marketsegment_cat_Undefined,distributionchannel_cat_Corporate,distributionchannel_cat_Direct,distributionchannel_cat_GDS,distributionchannel_cat_TA/TO,distributionchannel_cat_Undefined
2,A,0.0,C,1,0.0,1,BB,0.0,7,0.0,0.0,GBR,,,0,2015-07-02,75.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0
3,A,0.0,A,1,0.0,1,BB,0.0,13,0.0,0.0,GBR,,304.0,0,2015-07-02,75.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,A,0.0,A,2,0.2,2,BB,0.0,14,0.0,0.0,GBR,,240.0,0,2015-07-03,98.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
5,A,0.0,A,2,0.2,2,BB,0.0,14,0.0,0.0,GBR,,240.0,0,2015-07-03,98.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
6,C,0.0,C,2,0.0,2,BB,0.0,0,0.0,0.0,PRT,,,0,2015-07-03,107.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [None]:
df_normalized.columns

Index(['reservedroomtype', 'children', 'assignedroomtype', 'adults', 'totalofspecialrequests', 'staysinweeknights', 'meal', 'requiredcarparkingspaces', 'leadtime', 'previouscancellations', 'previousbookingsnotcanceled', 'country', 'company', 'agent', 'staysinweekendnights', 'reservationstatusdate', 'adr', 'arrivaldate', 'isrepeatedguest_cat_0', 'isrepeatedguest_cat_1', 'deposittype_cat_No Deposit     ', 'deposittype_cat_Non Refund     ', 'deposittype_cat_Refundable     ', 'hoteltype_cat_city', 'hoteltype_cat_resort', 'customertype_cat_Contract', 'customertype_cat_Group', 'customertype_cat_Transient', 'customertype_cat_Transient-Party', 'marketsegment_cat_Aviation', 'marketsegment_cat_Complementary', 'marketsegment_cat_Corporate', 'marketsegment_cat_Direct', 'marketsegment_cat_Groups', 'marketsegment_cat_Offline TA/TO', 'marketsegment_cat_Online TA', 'marketsegment_cat_Undefined', 'distributionchannel_cat_Corporate', 'distributionchannel_cat_Direct', 'distributionchannel_cat_GDS',
     

In [None]:
# Using bins instead of numerics
def replace_with_bins(dataframe, column_name, bin_ranges):
    """Return a copy of ``dataframe`` with one column replaced by bin codes.

    The column is cut at the explicit edges in ``bin_ranges`` (the lowest edge
    is inclusive) and each value is replaced by the integer code of its bin;
    values that fall outside every bin get -1. The input frame is not modified.

    Parameters:
    - dataframe: pandas DataFrame holding the data
    - column_name: str, the column to bin
    - bin_ranges: sequence of bin edges passed to pd.cut

    Returns:
    - pandas DataFrame, a copy with the binned column
    """
    binned = pd.cut(dataframe[column_name], bins=bin_ranges, include_lowest=True)
    result = dataframe.copy()
    result[column_name] = binned.cat.codes
    return result


In [None]:
# Creating dictionary for converting category to incremental number
def create_dict_with_incremental_integers(df, column_name):
    """Map each distinct value of a column to an integer code starting at 1.

    Codes follow the order in which the values first appear in the column.

    Parameters:
    - df: pandas DataFrame holding the data
    - column_name: str, the column whose values are encoded

    Returns:
    - dict mapping each unique value to its integer code
    """
    unique_values = df[column_name].unique()
    value_mapping = {value: code for code, value in enumerate(unique_values, start=1)}
    return value_mapping

In [None]:
# Replacing cats with numerics
def replace_column_with_dict(df, column_name, mapping_dict):
    """Replace the values of one column according to ``mapping_dict``.

    Values that are keys of the mapping are replaced by the corresponding
    mapped value; all other values are kept. The frame is modified in place
    and also returned.

    Parameters:
    - df: pandas DataFrame holding the data
    - column_name: str, the column to recode
    - mapping_dict: dict of original value -> replacement value

    Returns:
    - df: the same DataFrame object with the recoded column
    """
    originals = mapping_dict.keys()
    replacements = mapping_dict.values()
    recoded = df[column_name].replace(originals, replacements)
    df[column_name] = recoded
    return df

In [None]:
df_normalized.head()

Unnamed: 0,reservedroomtype,children,assignedroomtype,adults,totalofspecialrequests,staysinweeknights,meal,requiredcarparkingspaces,leadtime,previouscancellations,previousbookingsnotcanceled,country,company,agent,staysinweekendnights,reservationstatusdate,adr,arrivaldate,isrepeatedguest_cat_0,isrepeatedguest_cat_1,deposittype_cat_No Deposit,deposittype_cat_Non Refund,deposittype_cat_Refundable,hoteltype_cat_city,hoteltype_cat_resort,customertype_cat_Contract,customertype_cat_Group,customertype_cat_Transient,customertype_cat_Transient-Party,marketsegment_cat_Aviation,marketsegment_cat_Complementary,marketsegment_cat_Corporate,marketsegment_cat_Direct,marketsegment_cat_Groups,marketsegment_cat_Offline TA/TO,marketsegment_cat_Online TA,marketsegment_cat_Undefined,distributionchannel_cat_Corporate,distributionchannel_cat_Direct,distributionchannel_cat_GDS,distributionchannel_cat_TA/TO,distributionchannel_cat_Undefined
2,A,0.0,C,1,0.0,1,BB,0.0,7,0.0,0.0,GBR,,,0,2015-07-02,75.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0
3,A,0.0,A,1,0.0,1,BB,0.0,13,0.0,0.0,GBR,,304.0,0,2015-07-02,75.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,A,0.0,A,2,0.2,2,BB,0.0,14,0.0,0.0,GBR,,240.0,0,2015-07-03,98.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
5,A,0.0,A,2,0.2,2,BB,0.0,14,0.0,0.0,GBR,,240.0,0,2015-07-03,98.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
6,C,0.0,C,2,0.0,2,BB,0.0,0,0.0,0.0,PRT,,,0,2015-07-03,107.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [None]:
df5 = df_normalized.copy()

# Pair every categorical column with its own value -> integer-code mapping
# (a list of (column name, mapping dict) tuples)
columns_and_mappings = [
    (categorical_col, create_dict_with_incremental_integers(df5, categorical_col))
    for categorical_col in ('meal', 'country', 'reservedroomtype', 'assignedroomtype')
]

In [None]:

# Iterate over the (column name, mapping dict) pairs and recode each column using the function
for column_name, mapping_dict in columns_and_mappings:
    df5 = replace_column_with_dict(df5, column_name, mapping_dict)


In [None]:
df5.head()

Unnamed: 0,reservedroomtype,children,assignedroomtype,adults,totalofspecialrequests,staysinweeknights,meal,requiredcarparkingspaces,leadtime,previouscancellations,previousbookingsnotcanceled,country,company,agent,staysinweekendnights,reservationstatusdate,adr,arrivaldate,isrepeatedguest_cat_0,isrepeatedguest_cat_1,deposittype_cat_No Deposit,deposittype_cat_Non Refund,deposittype_cat_Refundable,hoteltype_cat_city,hoteltype_cat_resort,customertype_cat_Contract,customertype_cat_Group,customertype_cat_Transient,customertype_cat_Transient-Party,marketsegment_cat_Aviation,marketsegment_cat_Complementary,marketsegment_cat_Corporate,marketsegment_cat_Direct,marketsegment_cat_Groups,marketsegment_cat_Offline TA/TO,marketsegment_cat_Online TA,marketsegment_cat_Undefined,distributionchannel_cat_Corporate,distributionchannel_cat_Direct,distributionchannel_cat_GDS,distributionchannel_cat_TA/TO,distributionchannel_cat_Undefined
2,1,0.0,1,1,0.0,1,1,0.0,7,0.0,0.0,1,,,0,2015-07-02,75.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0
3,1,0.0,2,1,0.0,1,1,0.0,13,0.0,0.0,1,,304.0,0,2015-07-02,75.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,1,0.0,2,2,0.2,2,1,0.0,14,0.0,0.0,1,,240.0,0,2015-07-03,98.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
5,1,0.0,2,2,0.2,2,1,0.0,14,0.0,0.0,1,,240.0,0,2015-07-03,98.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
6,2,0.0,1,2,0.0,2,1,0.0,0,0.0,0.0,2,,,0,2015-07-03,107.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [None]:
df5.staysinweekendnights.max()

19

In [None]:
# Setting test date threshold
test_split_date = '2017-08-01'
result_df = df5[df5.arrivaldate >= test_split_date][['company', 'agent', 'arrivaldate', 'adr']].copy()


In [None]:
result_df.arrivaldate.max(), result_df.arrivaldate.min(), result_df.shape

(Timestamp('2017-08-31 00:00:00'), Timestamp('2017-08-01 00:00:00'), (4897, 4))

In [None]:
# Drop rows without an agent, then strip the padding whitespace around the agent/company IDs
df5.dropna(subset=['agent'], inplace=True)
df5['agent'] = df5['agent'].str.strip()
df5['company'] = df5['company'].str.strip()



In [None]:
# Replacing the 'NULL' placeholder with 0 in the agent and company ID columns
# NOTE(review): the remaining IDs stay strings, so both columns hold a mix of
# the int 0 and str values — confirm the later scaling step is meant to rely on
# an implicit string-to-number conversion.
df5['agent'] = df5['agent'].replace('NULL', 0)
df5['company'] = df5['company'].replace('NULL', 0)


In [None]:
df5.agent.unique()

array([0, '304', '240', '303', '15', '241', '8', '250', '115', '5', '175',
       '134', '156', '243', '242', '3', '105', '40', '147', '306', '184',
       '96', '2', '127', '95', '146', '9', '177', '6', '143', '244',
       '149', '167', '300', '171', '305', '67', '196', '152', '142',
       '261', '104', '36', '26', '29', '258', '110', '71', '181', '88',
       '251', '275', '69', '248', '208', '256', '314', '126', '281',
       '273', '185', '330', '334', '328', '326', '321', '324', '313',
       '38', '155', '68', '335', '308', '332', '94', '348', '310', '339',
       '375', '66', '327', '387', '298', '91', '245', '253', '385', '257',
       '393', '168', '405', '249', '315', '75', '128', '307', '11', '436',
       '1', '201', '183', '223', '368', '336', '291', '464', '411', '481',
       '10', '154', '468', '410', '390', '440', '495', '492', '493',
       '434', '57', '531', '420', '483', '526', '472', '429', '16', '446',
       '34', '78', '139', '252', '270', '47', '114', '301',

In [None]:
df5.shape

(117430, 42)

In [None]:
normalized_cols_V2 = ['reservedroomtype' , 'assignedroomtype' , 'adults' , 'staysinweeknights' , 'meal' ,
                      'leadtime' , 'country', 'company' , 'agent' , 'staysinweekendnights']
df6 = normalize_data(df5, normalized_cols_V2)
df6.shape

(117430, 42)

In [None]:
#ReservationStatusDate
df6.head()

Unnamed: 0,reservedroomtype,children,assignedroomtype,adults,totalofspecialrequests,staysinweeknights,meal,requiredcarparkingspaces,leadtime,previouscancellations,previousbookingsnotcanceled,country,company,agent,staysinweekendnights,reservationstatusdate,adr,arrivaldate,isrepeatedguest_cat_0,isrepeatedguest_cat_1,deposittype_cat_No Deposit,deposittype_cat_Non Refund,deposittype_cat_Refundable,hoteltype_cat_city,hoteltype_cat_resort,customertype_cat_Contract,customertype_cat_Group,customertype_cat_Transient,customertype_cat_Transient-Party,marketsegment_cat_Aviation,marketsegment_cat_Complementary,marketsegment_cat_Corporate,marketsegment_cat_Direct,marketsegment_cat_Groups,marketsegment_cat_Offline TA/TO,marketsegment_cat_Online TA,marketsegment_cat_Undefined,distributionchannel_cat_Corporate,distributionchannel_cat_Direct,distributionchannel_cat_GDS,distributionchannel_cat_TA/TO,distributionchannel_cat_Undefined
2,0.0,0.0,0.0,0.25,0.0,0.02,0.0,0.0,0.009873,0.0,0.0,0.0,0.0,0.0,0.0,2015-07-02,75.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0
3,0.0,0.0,0.1,0.25,0.0,0.02,0.0,0.0,0.018336,0.0,0.0,0.0,0.0,0.568224,0.0,2015-07-02,75.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,0.0,0.0,0.1,0.5,0.2,0.04,0.0,0.0,0.019746,0.0,0.0,0.0,0.0,0.448598,0.0,2015-07-03,98.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
5,0.0,0.0,0.1,0.5,0.2,0.04,0.0,0.0,0.019746,0.0,0.0,0.0,0.0,0.448598,0.0,2015-07-03,98.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
6,0.125,0.0,0.0,0.5,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.00565,0.0,0.0,0.0,2015-07-03,107.0,2015-07-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [None]:
df6[df6.reservationstatusdate <= df6.arrivaldate].shape, df6.shape

((44034, 42), (117430, 42))

In [None]:
# Set a 0/1 flag for bookings whose reservation status date is on or before the arrival date
# NOTE(review): no visible cell converts 'reservationstatusdate' with pd.to_datetime —
# presumably pandas parses the strings during the comparison; confirm.
# The column name keeps its original spelling because it is a runtime key.
df6['isreservationcahanged'] = (df6['reservationstatusdate'] <= df6['arrivaldate']).astype(int)

df6[['isreservationcahanged', 'reservationstatusdate', 'arrivaldate']].head(10)


Unnamed: 0,isreservationcahanged,reservationstatusdate,arrivaldate
2,0,2015-07-02,2015-07-01
3,0,2015-07-02,2015-07-01
4,0,2015-07-03,2015-07-01
5,0,2015-07-03,2015-07-01
6,0,2015-07-03,2015-07-01
7,0,2015-07-03,2015-07-01
8,1,2015-05-06,2015-07-01
9,1,2015-04-22,2015-07-01
10,1,2015-06-23,2015-07-01
11,0,2015-07-05,2015-07-01


In [None]:
df6.shape

(117430, 43)

In [None]:
df6 = df6.drop(columns='reservationstatusdate')
df6.shape

(117430, 42)

In [None]:
df6.arrivaldate.max()

Timestamp('2017-08-31 00:00:00')

In [None]:
df7 = df6[df6.arrivaldate >= '2016-08-01'].copy()
df7.arrivaldate.max(),df7.arrivaldate.min(),df7.shape

(Timestamp('2017-08-31 00:00:00'),
 Timestamp('2016-08-01 00:00:00'),
 (64831, 42))

In [None]:
# Split the data into train and test sets by arrival date:
# bookings arriving on or after the threshold form the test set,
# all earlier bookings form the training set.
test_split_date = '2017-08-01'
test_df = df7[df7.arrivaldate >= test_split_date].copy()
train_df = df7[df7.arrivaldate < test_split_date].copy()
train_df.shape, test_df.shape

((59934, 42), (4897, 42))

In [None]:
train_df.arrivaldate.min(), train_df.arrivaldate.max()

(Timestamp('2016-08-01 00:00:00'), Timestamp('2017-07-31 00:00:00'))

In [None]:
test_df.arrivaldate.min(), test_df.arrivaldate.max()

(Timestamp('2017-08-01 00:00:00'), Timestamp('2017-08-31 00:00:00'))

In [None]:
# Defining a method to move target data to the end of the dataframe
def move_target_to_end(df, target_column):
    """
    Return the DataFrame with the target column placed last.

    Parameters:
    - df: pandas DataFrame
    - target_column: str, the name of the target column

    Returns:
    - pandas DataFrame with every other column in its original order followed by
      the target column. If the target column does not exist, an error message
      is printed and the input DataFrame is returned unchanged.
    """
    if target_column in df.columns:
        # All non-target columns keep their relative order; the target goes last
        leading_columns = [name for name in df.columns if name != target_column]
        return df[leading_columns + [target_column]]

    print(f"Error: Target column '{target_column}' not found in the DataFrame.")
    return df

In [None]:
# Place the 'adr' target in the last column of both splits (train first, then test)
train_df, test_df = (move_target_to_end(split, 'adr') for split in (train_df, test_df))

In [None]:
test_df.head()

Unnamed: 0,reservedroomtype,children,assignedroomtype,adults,totalofspecialrequests,staysinweeknights,meal,requiredcarparkingspaces,leadtime,previouscancellations,previousbookingsnotcanceled,country,company,agent,staysinweekendnights,arrivaldate,isrepeatedguest_cat_0,isrepeatedguest_cat_1,deposittype_cat_No Deposit,deposittype_cat_Non Refund,deposittype_cat_Refundable,hoteltype_cat_city,hoteltype_cat_resort,customertype_cat_Contract,customertype_cat_Group,customertype_cat_Transient,customertype_cat_Transient-Party,marketsegment_cat_Aviation,marketsegment_cat_Complementary,marketsegment_cat_Corporate,marketsegment_cat_Direct,marketsegment_cat_Groups,marketsegment_cat_Offline TA/TO,marketsegment_cat_Online TA,marketsegment_cat_Undefined,distributionchannel_cat_Corporate,distributionchannel_cat_Direct,distributionchannel_cat_GDS,distributionchannel_cat_TA/TO,distributionchannel_cat_Undefined,isreservationcahanged,adr
13115,0.25,0.0,0.2,0.5,0.0,0.02,0.0,0.0,0.172073,0.0,0.0,0.141243,0.0,0.448598,0.0,2017-08-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,209.0
13116,0.0,0.0,0.1,0.5,0.2,0.06,0.0,0.0,0.110014,0.0,0.0,0.19774,0.0,0.448598,0.0,2017-08-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,230.0
13117,0.0,0.0,0.1,0.25,0.2,0.08,0.0,0.0,0.094499,0.0,0.0,0.00565,0.0,0.586916,0.0,2017-08-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,188.625
13118,0.0,0.0,0.1,0.5,0.2,0.08,0.0,0.0,0.09732,0.0,0.0,0.00565,0.0,0.452336,0.0,2017-08-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,230.0
13119,0.0,0.0,0.1,0.5,0.0,0.08,0.0,0.0,0.279267,0.0,0.0,0.067797,0.0,0.448598,0.0,2017-08-01,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,157.25


In [None]:
# Calculating day difference for date column
def calculate_day_difference(df, column_name, reference_date=None):
    """
    Replace a date column with the number of whole days before a reference date.

    Parameters:
    - df: pandas DataFrame; the column is overwritten in place and df is returned
    - column_name: str, name of the column holding dates (anything accepted by
      pd.to_datetime)
    - reference_date: optional date-like value the differences are measured from.
      When omitted, the latest date found in the column itself is used, so the
      newest row gets 0. Pass the same value for several frames (e.g. train and
      test) to keep the resulting feature on a common scale.

    Returns:
    - df: the same DataFrame, with column_name holding day counts
    """
    # Stay in datetime64 and only strip the time of day; converting to Python
    # date objects would produce an object column that the .dt accessor below
    # cannot reliably work with.
    dates = pd.to_datetime(df[column_name]).dt.normalize()

    if reference_date is None:
        reference = dates.max()
    else:
        reference = pd.Timestamp(reference_date).normalize()

    df[column_name] = (reference - dates).dt.days
    return df

In [None]:
# Convert arrivaldate to a day count in each split (test first, then train);
# each call measures from that split's own latest arrival date
test_df, train_df = (
    calculate_day_difference(split, 'arrivaldate') for split in (test_df, train_df)
)

train_df.loc[:, ['arrivaldate', 'adr']].tail()

Unnamed: 0,arrivaldate,adr
117737,0,112.5
117772,0,132.5
117821,6,107.125
117822,6,107.125
117829,0,170.0


In [None]:
# Count missing values per column, first for the training split, then for the test split
print(train_df.isna().sum())
print(test_df.isna().sum())

reservedroomtype                     0
children                             0
assignedroomtype                     0
adults                               0
totalofspecialrequests               0
staysinweeknights                    0
meal                                 0
requiredcarparkingspaces             0
leadtime                             0
previouscancellations                0
previousbookingsnotcanceled          0
country                              0
company                              0
agent                                0
staysinweekendnights                 0
arrivaldate                          0
isrepeatedguest_cat_0                0
isrepeatedguest_cat_1                0
deposittype_cat_No Deposit           0
deposittype_cat_Non Refund           0
deposittype_cat_Refundable           0
hoteltype_cat_city                   0
hoteltype_cat_resort                 0
customertype_cat_Contract            0
customertype_cat_Group               0
customertype_cat_Transien

In [None]:
# Replaces NaN values in one column of the DataFrame (with 0 by default; the column mean is opt-in).

def fill_na_with_mean(df, column_name, fill_value=0):
    """
    Fill the missing values of a single column.

    Parameters:
    - df: pandas DataFrame; the column is overwritten in place and df is returned
    - column_name: str, the column whose NaN values are replaced
    - fill_value: replacement value. Defaults to 0, which is what this function
      has always filled with despite its name. Pass None to fill with the mean
      of the column instead (numeric columns only).

    Returns:
    - df: the same DataFrame with the column's NaN values replaced
    """
    # Only compute the mean when it is actually requested; doing it
    # unconditionally fails on non-numeric columns for no benefit.
    if fill_value is None:
        fill_value = df[column_name].mean()

    # Assign the result back rather than calling fillna(..., inplace=True) on
    # df[column_name]: that chained form acts on an intermediate object and is
    # not guaranteed to update df (it does nothing under pandas copy-on-write).
    df[column_name] = df[column_name].fillna(fill_value)
    return df


In [None]:
# NaN imputation: run the fill helper over every column, using the training
# frame's column list for both splits
for col in train_df.columns:
    train_df = fill_na_with_mean(train_df, col)
    test_df = fill_na_with_mean(test_df, col)

In [None]:
# Convert target data from float to int
# astype(int) drops the fractional part (e.g. 188.625 becomes 188); the resulting
# whole numbers are what the classifiers further down receive as labels.
train_df['adr'] = train_df['adr'].astype(int)
test_df['adr'] = test_df['adr'].astype(int)

In [None]:
# Separate each split into model inputs (X) and the 'adr' target (y)
X_train = train_df.drop('adr', axis=1)  # features (independent variables)
y_train = train_df['adr'].copy()        # target (dependent variable)
X_test = test_df.drop('adr', axis=1)    # features (independent variables)
y_test = test_df['adr'].copy()          # target (dependent variable)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((59934, 41), (59934,), (4897, 41), (4897,))

In [None]:
# Shrink both feature frames with reduce_mem_usage (helper defined outside this
# part of the notebook); train is processed first, then test
X_train, X_test = (reduce_mem_usage(features) for features in (X_train, X_test))

Mem. usage decreased to  3.72 Mb (59.9% reduction)
Mem. usage decreased to  0.30 Mb (60.5% reduction)


In [None]:
X_train.shape, X_test.shape

((59934, 41), (4897, 41))

In [None]:
# Load a joblib model from the specified file path.

def load_joblib_model(model):
    """
    Load a persisted model from "<model>.sav" using joblib.

    Parameters:
    - model: base name of the file, without the ".sav" extension

    Returns:
    - the object read from the file, or None when loading fails for any reason
      (the error is printed rather than raised)
    """
    model_path = f"{model}.sav"

    try:
        restored = joblib.load(model_path)
    except Exception as e:
        # Best-effort load: report the problem and signal failure with None
        print(f"Error loading the joblib model: {e}")
        return None
    else:
        return restored

In [None]:
result_df.head()

Unnamed: 0,company,agent,arrivaldate,adr
13115,,240,2017-08-01,209.0
13116,,240,2017-08-01,230.0
13117,,314,2017-08-01,188.625
13118,,242,2017-08-01,230.0
13119,,240,2017-08-01,157.25


# RandomForestClassifier


In [None]:
# Candidate values for each random-forest hyperparameter explored by the grid search
# Number of trees; a single value is listed, so this dimension is effectively fixed
n_estimators = [700]
# Maximum depth of each tree
max_depth = [10, 8]
# Minimum number of samples needed to split an internal node
min_samples_split = [10, 5]
# Minimum number of samples required in a leaf
min_samples_leaf = [5, 3]

In [None]:
# Parameter grid for the random-forest search, keyed by estimator argument name
hyper_random = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
%%time

clf_rf_tuned = GridSearchCV(RandomForestClassifier(), hyper_random,
                            cv = 5, verbose = 1,
                            n_jobs = -1)
clf_rf_tuned.fit(X_train, y_train)
print('Model Trained')

best_params_random = clf_rf_tuned.best_params_
print(best_params_random)

CV_clf_rf = RandomForestClassifier(max_depth=best_params_random["max_depth"],
                                   min_samples_leaf=best_params_random["min_samples_leaf"],
                                   min_samples_split=best_params_random["min_samples_split"],
                                   n_estimators= best_params_random["n_estimators"])


CV_clf_rf.fit(X_train, y_train)
print('Model was fitted')

# save the model to disk
filename = 'random_forest_classifier.sav'
pickle.dump(CV_clf_rf, open(filename, 'wb'))
print('Model dumped')


Fitting 5 folds for each of 8 candidates, totalling 40 fits




Model Trained
{'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 700}
Model was fitted
Model dumped
CPU times: user 1min 29s, sys: 14.6 s, total: 1min 43s
Wall time: 23min 52s


In [None]:
# Load model
# Reads 'random_forest_classifier.sav'; load_joblib_model prints the error and
# returns None if the file cannot be loaded, so a failed load surfaces at predict().
CV_clf_rf = load_joblib_model('random_forest_classifier')

In [None]:


# Predict a label for every test row with the random forest
y_test_predict_random = CV_clf_rf.predict(X_test)
print('Model prediction completed')

# Share of test rows whose predicted label exactly equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_test_predict_random)
print(f"Accuracy: {accuracy}")

Model prediction completed
Accuracy: 0.08515417602613845


In [None]:
# Attach the random-forest predictions to the results table as a new column.
# NOTE(review): the column name is misspelled ("Prodecitons"). It is kept as-is here
# because it ends up in RF_Prediction_Results.csv; rename it deliberately, together
# with any consumer of that file.
result_df['Random_Forest_Prodecitons'] = y_test_predict_random
result_df.head()

Unnamed: 0,company,agent,arrivaldate,adr,Random_Forest_Prodecitons
13115,,240,2017-08-01,209.0,100
13116,,240,2017-08-01,230.0,80
13117,,314,2017-08-01,188.625,37
13118,,242,2017-08-01,230.0,80
13119,,240,2017-08-01,157.25,80


In [None]:
result_df.to_csv("RF_Prediction_Results.csv", index=False)



# CalibratedClassifierCV

In [None]:
# Analyzing the class distribution: how many training rows carry each integer 'adr' label
# (pandas is already imported as pd in the notebook's first cell, so no re-import is needed here)
class_distribution = pd.Series(y_train).value_counts()
print(class_distribution)

90     1107
80     1044
130     900
95      854
75      846
       ... 
14        1
360       1
344       1
397       1
339       1
Name: adr, Length: 331, dtype: int64


In [None]:
# Read dataset
result_file_path = 'Prediction_Results.csv'

try:
    result_df = pd.read_csv(result_file_path)
except pd.errors.ParserError as e:
    print(f'Error while parsing CSV file: {e}')
    # Re-raise after reporting: without a freshly parsed frame the line below
    # would fail with a NameError or silently use a stale result_df.
    raise

print(result_df.shape)

(4897, 8)


In [None]:
%%time

# Create a corrected classifier.

clf_sigmoid = CalibratedClassifierCV(CV_clf_rf, method='sigmoid')

clf_sigmoid.fit(X_train, y_train)
print('Model was fitted')

# save the model to disk
filename = 'calibrated_classifier.sav'
pickle.dump(clf_sigmoid, open(filename, 'wb'))
print('Model dumped')




Model was fitted


In [None]:
# Predict test-set labels with the calibrated model held in clf_sigmoid
# (to reuse a saved model instead: clf_sigmoid = load_joblib_model('calibrated_classifier'))
y_test_predict_random_calibrated = clf_sigmoid.predict(X_test)

# Share of test rows whose predicted label exactly equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_test_predict_random_calibrated)
print(f"Accuracy: {accuracy}")

In [None]:
# Read dataset

%%time
result_file_path = 'Prediction_Results.csv'

try:
    result_df = pd.read_csv(result_file_path)
except pd.errors.ParserError as e:
    print(f'Error while parsing CSV file: {e}')



result_df.shape

In [None]:
# Add the calibrated model's predictions as a new column and overwrite
# Prediction_Results.csv with the extended table (row index not written).
# NOTE(review): despite the "_Probability" suffix, the values come from
# clf_sigmoid.predict(), i.e. predicted labels rather than predict_proba() output.
# Confirm which of the two is intended.
result_df['Calibrated_Random_Forest_Probability'] = y_test_predict_random_calibrated
result_df.to_csv("Prediction_Results.csv", index=False)


In [None]:
result_df.head(20)