* https://www.sciencedirect.com/science/article/pii/S2352340918315191
* https://www.kaggle.com/datasets/jessemostipak/hotel-booking-demand

<img src="https://ars.els-cdn.com/content/image/1-s2.0-S2352340918315191-gr1.jpg">

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
df_csv = pd.read_csv("data/hotel_bookings.csv")
df_csv.shape

(119390, 32)

In [3]:
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [4]:
chunk_size = 1e4
chunk_iter = pd.read_csv("data/hotel_bookings.csv", chunksize=chunk_size)

In [5]:
def downcast(df_chunk):
    for col in df_chunk.columns:
        dtypes_name = df_chunk[col].dtypes.name
        if dtypes_name.startswith("float"):
            df_chunk[col] = pd.to_numeric(df_chunk[col], downcast="float")
        elif dtypes_name.startswith("int"):
            # 최솟값을 구해서 음수가 있을 때는 integer
            # 음수가 없을 때는 unsigned
            if df_chunk[col].min() < 0 :
                df_chunk[col] = pd.to_numeric(df_chunk[col], downcast="integer")
            else:
                df_chunk[col] = pd.to_numeric(df_chunk[col], downcast="unsigned")
        # 문자일 때는 category 로 변경해 줍니다.
        # 카디널리티가 높거나 텍스트 데이터에는 적합하지 않을 수 있습니다.
        elif dtypes_name.startswith("object"):
                df_chunk[col] = df_chunk[col].astype("category")
    return df_chunk

In [6]:
row_count = 0
chunk_list = [] 
for chunk in chunk_iter:
    print(chunk.shape)
    row_count = row_count + chunk.shape[0]
    chunk_list.append(downcast(chunk))
row_count 

(10000, 32)
(10000, 32)
(10000, 32)
(10000, 32)
(10000, 32)
(10000, 32)
(10000, 32)
(10000, 32)
(10000, 32)
(10000, 32)
(10000, 32)
(9390, 32)


119390

In [7]:
df = pd.concat(chunk_list, ignore_index=True)
df.shape

(119390, 32)

In [8]:
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype   
---  ------                          --------------   -----   
 0   hotel                           119390 non-null  object  
 1   is_canceled                     119390 non-null  uint8   
 2   lead_time                       119390 non-null  uint16  
 3   arrival_date_year               119390 non-null  uint16  
 4   arrival_date_month              119390 non-null  object  
 5   arrival_date_week_number        119390 non-null  uint8   
 6   arrival_date_day_of_month       119390 non-null  uint8   
 7   stays_in_weekend_nights         119390 non-null  uint8   
 8   stays_in_week_nights            119390 non-null  uint8   
 9   adults                          119390 non-null  uint8   
 10  children                        119386 non-null  float32 
 11  babies                          119390 non-null  uint8   
 12  me