In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
import requests
import time
import pycountry
from scipy.stats import linregress

In [2]:
# Relative path to the hotel bookings CSV
hotel_data_path = "data/hotel_bookings.csv"

# Load the CSV into a pandas DataFrame
hotel_data = pd.read_csv(filepath_or_buffer=hotel_data_path)

# Preview the first five rows
hotel_data.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [3]:
# Show the raw column labels exactly as they were read from the CSV
hotel_data.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')

In [4]:
# Mapping from the raw snake_case column names to human-readable labels:
# underscores become spaces and the first letter is capitalised; 'adr' is
# the one exception and is written fully upper-case as 'ADR'.
rename_columns = {
    'hotel': 'Hotel',
    'is_canceled': 'Is canceled',
    'lead_time': 'Lead time',
    'arrival_date_year': 'Arrival date year',
    'arrival_date_month': 'Arrival date month',
    'arrival_date_week_number': 'Arrival date week number',
    'arrival_date_day_of_month': 'Arrival date day of month',
    'stays_in_weekend_nights': 'Stays in weekend nights',
    'stays_in_week_nights': 'Stays in week nights',
    'adults': 'Adults',
    'children': 'Children',
    'babies': 'Babies',
    'meal': 'Meal',
    'country': 'Country',
    'market_segment': 'Market segment',
    'distribution_channel': 'Distribution channel',
    'is_repeated_guest': 'Is repeated guest',
    'previous_cancellations': 'Previous cancellations',
    'previous_bookings_not_canceled': 'Previous bookings not canceled',
    'reserved_room_type': 'Reserved room type',
    'assigned_room_type': 'Assigned room type',
    'booking_changes': 'Booking changes',
    'deposit_type': 'Deposit type',
    'agent': 'Agent',
    'company': 'Company',
    'days_in_waiting_list': 'Days in waiting list',
    'customer_type': 'Customer type',
    'adr': 'ADR',
    'required_car_parking_spaces': 'Required car parking spaces',
    'total_of_special_requests': 'Total of special requests',
    'reservation_status': 'Reservation status',
    'reservation_status_date': 'Reservation status date',
}

# Build the renamed working frame in a single step; `rename` returns a new
# DataFrame, so the original `hotel_data` keeps its raw column names.
new_hotel_data = hotel_data.rename(columns=rename_columns)

# Display the resulting column labels
new_hotel_data.columns

Index(['Hotel', 'Is canceled', 'Lead time', 'Arrival date year',
       'Arrival date month', 'Arrival date week number',
       'Arrival date day of month', 'Stays in weekend nights',
       'Stays in week nights', 'Adults', 'Children', 'Babies', 'Meal',
       'Country', 'Market segment', 'Distribution channel',
       'Is repeated guest', 'Previous cancellations',
       'Previous bookings not canceled', 'Reserved room type',
       'Assigned room type', 'Booking changes', 'Deposit type', 'Agent',
       'Company', 'Days in waiting list', 'Customer type', 'ADR',
       'Required car parking spaces', 'Total of special requests',
       'Reservation status', 'Reservation status date'],
      dtype='object')

In [5]:
# Print the per-column non-null counts and dtypes of the renamed frame
new_hotel_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Hotel                           119390 non-null  object 
 1   Is canceled                     119390 non-null  int64  
 2   Lead time                       119390 non-null  int64  
 3   Arrival date year               119390 non-null  int64  
 4   Arrival date month              119390 non-null  object 
 5   Arrival date week number        119390 non-null  int64  
 6   Arrival date day of month       119390 non-null  int64  
 7   Stays in weekend nights         119390 non-null  int64  
 8   Stays in week nights            119390 non-null  int64  
 9   Adults                          119390 non-null  int64  
 10  Children                        119386 non-null  float64
 11  Babies                          119390 non-null  int64  
 12  Meal            

In [6]:
# Remove the 'Company' column due to missing data.
# The result is rebound (no inplace=True) and errors='ignore' is passed so the
# cell is safe to re-run: a second execution is a no-op instead of raising
# KeyError for the already-removed column.
new_hotel_data = new_hotel_data.drop(columns='Company', errors='ignore')

In [22]:
# Export the data to a CSV file (currently disabled)
# NOTE(review): `hotel_data_df` is not defined in any cell visible here;
# confirm the intended DataFrame name before re-enabling this line.
## hotel_data_df.to_csv("output_data/hotel_data.csv", index_label="")

In [23]:
# Save the figure (currently disabled)
# NOTE(review): no figure is created in the cells visible here; presumably this
# call belongs in the same cell as the plot it should save. Confirm before
# re-enabling.
## plt.savefig("output_data/Fig1.png")