### Import Statements

In [28]:
import numpy as np
import pandas as pd

import random
from pprint import pprint

### Load And Prepare Data

In [29]:
df = pd.read_csv("Hotel-Reservations.csv")
df = df.drop(columns="Booking_ID",axis=1)
df = df.rename(columns={"booking_status":"label"})

In [30]:
df['date'] = pd.to_datetime(dict(year=df.arrival_year, month=df.arrival_month, day=df.arrival_date), errors='coerce')
df["no_of_people"] = df["no_of_adults"]+df["no_of_children"]
df["no_of_nights"] = df["no_of_week_nights"]+df["no_of_weekend_nights"]
df['total_previous_booking'] = df['no_of_previous_cancellations'] + df['no_of_previous_bookings_not_canceled']

In [31]:
features = ['no_of_people','no_of_nights', 'total_previous_booking', 'room_type_reserved', 'avg_price_per_room','lead_time','market_segment_type','repeated_guest',
'no_of_special_requests', 'type_of_meal_plan', 'label']

In [32]:
# features = ['no_of_people','no_of_nights','avg_price_per_room','lead_time','total_previous_booking', 'label']

In [33]:
df = df[features]
df.head()

Unnamed: 0,no_of_people,no_of_nights,total_previous_booking,room_type_reserved,avg_price_per_room,lead_time,market_segment_type,repeated_guest,no_of_special_requests,type_of_meal_plan,label
0,2,3,0,Room_Type 1,65.0,224,Offline,0,0,Meal Plan 1,Not_Canceled
1,2,5,0,Room_Type 1,106.68,5,Online,0,1,Not Selected,Not_Canceled
2,1,3,0,Room_Type 1,60.0,1,Online,0,0,Meal Plan 1,Canceled
3,2,2,0,Room_Type 1,100.0,211,Online,0,0,Meal Plan 1,Canceled
4,2,2,0,Room_Type 1,94.5,48,Online,0,0,Not Selected,Canceled


In [34]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["room_type_reserved"] = le.fit_transform(df["room_type_reserved"])
df["market_segment_type"] = le.fit_transform(df["market_segment_type"])
df["type_of_meal_plan"] = le.fit_transform(df["type_of_meal_plan"])


In [35]:
for column in df.columns:
    print(column, '-'  ,len(df[column].unique()))

no_of_people - 8
no_of_nights - 25
total_previous_booking - 63
room_type_reserved - 7
avg_price_per_room - 3930
lead_time - 352
market_segment_type - 5
repeated_guest - 2
no_of_special_requests - 6
type_of_meal_plan - 4
label - 2


### Train_Test_Split

In [36]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices,axis='index')
    
    return train_df, test_df

In [37]:
train_df, test_df = train_test_split(df, test_size = 0.2)

In [38]:
train_df.head()

Unnamed: 0,no_of_people,no_of_nights,total_previous_booking,room_type_reserved,avg_price_per_room,lead_time,market_segment_type,repeated_guest,no_of_special_requests,type_of_meal_plan,label
0,2,3,0,0,65.0,224,3,0,0,0,Not_Canceled
1,2,5,0,0,106.68,5,4,0,1,3,Not_Canceled
2,1,3,0,0,60.0,1,4,0,0,0,Canceled
5,2,2,0,0,115.0,346,4,0,1,1,Canceled
6,2,4,0,0,107.55,34,4,0,1,0,Not_Canceled


In [39]:
train_df.shape
test_df.shape

(7255, 11)

### Helper Functions

In [40]:
data = train_df.values
data[:5]

array([[2, 3, 0, 0, 65.0, 224, 3, 0, 0, 0, 'Not_Canceled'],
       [2, 5, 0, 0, 106.68, 5, 4, 0, 1, 3, 'Not_Canceled'],
       [1, 3, 0, 0, 60.0, 1, 4, 0, 0, 0, 'Canceled'],
       [2, 2, 0, 0, 115.0, 346, 4, 0, 1, 1, 'Canceled'],
       [2, 4, 0, 0, 107.55, 34, 4, 0, 1, 0, 'Not_Canceled']], dtype=object)

Data pure?

In [41]:
def check_purity(data):

    label_column = data[:,-1]
    unique_class = np.unique(label_column)

    if len(unique_class) == 1:
        return True
    else:
        return False

In [42]:
check_purity(train_df.values)

False

Classify

In [43]:
def  classify_data(data):
    label_column = data[:,-1]
    unique_class, counts_unique_class = np.unique(label_column,return_counts=True)

    index = counts_unique_class.argmax()
    classification = unique_class[index]
    return classification

In [44]:
classify_data(test_df.values)

'Not_Canceled'

Potential Split

In [45]:
def get_potential_splits(data):
    potential_splits = {}
    _, n_column = data.shape

    for column_index in range(n_column - 1):
        potential_splits[column_index] = []
        values = data[:,column_index]
        unique_values = np.unique(values)

        potential_splits[column_index] = unique_values
    return potential_splits

In [46]:
get_potential_splits(test_df.values)

{0: array([1, 2, 3, 4, 5], dtype=object),
 1: array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 19, 20,
        21], dtype=object),
 2: array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 21,
        23, 28, 30, 33, 35, 37], dtype=object),
 3: array([0, 1, 2, 3, 4, 5, 6], dtype=object),
 4: array([0.0, 1.0, 2.0, ..., 349.63, 375.5, 540.0], dtype=object),
 5: array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
        53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
        70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
        87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
        103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
        116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
        1

In [47]:
train_df.values.shape

(29020, 11)

In [51]:
FEATURE_TYPES = ["continuous","continuous","continuous","categories",
        "continuous","continuous","categories","categories","continuous","categories"]

Split Data

In [52]:
def split_data(data, split_column, split_value):
    
    split_column_values = data[:, split_column]
    type_of_features = FEATURE_TYPES[split_column]
    if type_of_features == "continuous":
        data_below = data[split_column_values <= split_value]
        data_above = data[split_column_values >  split_value]
    else:
        data_below = data[split_column_values == split_value]
        data_above = data[split_column_values != split_value]
    return data_below, data_above

In [49]:
test_df.head()

Unnamed: 0,no_of_people,no_of_nights,total_previous_booking,room_type_reserved,avg_price_per_room,lead_time,market_segment_type,repeated_guest,no_of_special_requests,type_of_meal_plan,label
8253,1,4,0,0,132.05,28,4,0,0,3,Canceled
13814,2,6,0,0,89.25,144,4,0,0,3,Canceled
33125,2,4,0,0,144.0,62,4,0,1,0,Not_Canceled
11901,3,2,0,3,151.2,85,4,0,2,0,Not_Canceled
10442,2,4,0,0,88.4,38,4,0,2,0,Not_Canceled


Lowest Overall Entropy?

In [61]:
def calculate_entropy(data):
    
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)

    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
     
    return entropy

In [62]:
def calculate_overall_entropy(data_below, data_above):
    
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n

    overall_entropy =  (p_data_below * calculate_entropy(data_below) 
                      + p_data_above * calculate_entropy(data_above))
    
    return overall_entropy

In [63]:
def determine_best_split(data, potential_splits):
    
    overall_entropy = 9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)

            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value

In [None]:
train_df.head()

Unnamed: 0,no_of_people,no_of_nights,total_previous_booking,room_type_reserved,avg_price_per_room,lead_time,market_segment_type,repeated_guest,no_of_special_requests,type_of_meal_plan,label
0,2,3,0,0,65.0,224,3,0,0,0,Not_Canceled
1,2,5,0,0,106.68,5,4,0,1,3,Not_Canceled
3,2,2,0,0,100.0,211,4,0,0,0,Canceled
4,2,2,0,0,94.5,48,4,0,0,3,Canceled
5,2,2,0,0,115.0,346,4,0,1,1,Canceled


### Decision Tree Algorithm

In [None]:
tree = {"question":[{"question":[]},]}

In [59]:
def decision_tree_algorithm(df, counter=0, min_samples=5, max_depth=20):
    
    # data preparations
    if counter == 0:
        global COLUMN_HEADERS
        global FEATURE_TYPES
        COLUMN_HEADERS = df.columns
        FEATURE_TYPES = ["continuous","continuous","continuous","categories",
        "continuous","continuous","categories","categories","continuous","categories"]
        data = df.values
    else:
        data = df           
    
    
    # base cases
    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = classify_data(data)
        
        return classification

    
    # recursive part
    else:    
        counter += 1

        # helper functions 
        potential_splits = get_potential_splits(data)
        split_column, split_value = determine_best_split(data, potential_splits)
        data_below, data_above = split_data(data, split_column, split_value)
        
          # check for empty data
        if len(data_below) == 0 or len(data_above) == 0:
            classification = classify_data(data)
            return classification

        
        # determine question
        feature_name = COLUMN_HEADERS[split_column]
        type_of_feature = FEATURE_TYPES[split_column]
        if type_of_feature == "continuous":
            question = "{} <= {}".format(feature_name, split_value)
            
        # feature is categorical
        else:
            question = "{} = {}".format(feature_name, split_value)
            
        # instantiate sub-tree
        feature_name = COLUMN_HEADERS[split_column]
        question = "{} <= {}".format(feature_name, split_value)
        sub_tree = {question: []}
        
        # find answers (recursion)
        yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
        no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)
        
        # If the answers are the same, then there is no point in asking the qestion.
        # This could happen when the data is classified even though it is not pure
        # yet (min_samples or max_depth base cases).
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        
        return sub_tree

In [64]:
tree = decision_tree_algorithm(train_df,min_samples=10,max_depth=100)

In [65]:
pprint(tree,width=50)

{'lead_time <= 151': [{'no_of_special_requests <= 0': [{'market_segment_type <= 4': [{'lead_time <= 13': [{'avg_price_per_room <= 99.14': [{'avg_price_per_room <= 15.0': ['Not_Canceled',
                                                                                                                                                                           {'avg_price_per_room <= 60.0': [{'avg_price_per_room <= 59.94': [{'avg_price_per_room <= 44.6': ['Canceled',
                                                                                                                                                                                                                                                                            'Not_Canceled']},
                                                                                                                                                                                                                                            'Canceled']

## Classification

In [66]:
def classify_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split()

    # ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)

In [67]:
example = test_df.iloc[0]
example

no_of_people                     1
no_of_nights                     4
total_previous_booking           0
room_type_reserved               0
avg_price_per_room          132.05
lead_time                       28
market_segment_type              4
repeated_guest                   0
no_of_special_requests           0
type_of_meal_plan                3
label                     Canceled
Name: 8253, dtype: object

In [68]:
classify_example(example,tree)

'Canceled'

## Calculate Accuracy

In [69]:
def calculate_accuracy(df, tree):

    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df["classification"] == df["label"]
    
    accuracy = df["classification_correct"].mean()
    
    return accuracy

In [70]:
ac = calculate_accuracy(test_df,tree)
ac

0.7192281185389386