# Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Helper Functions

In [2]:
def create_leaf(y, ml_task):
    
    if ml_task == "regression":
        leaf = float(np.mean(y))
    else:
        counts = y.value_counts().reset_index()
        leaf = counts.iloc[0,0]
    
    return leaf


def get_potential_splits(data):
    
    X = data.drop(columns='target')
    potential_splits = {}
    columns = X.columns.tolist()
    for column in columns:

        values = X[[column]]
        unique_values = np.unique(values)
        
        potential_splits[column] = unique_values - 1
    
    return potential_splits


def calculate_gini(y):
    
    counts = y.value_counts().to_numpy()
    probabilities = counts / counts.sum()
    gini = np.sum(probabilities*(1-probabilities))
     
    return gini


def calculate_mse(y):
    
    if len(y) == 0:
        mse = 0
    else:
        mse = np.mean((y - np.mean(y)) **2)
    
    return mse


def total_impurity(data_left, data_right, metric_function):\

    n = len(data_left) + len(data_right)
    prop_left = len(data_left) / n
    prop_right = len(data_right) / n

    overall_metric =  (prop_left * metric_function(data_left['target']) 
                     + prop_right * metric_function(data_right['target']))
    
    return overall_metric


def split_data(data, column_types, split_column, split_value):
    
    type_of_feature = column_types[split_column]

    if type_of_feature == "continuous":
        data_left = data[data[split_column] <= split_value]
        data_right = data[data[split_column] >  split_value]
    
    else:
        data_left = data[data[split_column] == split_value]
        data_right = data[data[split_column] != split_value]
    
    return data_left, data_right


def determine_best_split(data, column_types, potential_splits, ml_task):

    best_overall_metric = np.inf
    for column, splits in potential_splits.items():
        for split in splits:
            
            data_left, data_right = split_data(data, column_types, split_column=column, split_value=split)
            
            if ml_task == "regression":
                node_impurity = total_impurity(data_left, data_right, metric_function=calculate_mse)
            else:
                node_impurity = total_impurity(data_left, data_right, metric_function=calculate_gini)
            
            if node_impurity <= best_overall_metric:
                best_overall_metric = node_impurity
                best_split_column = column
                best_split_value = split
    
    return best_split_column, best_split_value

# Algorithm

In [3]:
def decision_tree_algorithm(df, column_types, ml_task, min_samples=2, max_depth=5):
    
    leaves = []
    path = 'root'
    datasets = [(df,path)]
    split_conditions = []
    for current_depth in range(max_depth+1):
        next_set = []
        for dataset in datasets:
            data = dataset[0]
            path = dataset[1]
            
            if (len(data.target.unique()) == 1) or (len(data) < min_samples):
                leaf = create_leaf(data[['target']], ml_task)
                leaves.append((path,leaf))
                continue

            potential_splits = get_potential_splits(data)
            split_column, split_value = determine_best_split(data, column_types, potential_splits, ml_task)
            data_left, data_right = split_data(data, column_types, split_column, split_value)

            if len(data_left) == 0 or len(data_right) == 0:
                leaf = create_leaf(data[['target']], ml_task)
                leaves.append((path,leaf))
                continue
            print(len(data_left),len(data_right))
            split_conditions.append((path,split_column,split_value))
            next_set.append((data_left,path+',l'))
            next_set.append((data_right,path+',r'))

        datasets = next_set

    for dataset in datasets:
        data = dataset[0]
        path = dataset[1]
        leaf = create_leaf(data[['target']], ml_task)
        leaves.append((path,leaf))

    return leaves, split_conditions

# Make predictions with decision tree

def make_predictions(df, column_types, leaves, split_conditions):

    df['path'] = 'root'
    df['value'] = 0
    
    for split_condition in split_conditions:
        path = split_condition[0]
        column = split_condition[1]
        value = split_condition[2]

        if column_types[column] == "continuous":
            df.loc[(df['path']==path)&(df[column]<= value),'path'] = path+',l'
            df.loc[(df['path']==path)&(df[column]> value),'path'] = path+',r'
        else:
            df.loc[(df['path']==path)&(df[column]== value),'path'] = path+',l'
            df.loc[(df['path']==path)&(df[column]!= value),'path'] = path+',r'

    df['prediction'] = df['path'].map(dict(leaves))

    return df


def calculate_accuracy(df, column_types, ml_task, leaves, split_conditions):
    predictions = make_predictions(df, column_types, leaves, split_conditions).prediction
    
    if ml_task == 'regression':    
        predictions_array = predictions.values
        target_array = df.target.values
        metric = sum((predictions_array - target_array)**2) / len(predictions_array)
        
    else:
        predictions_correct = predictions == df.target
        metric = predictions_correct.mean()
    
    return  metric

# Data Loading & Preprocessing

In [4]:
airbnb = pd.read_csv('data/regression/AB_NYC_2019.csv')

# Fill string columns' na with unknown
airbnb['name'].fillna('unknown',inplace=True)
airbnb['host_name'].fillna('unknown',inplace=True)

# Fill numerical columns' na with the mean
airbnb['reviews_per_month'].fillna(0,inplace=True)
'''not filling last_review yet to use it for feature engineering'''

airbnb.isnull().sum()

#Creating that binary column (new listing : Yes/No)
airbnb["new_listing"] = 0
airbnb.loc[airbnb['last_review'].isnull(), 'new_listing'] = 1

#Now fill NA
airbnb['last_review'].fillna(0,inplace=True)

# Cast the last_review column to datetime
airbnb['last_review'] = pd.to_datetime(airbnb['last_review'])

# Create a column days since last review
airbnb['review_recency'] = (airbnb.last_review.max() - airbnb.last_review).dt.days

# Create columns that depict the availability category the listing belongs to
airbnb['all_year_avail'] = (airbnb['availability_365']>350).astype(int)
airbnb['low_avail'] = (airbnb['availability_365']< 12).astype(int)

# Create a variable that shows the number of months this listing has been in the platform
airbnb['months'] = airbnb['number_of_reviews']/airbnb['reviews_per_month']
airbnb['months'] = airbnb['months'].fillna(0)

# One-hot encode the room_type and neighbourhood_group
airbnb['type'] = 1
airbnb_pivoted = airbnb.pivot(columns='room_type',values='type').fillna(0)
airbnb = airbnb.join(airbnb_pivoted)
airbnb_pivoted = airbnb.pivot(columns='neighbourhood_group',values='type').fillna(0)
airbnb = airbnb.join(airbnb_pivoted)

# Label encoding of neighbourhood
airbnb['neighbourhood'] = airbnb['neighbourhood'].astype("category").cat.codes

# Drop the now unneccessary columns
airbnb = airbnb.drop(columns=['type','room_type','neighbourhood_group','availability_365','neighbourhood'])

# Getting rid of unimportant columns
airbnb.drop(columns=['id','name','host_id','host_name', 'last_review'],inplace=True)

# log(price)
airbnb['price'] = np.log(airbnb['price']+1)
airbnb = airbnb.rename(columns={'price':'target'})

airbnb.head()

Unnamed: 0,latitude,longitude,target,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,new_listing,review_recency,all_year_avail,low_avail,months,Entire home/apt,Private room,Shared room,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,40.64749,-73.97237,5.010635,1,9,0.21,6,0,262,1,0,42.857143,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,40.75362,-73.98377,5.420535,1,45,0.38,2,0,48,1,0,118.421053,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,40.80902,-73.9419,5.01728,3,0,0.0,1,1,18085,1,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,40.68514,-73.95976,4.49981,1,270,4.64,1,0,3,0,0,58.189655,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,40.79851,-73.94399,4.394449,10,9,0.1,1,0,231,0,1,90.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
train, test = train_test_split(airbnb, test_size=0.3, random_state=42)
column_types = {'latitude':'continuous', 'longitude':'continuous', 'target':'continuous', 
       'minimum_nights':'continuous', 'number_of_reviews':'continuous', 'reviews_per_month':'continuous',
       'calculated_host_listings_count':'continuous', 'new_listing':'categorical', 'review_recency':'continuous',
       'all_year_avail':'categorical', 'low_avail':'categorical', 'months':'continuous', 'Entire home/apt':'categorical',
       'Private room':'categorical', 'Shared room':'categorical', 'Bronx':'categorical', 'Brooklyn':'categorical', 
       'Manhattan':'categorical', 'Queens':'categorical', 'Staten Island':'categorical'}
ml_task = 'regression'

# Model Training

In [6]:
leaves, split_conditions = decision_tree_algorithm(train, column_types, ml_task, min_samples=2, max_depth=3)

16446 17780
10532 5914
8484 9296
10059 473
5486 428
1822 6662
5333 3963
8496 1563
208 265
4701 785
253 175
1697 125
6421 241
2658 2675
2742 1221


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [7]:
dict(leaves)

{'root,l,l,l,l': 4.206446201084287,
 'root,l,l,l,r': 4.027048306803904,
 'root,l,l,r,l': 3.9451700562045393,
 'root,l,l,r,r': 3.5441721699410005,
 'root,l,r,l,l': 4.5420568542088455,
 'root,l,r,l,r': 4.319507433838237,
 'root,l,r,r,l': 4.650549135701824,
 'root,l,r,r,r': 5.212642587756372,
 'root,r,l,l,l': 4.7969359719482645,
 'root,r,l,l,r': 5.155863219219758,
 'root,r,l,r,l': 5.002688684207182,
 'root,r,l,r,r': 5.404515509769971,
 'root,r,r,l,l': 5.503678356602868,
 'root,r,r,l,r': 5.300648698395708,
 'root,r,r,r,l': 5.117294809733429,
 'root,r,r,r,r': 5.338164514567498}

# Prediction

In [8]:
predict = make_predictions(test, column_types, leaves, split_conditions)
predict

Unnamed: 0,latitude,longitude,target,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,new_listing,review_recency,all_year_avail,...,Private room,Shared room,Bronx,Brooklyn,Manhattan,Queens,Staten Island,path,value,prediction
879,40.64354,-73.97777,4.499810,3,62,0.71,1,0,187,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,"root,r,l,r,l",0,5.002689
44383,40.70666,-73.90779,3.433987,21,0,0.00,1,1,18085,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,"root,l,l,l,r",0,4.027048
15394,40.76116,-73.99016,4.795791,2,17,0.43,1,0,801,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,"root,l,r,l,l",0,4.542057
43230,40.70763,-74.01050,6.154858,2,5,1.88,327,0,36,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,"root,r,r,l,l",0,5.503678
16332,40.79658,-73.93287,5.298317,2,30,0.80,1,0,35,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,"root,r,r,l,r",0,5.300649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9370,40.67267,-73.97871,4.189655,1,0,0.00,1,1,18085,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,"root,l,l,l,l",0,4.206446
34442,40.72071,-73.97822,4.025352,5,2,0.18,1,0,322,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,"root,l,r,l,l",0,4.542057
9164,40.71688,-73.94162,4.510860,2,18,0.37,1,0,49,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,"root,l,l,l,l",0,4.206446
11841,40.72647,-73.98379,4.564348,1,2,0.04,2,0,1336,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,"root,l,r,l,l",0,4.542057


In [9]:
MSE = calculate_accuracy(test, column_types, ml_task, leaves, split_conditions)
MSE

0.24209229480037586

In [10]:
# Specifically for this task where we take log(price+1) as target variable
# We need to exponentiate the price back to its original form

predictions_array = make_predictions(test, column_types, leaves, split_conditions).prediction.values
target_array = test.target.values
RMSE = np.sqrt(sum(np.exp(predictions_array)- np.exp(target_array))**2 / len(predictions_array))
RMSE

2792.8424072744224