In [1]:
import pandas as pd
import numpy as np
import re
import json
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from pprint import pprint
import folium
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from collections import Counter

# Modelos
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Métricas
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import pickle



In [2]:
class airbnb:
    
    def __init__(self, csvs, city_names):
            
            self.l_dfs = list()
            
            for enum, dataset in enumerate(csvs):
                
                self.l_dfs.append(pd.read_csv(dataset))
                
                self.l_dfs[enum].drop("source", axis = 1, inplace = True)
                
                self.l_dfs[enum]["city"] = city_names[enum].lower()
        
            self.df = pd.concat(self.l_dfs)
            
            print("Instance created!")
            
    def return_initial_df(self):
    
        return self.df
    
    def display__initial_df(self):
    
        display(self.df)

    def clean_tested_columns(self):
        
        """
        Sets predefined columns, transforms price to a float column and separates bathroom_text 
        into 3 different categories, private, shared and unknown.
        """
        
        # Sets predefined columns
        
        tested_cols = ['neighbourhood_cleansed', 'city',
                       'room_type', 'accommodates', 'availability_365',
                       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
                       'minimum_nights', 'maximum_nights',
                       'number_of_reviews', 'reviews_per_month', 'host_total_listings_count']
        
        self.df_cleaned = self.df[tested_cols]
        
        # Transforms price to a float column
        
        self.df_cleaned["price"] = self.df_cleaned["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values
            
        # Get numbers out of bathroom_text columns
        
        self.df_cleaned = self.df_cleaned[self.df_cleaned["bathrooms_text"].isnull() == False]

        l_nums = [re.findall(r'\d+',i) for i in self.df_cleaned["bathrooms_text"].values]

        l_nums_completed = []

        for i in l_nums:

            if len(i) > 1:

                l_nums_completed.append('.'.join(i))

            elif len(i) == 0:

                l_nums_completed.append('0')

            else:

                l_nums_completed.append(i[0])
                
        # Replace bathrooms_text with floats
        
        self.df_cleaned["bathrooms_text"] = l_nums_completed

        self.df_cleaned["bathrooms_text"] = self.df_cleaned["bathrooms_text"].astype("float64")
        
        # Amenities
                
        l_amenities_cleaned = list()
        
        for i in self.df_cleaned["amenities"]:

            l_amenities_cleaned.append(json.loads(i))

        # Most relevant amenities, detailed analysis in the EDA file

        l_amenities_valuables = ['Long term stays allowed','Cooking basics','Dishes and silverware','Essentials','Coffee maker','Hair dryer','Microwave','Refrigerator','Heating','Air conditioning']

        for j in l_amenities_valuables:

            self.df_cleaned[j] = [1 if j in i else 0 for i in l_amenities_cleaned]

        self.df_cleaned.drop("amenities", axis =1, inplace=True)
    
        # Room type
        
        self.df_cleaned = self.df_cleaned[self.df_cleaned["room_type"] != "Hotel room"]
        self.df_cleaned = pd.concat([self.df_cleaned, pd.get_dummies(data = self.df_cleaned["room_type"])], axis = 1).drop("room_type", axis = 1)
        
        self.df_cleaned.dropna(inplace = True)
        
    def return_cleaned(self):
        
        return self.df_cleaned
    
    def display_cleaned(self):
        
        display(self.df_cleaned)
    
    def remove_outliers(self, accommodates = 8, bathrooms_min = 1, bathrooms_max = 2, bedrooms = 4, beds_min = 1, beds_max = 5, minimum_nights = 30,
                       maximum_nights = 70000, nreviews = 375, reviews_pmonth = 9, price = 350, htlc = 50000):

        self.df_cleaned = self.df_cleaned[self.df_cleaned["accommodates"] <= accommodates]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["bathrooms_text"].between(bathrooms_min, bathrooms_max)]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["bedrooms"] <= bedrooms]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["beds"].between(beds_min, beds_max)]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["minimum_nights"] <= minimum_nights]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["maximum_nights"] <= maximum_nights]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["number_of_reviews"] <= nreviews]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["reviews_per_month"] <= reviews_pmonth]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["price"] <= price]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["host_total_listings_count"] <= htlc]

        return self.df_cleaned
    
    def display_outliers(self):
        
        for i in self.df_cleaned.columns:
    
            print(i)
            sns.kdeplot(self.df_cleaned[i])
            plt.show()

    def label_encoding(self):
        
        city_encoder = LabelEncoder()
        self.df_cleaned["city"] = city_encoder.fit_transform(self.df_cleaned["city"])
        neighbourhood_encoder = LabelEncoder()
        self.df_cleaned["neighbourhood_cleansed"] = neighbourhood_encoder.fit_transform(self.df_cleaned["neighbourhood_cleansed"])
        
        return self.df_cleaned
    
    def normalize(self):
        
        x_scaler = MinMaxScaler()
        self.df_cleaned[self.df_cleaned.drop("price", axis = 1).columns] = x_scaler.fit_transform(self.df_cleaned[self.df_cleaned.drop("price", axis = 1).columns])

        y_scaler = MinMaxScaler()
        self.df_cleaned["price"] = y_scaler.fit_transform(self.df_cleaned[["price"]]).flatten()
        
        return self.df_cleaned
    
    def tts(self):
        
        self.X = self.df_cleaned.drop(["price"], axis = 1)
        self.y = self.df_cleaned["price"]
                
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = 0.2, random_state = 42)

        print(f"X_train: {self.X_train.shape} | y_train: {self.y_train.shape}")
        print(f"X_test: {self.X_test.shape} | y_test: {self.y_test.shape}")
    
    def train_model(self):
        
        models = [LinearRegression(), KNeighborsRegressor(), DecisionTreeRegressor(),
                 RandomForestRegressor(), SVR(), AdaBoostRegressor(), GradientBoostingRegressor()]
                
        metrics = list()
        
        for model in models:
            
            # fit
            
            model.fit(self.X_train, self.y_train)

            # predict
            
            self.yhat = model.predict(self.X_test)
            
            # metrics
            
            r2 = r2_score(self.y_test, self.yhat)
            mse = mean_squared_error(self.y_test, self.yhat)
        
            metrics.append([str(model), r2, mse, model])
            
        self.df_metrics = pd.DataFrame(data = metrics, columns = ["model_name", "r2", "mse", "model"])
        self.df_metrics.sort_values(by = "r2", ascending = False, inplace= True)
        
    def return_metrics(self):
        
        return self.df_metrics
    
    def display_metrics(self):
        
        display(self.df_metrics)
        
    def model_feature_importances(self, model):
        
        importances = np.argsort(model.feature_importances_)[::-1]
        d_importances = dict()
        
        for i in importances:

            d_importances[i] = [model.feature_importances_[i]*100, self.df_cleaned.drop("price", axis = 1).columns[i]]
            print(i, model.feature_importances_[i]*100, self.df_cleaned.drop("price", axis = 1).columns[i])
            
        return d_importances
    
    def grid_search_cv_tuning(self):
        
        model = RandomForestRegressor()
        
        params = {"n_estimators" : [i for i in range(100, 1001, 50)],
                  "max_depth"    : [8, 10, 12, 14, 16],
                  "max_features" : ["log2", "sqrt"]}

        scorers = {"r2", "neg_mean_squared_error"}

        grid_solver = GridSearchCV(estimator  = model, 
                                   param_grid = params, 
                                   scoring    = scorers,
                                   cv         = 10,
                                   refit      = "r2",
                                   n_jobs     = -1, 
                                   verbose    = 2)

        self.model_result = grid_solver.fit(X_train, y_train)
        
    def return_model_result_gcv(self):
        
        return self.model_result
        
    def grid_search_cv_validation(self):
        
        l_validations = [self.model_result.best_estimator_,
                         self.model_result.cv_results_["mean_test_r2"].max(),
                         self.model_result.best_score_]
        self.df_validations = pd.DataFrame(data = l_validations, columns = ["Best Estimator",
                                                                            "Mean Test R**2",
                                                                            "Best Score"])
        return self.df_validations
    
    def final_trial_model(self):
        
        '''It trains the best model with the features recomended'''
        
        model = RandomForestRegressor(max_depth=16, max_features='sqrt', n_estimators=650, random_state = 42)
        model.fit(self.X_train, self.y_train)
        
        self.yhat = model.predict(self.X_test)
    
        return f"r**2 = {r2_score(self.y_test, self.yhat)}"
    
    def train_final_model(self, max_depth, max_features, n_estimators,random_state):
        
        '''Returns the definitive model'''
        
        self.X_def = self.df_cleaned.drop(["price"], axis = 1)
        self.y_def = self.df_cleaned["price"]
        
        model = RandomForestRegressor(max_depth = max_depth, max_features = max_features, n_estimators = n_estimators, random_state = random_state)
        model.fit(self.X_def, self.y_def)
        
        return model
    
    def predict(self, array):  
        
        '''Predicts the price given a cleaned array with te features needed'''
        
        self.price_predicted = y_scaler.inverse_transform([model.predict([array])])
    
    def return_prediction(self):
        
        
        return self.price_predicted
    
    def save_model(self, name, ext, model):
    
        with open(f"{name}.{ext}", "wb") as file:
            pickle.dump(model)
            
    def load_model(self, name, ext):
        
        with open(f"{name}.{ext}", "rb") as file:
            self.model = pickle.load(file)
            
        
    

In [3]:
madrid = "datasets/madrid.csv"
barcelona = "datasets/barcelona.csv"
london = "datasets/london.csv"

d_csvs, d_names = dict(), dict()

d_csvs["csvs1"] = [madrid, barcelona]
d_csvs["csvs2"] = [london]

d_names["names1"] = ["madrid","barcelona"]
d_names["names2"] = ["london"]

In [4]:
d_dfs = dict()

In [5]:
%%time

for i in range(1,3):
    
    d_dfs[f"instance{i}"] = airbnb(d_csvs[f"csvs{i}"],d_names[f"names{i}"])

Instance created!
Instance created!
CPU times: user 3.06 s, sys: 266 ms, total: 3.32 s
Wall time: 3.47 s


In [6]:
d_dfs.values()

dict_values([<__main__.airbnb object at 0x7fa0a8b95160>, <__main__.airbnb object at 0x7fa0a8b97610>])

In [7]:
%%time
for instance in d_dfs.values():
    
    instance.clean_tested_columns()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_cleaned["price"] = self.df_cleaned["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_cleaned["price"] = self.df_cleaned["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values


CPU times: user 1.4 s, sys: 86.8 ms, total: 1.48 s
Wall time: 1.58 s


In [8]:
%%time
l_dfs = list()

for instance in d_dfs.values():
    
    l_dfs.append(instance.return_cleaned())

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.39 µs


In [9]:
d_dfs["instance1"].return_cleaned()

Unnamed: 0,neighbourhood_cleansed,city,accommodates,availability_365,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,Hispanoamérica,madrid,2,56,1.0,1.0,1.0,77.0,3,1125,...,1,1,1,1,1,0,0,0,1,0
1,Cármenes,madrid,1,255,1.0,1.0,1.0,31.0,4,40,...,1,1,1,1,1,1,1,0,1,0
3,Legazpi,madrid,1,339,1.0,1.0,1.0,26.0,2,1125,...,1,0,1,0,1,1,0,0,1,0
4,Sol,madrid,2,271,1.5,1.0,1.0,49.0,2,90,...,1,0,1,1,1,1,0,0,1,0
5,Sol,madrid,2,353,1.0,1.0,2.0,120.0,5,180,...,0,0,0,0,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16915,la Maternitat i Sant Ramon,barcelona,6,0,1.5,1.0,6.0,5000.0,1,10,...,1,1,1,1,1,1,1,0,1,0
16916,la Maternitat i Sant Ramon,barcelona,6,0,1.5,1.0,6.0,5000.0,1,10,...,1,1,1,1,1,1,1,0,1,0
16917,la Maternitat i Sant Ramon,barcelona,6,0,1.5,1.0,6.0,5000.0,1,10,...,1,1,1,1,1,1,1,0,1,0
16918,Pedralbes,barcelona,5,0,2.0,4.0,5.0,250.0,6,1125,...,1,1,1,1,1,1,0,1,0,0


In [10]:
%%time

for instance in d_dfs.values():
    
    instance.remove_outliers()
    

CPU times: user 53.9 ms, sys: 3.92 ms, total: 57.8 ms
Wall time: 66.9 ms


In [11]:
d_dfs["instance1"].return_cleaned()

Unnamed: 0,neighbourhood_cleansed,city,accommodates,availability_365,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,Hispanoamérica,madrid,2,56,1.0,1.0,1.0,77.0,3,1125,...,1,1,1,1,1,0,0,0,1,0
1,Cármenes,madrid,1,255,1.0,1.0,1.0,31.0,4,40,...,1,1,1,1,1,1,1,0,1,0
3,Legazpi,madrid,1,339,1.0,1.0,1.0,26.0,2,1125,...,1,0,1,0,1,1,0,0,1,0
5,Sol,madrid,2,353,1.0,1.0,2.0,120.0,5,180,...,0,0,0,0,0,1,1,1,0,0
6,Embajadores,madrid,1,287,1.0,1.0,1.0,20.0,6,1124,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16908,"Vallvidrera, el Tibidabo i les Planes",barcelona,3,0,2.0,2.0,2.0,109.0,8,30,...,1,0,0,0,0,1,1,1,0,0
16910,el Besòs i el Maresme,barcelona,4,56,1.0,2.0,2.0,146.0,4,1125,...,1,1,1,1,1,1,0,1,0,0
16911,el Besòs i el Maresme,barcelona,6,110,1.0,3.0,4.0,231.0,1,1125,...,1,1,1,1,1,0,1,1,0,0
16918,Pedralbes,barcelona,5,0,2.0,4.0,5.0,250.0,6,1125,...,1,1,1,1,1,1,0,1,0,0


In [12]:
for instance in d_dfs.values():
    instance.label_encoding()

In [13]:
for instance in d_dfs.values():
    instance.normalize()

In [14]:
for instance in d_dfs.values():
    instance.tts()

X_train: (16786, 25) | y_train: (16786,)
X_test: (4197, 25) | y_test: (4197,)
X_train: (33600, 25) | y_train: (33600,)
X_test: (8400, 25) | y_test: (8400,)


In [15]:
%%time

for name, instance in d_dfs.items():
    
    instance.train_model()
    print(name)
    instance.display_metrics()

instance1


Unnamed: 0,model_name,r2,mse,model
3,RandomForestRegressor(),0.738584,0.012878,"(DecisionTreeRegressor(max_features='auto', ra..."
6,GradientBoostingRegressor(),0.670557,0.016229,([DecisionTreeRegressor(criterion='friedman_ms...
4,SVR(),0.647817,0.017349,SVR()
1,KNeighborsRegressor(),0.588164,0.020288,KNeighborsRegressor()
0,LinearRegression(),0.574795,0.020947,LinearRegression()
2,DecisionTreeRegressor(),0.45882,0.02666,DecisionTreeRegressor()
5,AdaBoostRegressor(),0.427784,0.028189,"(DecisionTreeRegressor(max_depth=3, random_sta..."


instance2


Unnamed: 0,model_name,r2,mse,model
3,RandomForestRegressor(),0.65721,0.014512,"(DecisionTreeRegressor(max_features='auto', ra..."
6,GradientBoostingRegressor(),0.637802,0.015334,([DecisionTreeRegressor(criterion='friedman_ms...
4,SVR(),0.57445,0.018016,SVR()
1,KNeighborsRegressor(),0.511806,0.020668,KNeighborsRegressor()
0,LinearRegression(),0.508913,0.020791,LinearRegression()
5,AdaBoostRegressor(),0.402676,0.025288,"(DecisionTreeRegressor(max_depth=3, random_sta..."
2,DecisionTreeRegressor(),0.31591,0.028962,DecisionTreeRegressor()


CPU times: user 1min 45s, sys: 6.53 s, total: 1min 51s
Wall time: 1min 47s


In [16]:
d_dfs["instance2"].return_cleaned()

Unnamed: 0,neighbourhood_cleansed,city,accommodates,availability_365,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,0.56250,0.0,0.000000,0.939726,0.0,0.000000,0.00,0.142857,0.000000,0.002451,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1,0.59375,0.0,0.142857,0.191781,0.0,0.000000,0.00,0.214286,0.068966,0.004289,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
7,0.59375,0.0,0.285714,0.493151,0.5,0.000000,0.00,0.848571,0.448276,0.031863,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8,0.96875,0.0,0.142857,0.317808,0.5,0.000000,0.00,0.557143,0.068966,0.098389,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
9,0.43750,0.0,0.142857,0.871233,0.0,0.000000,0.00,0.257143,0.068966,0.098389,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69346,0.31250,0.0,0.000000,0.000000,0.0,0.000000,0.00,0.157143,0.137931,0.000525,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
69347,0.84375,0.0,0.571429,0.000000,0.0,0.000000,0.50,0.574286,0.068966,0.002363,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
69348,0.81250,0.0,0.714286,0.010959,1.0,1.000000,1.00,0.702857,0.068966,0.031863,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
69349,0.81250,0.0,0.142857,0.005479,0.0,0.000000,0.00,0.714286,0.068966,0.098389,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0


In [17]:
%%time
d_scores_ft = dict()

for name, instance in d_dfs.items():
    
    d_scores_ft[name] = instance.final_trial_model()
    

CPU times: user 30.6 s, sys: 595 ms, total: 31.2 s
Wall time: 32.4 s


In [18]:
model = d_dfs["instance1"].return_metrics()

model["model"][3]

RandomForestRegressor()

In [19]:
d_dfs["instance1"].model_feature_importances(model["model"][3])

22 22.372085982561178 Entire home/apt
2 13.547950114863827 accommodates
1 10.103423642732267 city
11 9.229673823724411 host_total_listings_count
3 8.306752392146016 availability_365
4 6.0543845963802365 bathrooms_text
10 5.414161629193398 reviews_per_month
0 4.986606627187222 neighbourhood_cleansed
7 4.560548816971723 minimum_nights
9 3.9415673278718555 number_of_reviews
8 2.6687700649409387 maximum_nights
21 1.5271895847111856 Air conditioning
6 1.3868375842354663 beds
5 1.183842030642853 bedrooms
14 0.747343613204171 Dishes and silverware
17 0.6283987544085798 Hair dryer
13 0.583982880443838 Cooking basics
15 0.5359849250534456 Essentials
16 0.5317544604200457 Coffee maker
18 0.5203885845013884 Microwave
20 0.48555204756695247 Heating
19 0.33514423930236525 Refrigerator
12 0.16408466545140937 Long term stays allowed
23 0.09630031204702585 Private room
24 0.0872712994381865 Shared room


{22: [22.372085982561178, 'Entire home/apt'],
 2: [13.547950114863827, 'accommodates'],
 1: [10.103423642732267, 'city'],
 11: [9.229673823724411, 'host_total_listings_count'],
 3: [8.306752392146016, 'availability_365'],
 4: [6.0543845963802365, 'bathrooms_text'],
 10: [5.414161629193398, 'reviews_per_month'],
 0: [4.986606627187222, 'neighbourhood_cleansed'],
 7: [4.560548816971723, 'minimum_nights'],
 9: [3.9415673278718555, 'number_of_reviews'],
 8: [2.6687700649409387, 'maximum_nights'],
 21: [1.5271895847111856, 'Air conditioning'],
 6: [1.3868375842354663, 'beds'],
 5: [1.183842030642853, 'bedrooms'],
 14: [0.747343613204171, 'Dishes and silverware'],
 17: [0.6283987544085798, 'Hair dryer'],
 13: [0.583982880443838, 'Cooking basics'],
 15: [0.5359849250534456, 'Essentials'],
 16: [0.5317544604200457, 'Coffee maker'],
 18: [0.5203885845013884, 'Microwave'],
 20: [0.48555204756695247, 'Heating'],
 19: [0.33514423930236525, 'Refrigerator'],
 12: [0.16408466545140937, 'Long term sta

In [20]:
d_scores_ft

{'instance1': 'r**2 = 0.727962968354814',
 'instance2': 'r**2 = 0.6513428239615381'}