In [164]:

import pandas as pd
import numpy as np
import re
import json
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from pprint import pprint
import folium
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler


# Modelos
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Métricas
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import pickle
class airbnb_city:
    
    def __init__(self, csvs, city_names):
            
            self.l_dfs = list()
            
            for enum, dataset in enumerate(csvs):
                
                self.l_dfs.append(pd.read_csv(dataset))
                
                self.l_dfs[enum].drop("source", axis = 1, inplace = True)
                
                self.l_dfs[enum]["city"] = city_names[enum].lower()
        
            self.df = pd.concat(self.l_dfs)
            
            print("Instance created!")
            
    def return_initial_df(self):
    
        return self.df
    
    def display__initial_df(self):
    
        display(self.df)

    def clean_tested_columns(self):
        
        """
        Sets predefined columns, transforms price to a float column and separates bathroom_text 
        into 3 different categories, private, shared and unknown.
        """
        
        # Sets predefined columns
        
        tested_cols = ['neighbourhood_cleansed', 'city',
                       'room_type', 'accommodates',
                       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
                       'minimum_nights', 'maximum_nights', 'availability_365',
                       'number_of_reviews', 'reviews_per_month', 'host_total_listings_count']
        
        self.df_cleaned = self.df[tested_cols]
        
        # Transforms price to a float column
        
        self.df_cleaned["price"] = self.df_cleaned["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values
            
        # Get numbers out of bathroom_text columns
        
        self.df_cleaned = self.df_cleaned[self.df_cleaned["bathrooms_text"].isnull() == False]

        l_nums = [re.findall(r'\d+',i) for i in self.df_cleaned["bathrooms_text"].values]

        l_nums_completed = []

        for i in l_nums:

            if len(i) > 1:

                l_nums_completed.append('.'.join(i))

            elif len(i) == 0:

                l_nums_completed.append('0')

            else:

                l_nums_completed.append(i[0])
                
        # Create two different columns replacing bathroom_text
        
        self.df_cleaned["bathrooms_text"] = l_nums_completed

        self.df_cleaned["bathrooms_text"] = self.df_cleaned["bathrooms_text"].astype("float64")
        
        # Amenities
        
        l_amenities_cleaned = list()
        
        for i in self.df_cleaned["amenities"]:

            l_amenities_cleaned.append(json.loads(i))

        # Most relevant amenities, detailed analysis in the EDA file

        l_amenities_valuables = ['Long term stays allowed','Cooking basics','Dishes and silverware','Essentials','Coffee maker','Hair dryer','Microwave','Refrigerator','Heating','Air conditioning']

        for j in l_amenities_valuables:

            self.df_cleaned[j] = [1 if j in i else 0 for i in l_amenities_cleaned]

        self.df_cleaned.drop("amenities", axis =1, inplace=True)
        
        self.df_cleaned.dropna(inplace = True)
        
        # Room type
        
        self.df_cleaned = self.df_cleaned[self.df_cleaned["room_type"] != "Hotel room"]
        self.df_cleaned = pd.concat([self.df_cleaned, pd.get_dummies(data = self.df_cleaned["room_type"])], axis = 1).drop("room_type", axis = 1)
        
        return self.df_cleaned
        
    def label_encoding(self):
        
        city_encoder = LabelEncoder()
        self.df_cleaned["city"] = city_encoder.fit_transform(self.df_cleaned["city"])
        neighbourhood_encoder = LabelEncoder()
        self.df_cleaned["neighbourhood_cleansed"] = neighbourhood_encoder.fit_transform(self.df_cleaned["neighbourhood_cleansed"])
        
        return self.df_cleaned
    
    
    def display_outliers(self):
        for i in self.df_cleaned.columns:
    
            print(i)
            sns.kdeplot(self.df_cleaned[i])
            plt.show()
            
    
    def remove_outliers(self, accommodates, bathrooms_min, bathrooms_max, bedrooms, beds_min, beds_max, minimum_nights,
                       maximum_nights, nreviews, reviews_pmonth, price, htlc):

        self.df_cleaned = self.df_cleaned[self.df_cleaned["accommodates"] <= accommodates]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["bathrooms_text"].between(bathrooms_min, bathrooms_max)]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["bedrooms"] <= bedrooms]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["beds"].between(beds_min, beds_max)]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["minimum_nights"] <= minimum_nights]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["maximum_nights"] <= maximum_nights]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["number_of_reviews"] <= nreviews]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["reviews_per_month"] <= reviews_pmonth]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["price"] <= price]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["host_total_listings_count"] <= htlc]

        return self.df_cleaned

    def normalize(self):
        
        x_scaler = MinMaxScaler()
        self.df_cleaned[self.df_cleaned.drop("price", axis = 1).columns] = x_scaler.fit_transform(self.df_cleaned[self.df_cleaned.drop("price", axis = 1).columns])

        y_scaler = MinMaxScaler()
        self.df_cleaned["price"] = y_scaler.fit_transform(self.df_cleaned[["price"]]).flatten()
        
        return self.df_cleaned
    
    def tts(self):
        
        self.X = self.df_cleaned.drop(["price"], axis = 1)
        self.y = self.df_cleaned["price"]
                
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = 0.2, random_state = 42)

        print(f"X_train: {self.X_train.shape} | y_train: {self.y_train.shape}")
        print(f"X_test: {self.X_test.shape} | y_test: {self.y_test.shape}")
    
    def train_model(self):
        
        models = [LinearRegression(), KNeighborsRegressor(), DecisionTreeRegressor(),
                  RandomForestRegressor(), SVR(), AdaBoostRegressor(), GradientBoostingRegressor()]
        
        metrics = list()
        
        for model in models:
            
            # fit
            
            model.fit(self.X_train, self.y_train)

            # predict
            
            self.yhat = model.predict(self.X_test)
            
            # metrics
            
            r2 = r2_score(self.y_test, self.yhat)
            mse = mean_squared_error(self.y_test, self.yhat)
        
            metrics.append([str(model), r2, mse, model])
            
        self.df_metrics = pd.DataFrame(data = metrics, columns = ["model_name", "r2", "mse", "model"])
        self.df_metrics.sort_values(by = "r2", ascending = False, inplace= True)
        
    def return_metrics(self):
        
        return self.df_metrics
    
    def display_metrics(self):
        
        display(self.df_metrics)
        
    def model_feature_importances(self, model):
        
        importances = np.argsort(model.feature_importances_)[::-1]
        d_importances = dict()
        
        for i in importances:

            d_importances[i] = [model.feature_importances_[i]*100, self.df_cleaned.drop("price", axis = 1).columns[i]]
            print(i, model.feature_importances_[i]*100, self.df_cleaned.drop("price", axis = 1).columns[i])
            
        return d_importances
    
    def grid_search_cv_tuning(self):
        
        model = RandomForestRegressor()
        
        params = {"n_estimators" : [i for i in range(100, 1001, 50)],
                  "max_depth"    : [8, 10, 12, 14, 16],
                  "max_features" : ["log2", "sqrt"]}

        scorers = {"r2", "neg_mean_squared_error"}

        grid_solver = GridSearchCV(estimator  = model, 
                                   param_grid = params, 
                                   scoring    = scorers,
                                   cv         = 10,
                                   refit      = "r2",
                                   n_jobs     = -1, 
                                   verbose    = 2)

        self.model_result = grid_solver.fit(X_train, y_train)
        
    def grid_search_cv_validation(self):
        
        l_validations = [self.model_result.best_estimator_,
                         self.model_result.cv_results_["mean_test_r2"].max(),
                         self.model_result.best_score_]
        self.df_validations = pd.DataFrame(data = l_validations, columns = ["Best Estimator",
                                                                            "Mean Test R**2",
                                                                            "Best Score"])
        return self.df_validations
    
    def final_trial_model(self):
        
        '''It trains the best model with the features recomended'''
        
        model = RandomForestRegressor(max_depth=16, max_features='sqrt', n_estimators=650, random_state = 42)
        model.fit(self.X_train, self.y_train)
        
        self.yhat = model.predict(self.X_test)
    
        return f"r**2 = {r2_score(self.y_test, self.yhat)}"
    
    def train_final_model(self, max_depth, max_features, n_estimators,random_state):
        
        '''Returns the definitive model'''
        
        self.X_def = self.df_cleaned.drop(["price"], axis = 1)
        self.y_def = self.df_cleaned["price"]
        
        model = RandomForestRegressor(max_depth = max_depth, max_features = max_features, n_estimators = n_estimators, random_state = random_state)
        model.fit(self.X_def, self.y_def)
        
        return model
    
    def predict(self, array):  
        
        '''predicts the price given a cleaned array with te features needed'''
        
        price_predicted = y_scaler.inverse_transform([model.predict([array])])
        
        return price_predicted
    
    def save_model(self, name, ext, model):
    
        with open(f"{name}.{ext}", "wb") as file:
            pickle.dump(model)
            
    def load_model(self, name, ext):
        
        with open(f"{name}.{ext}", "rb") as file:
            self.model = pickle.load(file)
            


In [165]:
madrid = "madrid.csv"
rome = "rome.csv"
paris = "paris.csv"
oslo = "oslo.csv"
london = "london.csv"
geneva = "geneva.csv"
dublin = "dublin.csv"
barcelona = "barcelona.csv"
athens = "athens.csv"
amsterdam = "amsterdam.csv"

#l_coms = [df_madrid,df_barcelona,df_paris,df_london,df_milan,df_rome,df_geneva]
csvs = [madrid, barcelona]
""",
          df_paris,
          df_london,
          df_amsterdam,
          df_rome,
          df_dublin,
          df_geneva,
          df_athens,
          df_oslo]
"""

l_names = ["Madrid","Barcelona"]#,"Paris","London","Amsterdam","Rome","Dublin","Geneva","Athens","Oslo"]

In [138]:
trial_1 = airbnb_city(csvs,l_names)

Instance created!


In [140]:
trial_1.clean_tested_columns()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_cleaned["price"] = self.df_cleaned["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values


Unnamed: 0,neighbourhood_cleansed,city,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,availability_365,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,Hispanoamérica,madrid,2,1.0,1.0,1.0,77.0,3,1125,56,...,1,1,1,1,1,0,0,0,1,0
1,Cármenes,madrid,1,1.0,1.0,1.0,31.0,4,40,255,...,1,1,1,1,1,1,1,0,1,0
3,Legazpi,madrid,1,1.0,1.0,1.0,26.0,2,1125,339,...,1,0,1,0,1,1,0,0,1,0
4,Sol,madrid,2,1.5,1.0,1.0,49.0,2,90,271,...,1,0,1,1,1,1,0,0,1,0
5,Sol,madrid,2,1.0,1.0,2.0,120.0,5,180,353,...,0,0,0,0,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16915,la Maternitat i Sant Ramon,barcelona,6,1.5,1.0,6.0,5000.0,1,10,0,...,1,1,1,1,1,1,1,0,1,0
16916,la Maternitat i Sant Ramon,barcelona,6,1.5,1.0,6.0,5000.0,1,10,0,...,1,1,1,1,1,1,1,0,1,0
16917,la Maternitat i Sant Ramon,barcelona,6,1.5,1.0,6.0,5000.0,1,10,0,...,1,1,1,1,1,1,1,0,1,0
16918,Pedralbes,barcelona,5,2.0,4.0,5.0,250.0,6,1125,0,...,1,1,1,1,1,1,0,1,0,0


In [141]:
trial_1.remove_outliers(accommodates=8,
                   bathrooms_min=1,
                   bathrooms_max=2,
                   bedrooms=4,
                   beds_min=1,
                   beds_max=5,
                   minimum_nights=30,
                   maximum_nights=500000,
                   nreviews=300,
                   reviews_pmonth=8,
                   price=400,
                   htlc=500000)

Unnamed: 0,neighbourhood_cleansed,city,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,availability_365,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,Hispanoamérica,madrid,2,1.0,1.0,1.0,77.0,3,1125,56,...,1,1,1,1,1,0,0,0,1,0
1,Cármenes,madrid,1,1.0,1.0,1.0,31.0,4,40,255,...,1,1,1,1,1,1,1,0,1,0
3,Legazpi,madrid,1,1.0,1.0,1.0,26.0,2,1125,339,...,1,0,1,0,1,1,0,0,1,0
5,Sol,madrid,2,1.0,1.0,2.0,120.0,5,180,353,...,0,0,0,0,0,1,1,1,0,0
6,Embajadores,madrid,1,1.0,1.0,1.0,20.0,6,1124,287,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16908,"Vallvidrera, el Tibidabo i les Planes",barcelona,3,2.0,2.0,2.0,109.0,8,30,0,...,1,0,0,0,0,1,1,1,0,0
16910,el Besòs i el Maresme,barcelona,4,1.0,2.0,2.0,146.0,4,1125,56,...,1,1,1,1,1,1,0,1,0,0
16911,el Besòs i el Maresme,barcelona,6,1.0,3.0,4.0,231.0,1,1125,110,...,1,1,1,1,1,0,1,1,0,0
16918,Pedralbes,barcelona,5,2.0,4.0,5.0,250.0,6,1125,0,...,1,1,1,1,1,1,0,1,0,0


In [142]:
trial_1.label_encoding()

Unnamed: 0,neighbourhood_cleansed,city,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,availability_365,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,63,1,2,1.0,1.0,1.0,77.0,3,1125,56,...,1,1,1,1,1,0,0,0,1,0
1,47,1,1,1.0,1.0,1.0,31.0,4,40,255,...,1,1,1,1,1,1,1,0,1,0
3,71,1,1,1.0,1.0,1.0,26.0,2,1125,339,...,1,0,1,0,1,1,0,0,1,0
5,133,1,2,1.0,1.0,2.0,120.0,5,180,353,...,0,0,0,0,0,1,1,1,0,0
6,54,1,1,1.0,1.0,1.0,20.0,6,1124,287,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16908,144,0,3,2.0,2.0,2.0,109.0,8,30,0,...,1,0,0,0,0,1,1,1,0,0
16910,154,0,4,1.0,2.0,2.0,146.0,4,1125,56,...,1,1,1,1,1,1,0,1,0,0
16911,154,0,6,1.0,3.0,4.0,231.0,1,1125,110,...,1,1,1,1,1,0,1,1,0,0
16918,95,0,5,2.0,4.0,5.0,250.0,6,1125,0,...,1,1,1,1,1,1,0,1,0,0


In [143]:
trial_1.normalize()

Unnamed: 0,neighbourhood_cleansed,city,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,availability_365,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,0.324742,1.0,0.142857,0.0,0.000000,0.00,0.176020,0.068966,0.007895,0.153425,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.242268,1.0,0.000000,0.0,0.000000,0.00,0.058673,0.103448,0.000274,0.698630,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
3,0.365979,1.0,0.000000,0.0,0.000000,0.00,0.045918,0.034483,0.007895,0.928767,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
5,0.685567,1.0,0.142857,0.0,0.000000,0.25,0.285714,0.137931,0.001257,0.967123,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
6,0.278351,1.0,0.000000,0.0,0.000000,0.00,0.030612,0.172414,0.007888,0.786301,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16908,0.742268,0.0,0.285714,1.0,0.333333,0.25,0.257653,0.241379,0.000204,0.000000,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
16910,0.793814,0.0,0.428571,0.0,0.333333,0.25,0.352041,0.103448,0.007895,0.153425,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
16911,0.793814,0.0,0.714286,0.0,0.666667,0.75,0.568878,0.000000,0.007895,0.301370,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
16918,0.489691,0.0,0.571429,1.0,1.000000,1.00,0.617347,0.172414,0.007895,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0


In [131]:
trial_1.tts()

X_train: (16675, 25) | y_train: (16675,)
X_test: (4169, 25) | y_test: (4169,)


In [132]:
%%time

trial_1.train_model()

Wall time: 13.2 s


In [133]:
df_metrics = trial_1.display_metrics()

Unnamed: 0,model_name,r2,mse,model
3,RandomForestRegressor(),0.717291,0.011259,"(DecisionTreeRegressor(max_features='auto', ra..."
6,GradientBoostingRegressor(),0.663865,0.013387,([DecisionTreeRegressor(criterion='friedman_ms...
4,SVR(),0.627861,0.014821,SVR()
1,KNeighborsRegressor(),0.582393,0.016632,KNeighborsRegressor()
0,LinearRegression(),0.561157,0.017478,LinearRegression()
5,AdaBoostRegressor(),0.429703,0.022713,"(DecisionTreeRegressor(max_depth=3, random_sta..."
2,DecisionTreeRegressor(),0.378472,0.024754,DecisionTreeRegressor()


In [134]:
metrics = trial_1.return_metrics()

In [135]:
trial_1.final_trial_model()

'r**2 = 0.7204426832633279'

In [None]:
################################################################################################################################

In [173]:
trial_2 = airbnb_city(csvs,l_names)

trial_2.clean_tested_columns()

df = trial_2.label_encoding()

df

Instance created!


In [177]:
trial_1.remove_outliers(accommodates=11,
                   bathrooms_min=1,
                   bathrooms_max=4,
                   bedrooms=5,
                   beds_min=1,
                   beds_max=8,
                   minimum_nights=30,
                   maximum_nights=5000,
                   nreviews=350,
                   reviews_pmonth=8,
                   price=500,
                   htlc=400)

Unnamed: 0,neighbourhood_cleansed,city,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,availability_365,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,Hispanoamérica,madrid,2,1.0,1.0,1.0,77.0,3,1125,56,...,1,1,1,1,1,0,0,0,1,0
1,Cármenes,madrid,1,1.0,1.0,1.0,31.0,4,40,255,...,1,1,1,1,1,1,1,0,1,0
3,Legazpi,madrid,1,1.0,1.0,1.0,26.0,2,1125,339,...,1,0,1,0,1,1,0,0,1,0
5,Sol,madrid,2,1.0,1.0,2.0,120.0,5,180,353,...,0,0,0,0,0,1,1,1,0,0
6,Embajadores,madrid,1,1.0,1.0,1.0,20.0,6,1124,287,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16908,"Vallvidrera, el Tibidabo i les Planes",barcelona,3,2.0,2.0,2.0,109.0,8,30,0,...,1,0,0,0,0,1,1,1,0,0
16910,el Besòs i el Maresme,barcelona,4,1.0,2.0,2.0,146.0,4,1125,56,...,1,1,1,1,1,1,0,1,0,0
16911,el Besòs i el Maresme,barcelona,6,1.0,3.0,4.0,231.0,1,1125,110,...,1,1,1,1,1,0,1,1,0,0
16918,Pedralbes,barcelona,5,2.0,4.0,5.0,250.0,6,1125,0,...,1,1,1,1,1,1,0,1,0,0


In [179]:
trial_2.normalize()

trial_2.tts()

%%time

trial_2.train_model()

df_metrics = trial_2.display_metrics()

Unnamed: 0,neighbourhood_cleansed,city,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,availability_365,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,0.323077,1.0,0.066667,0.090909,0.000000,0.000000,0.001080,0.001779,0.007895,0.153425,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.241026,1.0,0.000000,0.090909,0.000000,0.000000,0.000360,0.002669,0.000274,0.698630,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
3,0.364103,1.0,0.000000,0.090909,0.000000,0.000000,0.000282,0.000890,0.007895,0.928767,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,0.682051,1.0,0.066667,0.136364,0.000000,0.000000,0.000642,0.000890,0.000625,0.742466,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
5,0.682051,1.0,0.066667,0.090909,0.000000,0.025641,0.001752,0.003559,0.001257,0.967123,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16915,0.923077,0.0,0.333333,0.136364,0.000000,0.128205,0.078111,0.000000,0.000063,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
16916,0.923077,0.0,0.333333,0.136364,0.000000,0.128205,0.078111,0.000000,0.000063,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
16917,0.923077,0.0,0.333333,0.136364,0.000000,0.128205,0.078111,0.000000,0.000063,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
16918,0.487179,0.0,0.266667,0.181818,0.157895,0.102564,0.003787,0.004448,0.007895,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0


In [None]:
######################################################################################################

In [331]:
trial_3 = airbnb_city(csvs,l_names)

trial_3.clean_tested_columns()

df = trial_3.label_encoding()

df

Instance created!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_cleaned["price"] = self.df_cleaned["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values


Unnamed: 0,neighbourhood_cleansed,city,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,availability_365,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,63,1,2,1.0,1.0,1.0,77.0,3,1125,56,...,1,1,1,1,1,0,0,0,1,0
1,47,1,1,1.0,1.0,1.0,31.0,4,40,255,...,1,1,1,1,1,1,1,0,1,0
3,71,1,1,1.0,1.0,1.0,26.0,2,1125,339,...,1,0,1,0,1,1,0,0,1,0
4,133,1,2,1.5,1.0,1.0,49.0,2,90,271,...,1,0,1,1,1,1,0,0,1,0
5,133,1,2,1.0,1.0,2.0,120.0,5,180,353,...,0,0,0,0,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16915,180,0,6,1.5,1.0,6.0,5000.0,1,10,0,...,1,1,1,1,1,1,1,0,1,0
16916,180,0,6,1.5,1.0,6.0,5000.0,1,10,0,...,1,1,1,1,1,1,1,0,1,0
16917,180,0,6,1.5,1.0,6.0,5000.0,1,10,0,...,1,1,1,1,1,1,1,0,1,0
16918,95,0,5,2.0,4.0,5.0,250.0,6,1125,0,...,1,1,1,1,1,1,0,1,0,0


In [332]:
trial_3.remove_outliers(accommodates=8,
                   bathrooms_min=1,
                   bathrooms_max=2,
                   bedrooms=4,
                   beds_min=1,
                   beds_max=5,
                   minimum_nights=30,
                   maximum_nights=70000,
                   nreviews=375,
                   reviews_pmonth=9,
                   price=350,
                   htlc=50000)

Unnamed: 0,neighbourhood_cleansed,city,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,availability_365,...,Essentials,Coffee maker,Hair dryer,Microwave,Refrigerator,Heating,Air conditioning,Entire home/apt,Private room,Shared room
0,63,1,2,1.0,1.0,1.0,77.0,3,1125,56,...,1,1,1,1,1,0,0,0,1,0
1,47,1,1,1.0,1.0,1.0,31.0,4,40,255,...,1,1,1,1,1,1,1,0,1,0
3,71,1,1,1.0,1.0,1.0,26.0,2,1125,339,...,1,0,1,0,1,1,0,0,1,0
5,133,1,2,1.0,1.0,2.0,120.0,5,180,353,...,0,0,0,0,0,1,1,1,0,0
6,54,1,1,1.0,1.0,1.0,20.0,6,1124,287,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16908,145,0,3,2.0,2.0,2.0,109.0,8,30,0,...,1,0,0,0,0,1,1,1,0,0
16910,155,0,4,1.0,2.0,2.0,146.0,4,1125,56,...,1,1,1,1,1,1,0,1,0,0
16911,155,0,6,1.0,3.0,4.0,231.0,1,1125,110,...,1,1,1,1,1,0,1,1,0,0
16918,95,0,5,2.0,4.0,5.0,250.0,6,1125,0,...,1,1,1,1,1,1,0,1,0,0


In [333]:
# bathrooms_max=3 ------------r2 = 0.716114
# bathrooms_max=3 ------------r2 = 0.716731
# maximum_nights=100000 ------r2 = 0.720190
# maximum_nights=80000 -------r2 = 0.720383
# maximum_nights=50000 -------r2 = 0.720327
# maximum_nights=10000 -------r2 = 0.716765
# maximum_nights=60000 -------r2 = 0.717097
# maximum_nights=70000 -------r2 = 0.720978	!!!!!!!!!!!!
# maximum_nights=75000 -------r2 = 0.720386	
# nreviews=250 ---------------r2 = 0.708339
# nreviews=350 ---------------r2 = 0.724894 !!!!!!!!!!!!
# nreviews=400 ---------------r2 = 0.713598
# nreviews=325 ---------------r2 = 0.707223
# nreviews=375 ---------------r2 = 0.717460
# reviews_pmonth=9 -----------r2 = 0.732210 !!!!!!!!!!!!
# reviews_pmonth=10 ----------r2 = 0.729373
# reviews_pmonth=7 -----------r2 = 0.730031
# price=500 ------------------r2 = 0.702768
# price=300 ------------------r2 = 0.729817	
# price=350 ------------------r2 = 0.734658 !!!!!!!!!!!!	
# nreviews=375 ---------------r2 = 0.740396 !!!!!!!!!!!!
# price=375 ------------------r2 = 0.732906
# htlc=50000 -----------------r2 = 0.741377 !!!!!!!!!!!!
# htlc=30000 -----------------r2 = 0.738201
# htlc=10000 -----------------r2 = 0.739066
# htlc=5000 ------------------r2 = 0.739066
# htlc=60000 -----------------r2 = 0.739066
# htlc=40000 -----------------r2 = 0.739670

In [334]:
%%time
trial_3.normalize()

trial_3.tts()

trial_3.train_model()

df_metrics = trial_3.display_metrics()

X_train: (16786, 25) | y_train: (16786,)
X_test: (4197, 25) | y_test: (4197,)


Unnamed: 0,model_name,r2,mse,model
3,RandomForestRegressor(),0.741552,0.012732,"(DecisionTreeRegressor(max_features='auto', ra..."
6,GradientBoostingRegressor(),0.670464,0.016234,([DecisionTreeRegressor(criterion='friedman_ms...
4,SVR(),0.647899,0.017345,SVR()
1,KNeighborsRegressor(),0.58863,0.020265,KNeighborsRegressor()
0,LinearRegression(),0.574796,0.020947,LinearRegression()
2,DecisionTreeRegressor(),0.468116,0.026202,DecisionTreeRegressor()
5,AdaBoostRegressor(),0.345155,0.032259,"(DecisionTreeRegressor(max_depth=3, random_sta..."


Wall time: 14.3 s


In [336]:
metrics = trial_3.return_metrics()

In [337]:
model = metrics["model"][3]
d_fi = trial_3.model_feature_importances(model)

22 21.111680352202615 Entire home/apt
2 14.980436256269483 accommodates
1 10.005043867835962 city
11 9.107120746715099 host_total_listings_count
8 8.236306263201627 availability_365
3 6.005293215092348 bathrooms_text
10 5.4050144122882795 reviews_per_month
0 5.081103781533659 neighbourhood_cleansed
6 4.57203921959932 minimum_nights
9 3.9629585932999007 number_of_reviews
7 2.662267282855014 maximum_nights
21 1.464175741759651 Air conditioning
5 1.4427858311957487 beds
4 1.180161324878754 bedrooms
14 0.756282343417031 Dishes and silverware
17 0.616876115765606 Hair dryer
13 0.5936884810738148 Cooking basics
15 0.5639721456052693 Essentials
16 0.5422553331092057 Coffee maker
20 0.5090865024439843 Heating
18 0.49752738822479897 Microwave
19 0.3630240463360059 Refrigerator
12 0.1612664942182994 Long term stays allowed
23 0.10065279610206362 Private room
24 0.07898146497645667 Shared room
