In [1]:
import pandas as pd
import numpy as np
import re
import json
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from pprint import pprint
import folium
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler


# Modelos
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Métricas
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import pickle



In [2]:
class airbnb_city:
    
    def __init__(self, csvs, city_names):
            
            self.l_dfs = list()
            
            for enum, dataset in enumerate(csvs):
                
                self.l_dfs.append(pd.read_csv(dataset))
                
                self.l_dfs[enum].drop("source", axis = 1, inplace = True)
                
                self.l_dfs[enum]["city"] = city_names[enum].lower()
        
            self.df = pd.concat(self.l_dfs)
            
            print("Instance created!")
            
    def return_initial_df(self):
    
        return self.df
    
    def display__initial_df(self):
    
        display(self.df)

    def clean_tested_columns(self):
        
        """
        Sets predefined columns, transforms price to a float column and separates bathroom_text 
        into 3 different categories, private, shared and unknown.
        """
        
        # Sets predefined columns
        
        tested_cols = ['neighbourhood_cleansed', 'city',
                       'room_type', 'accommodates', 'availability365'
                       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
                       'minimum_nights', 'maximum_nights',
                       'number_of_reviews', 'reviews_per_month', 'host_total_listings_count']
        
        self.df_cleaned = self.df[tested_cols]
        
        # Transforms price to a float column
        
        self.df_cleaned["price"] = self.df_cleaned["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values
            
        # Get numbers out of bathroom_text columns
        
        self.df_cleaned = self.df_cleaned[self.df_cleaned["bathrooms_text"].isnull() == False]

        l_nums = [re.findall(r'\d+',i) for i in self.df_cleaned["bathrooms_text"].values]

        l_nums_completed = []

        for i in l_nums:

            if len(i) > 1:

                l_nums_completed.append('.'.join(i))

            elif len(i) == 0:

                l_nums_completed.append('0')

            else:

                l_nums_completed.append(i[0])
                
        # Create two different columns replacing bathroom_text
        
        self.df_cleaned["bathrooms_text"] = l_nums_completed

        self.df_cleaned["bathrooms_text"] = self.df_cleaned["bathrooms_text"].astype("float64")
        
        # Amenities
        
        l_amenities_cleaned = list()
        
        for i in self.df_cleaned["amenities"]:

            l_amenities_cleaned.append(json.loads(i))

        # Most relevant amenities, detailed analysis in the EDA file

        l_amenities_valuables = ['Long term stays allowed','Cooking basics','Dishes and silverware','Essentials','Coffee maker','Hair dryer','Microwave','Refrigerator','Heating','Air conditioning']

        for j in l_amenities_valuables:

            self.df_cleaned[j] = [1 if j in i else 0 for i in l_amenities_cleaned]

        self.df_cleaned.drop("amenities", axis =1, inplace=True)
        
        self.df_cleaned.dropna(inplace = True)
        
        # Room type
        
        self.df_cleaned = self.df_cleaned[self.df_cleaned["room_type"] != "Hotel room"]
        self.df_cleaned = pd.concat([self.df_cleaned, pd.get_dummies(data = self.df_cleaned["room_type"])], axis = 1).drop("room_type", axis = 1)
        
    def return_cleaned(self):
        
        return self.df_cleaned
    
    def display_cleaned(self):
        
        display(self.df_cleaned)
    
    def remove_outliers(self, accommodates, bathrooms_min, bathrooms_max, bedrooms, beds_min, beds_max, minimum_nights,
                       maximum_nights, nreviews, reviews_pmonth, price, htlc):

        self.df_cleaned = self.df_cleaned[self.df_cleaned["accommodates"] <= accommodates]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["bathrooms_text"].between(bathrooms_min, bathrooms_max)]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["bedrooms"] <= bedrooms]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["beds"].between(beds_min, beds_max)]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["minimum_nights"] <= minimum_nights]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["maximum_nights"] <= maximum_nights]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["number_of_reviews"] <= nreviews]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["reviews_per_month"] <= reviews_pmonth]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["price"] <= price]
        self.df_cleaned = self.df_cleaned[self.df_cleaned["host_total_listings_count"] <= htlc]

        return self.df_cleaned
    
    def display_outliers(self):
        
        for i in self.df_cleaned.columns:
    
            print(i)
            sns.kdeplot(self.df_cleaned[i])
            plt.show()

    def label_encoding(self):
        
        city_encoder = LabelEncoder()
        self.df_cleaned["city"] = city_encoder.fit_transform(self.df_cleaned["city"])
        neighbourhood_encoder = LabelEncoder()
        self.df_cleaned["neighbourhood_cleansed"] = neighbourhood_encoder.fit_transform(self.df_cleaned["neighbourhood_cleansed"])
        
        return self.df_cleaned
    
    def normalize(self):
        
        x_scaler = MinMaxScaler()
        self.df_cleaned[self.df_cleaned.drop("price", axis = 1).columns] = x_scaler.fit_transform(self.df_cleaned[self.df_cleaned.drop("price", axis = 1).columns])

        y_scaler = MinMaxScaler()
        self.df_cleaned["price"] = y_scaler.fit_transform(self.df_cleaned[["price"]]).flatten()
        
        return self.df_cleaned
    
    def tts(self):
        
        self.X = self.df_cleaned.drop(["price"], axis = 1)
        self.y = self.df_cleaned["price"]
                
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = 0.2, random_state = 42)

        print(f"X_train: {self.X_train.shape} | y_train: {self.y_train.shape}")
        print(f"X_test: {self.X_test.shape} | y_test: {self.y_test.shape}")
    
    def train_model(self):
        
        #models = [LinearRegression(), KNeighborsRegressor(), DecisionTreeRegressor(),
        #          RandomForestRegressor(), SVR(), AdaBoostRegressor(), GradientBoostingRegressor()]
        
        
        models = [RandomForestRegressor()]
        
        metrics = list()
        
        for model in models:
            
            # fit
            
            model.fit(self.X_train, self.y_train)

            # predict
            
            self.yhat = model.predict(self.X_test)
            
            # metrics
            
            r2 = r2_score(self.y_test, self.yhat)
            mse = mean_squared_error(self.y_test, self.yhat)
        
            metrics.append([str(model), r2, mse, model])
            
        self.df_metrics = pd.DataFrame(data = metrics, columns = ["model_name", "r2", "mse", "model"])
        self.df_metrics.sort_values(by = "r2", ascending = False, inplace= True)
        
    def return_metrics(self):
        
        return self.df_metrics
    
    def display_metrics(self):
        
        display(self.df_metrics)
        
    def model_feature_importances(self, model):
        
        importances = np.argsort(model.feature_importances_)[::-1]
        d_importances = dict()
        
        for i in importances:

            d_importances[i] = [model.feature_importances_[i]*100, self.df_cleaned.drop("price", axis = 1).columns[i]]
            print(i, model.feature_importances_[i]*100, self.df_cleaned.drop("price", axis = 1).columns[i])
            
        return d_importances
    
    def grid_search_cv_tuning(self):
        
        model = RandomForestRegressor()
        
        params = {"n_estimators" : [i for i in range(100, 1001, 50)],
                  "max_depth"    : [8, 10, 12, 14, 16],
                  "max_features" : ["log2", "sqrt"]}

        scorers = {"r2", "neg_mean_squared_error"}

        grid_solver = GridSearchCV(estimator  = model, 
                                   param_grid = params, 
                                   scoring    = scorers,
                                   cv         = 10,
                                   refit      = "r2",
                                   n_jobs     = -1, 
                                   verbose    = 2)

        self.model_result = grid_solver.fit(X_train, y_train)
        
    def grid_search_cv_validation(self):
        
        l_validations = [self.model_result.best_estimator_,
                         self.model_result.cv_results_["mean_test_r2"].max(),
                         self.model_result.best_score_]
        self.df_validations = pd.DataFrame(data = l_validations, columns = ["Best Estimator",
                                                                            "Mean Test R**2",
                                                                            "Best Score"])
        return self.df_validations
    
    def final_trial_model(self):
        
        '''It trains the best model with the features recomended'''
        
        model = RandomForestRegressor(max_depth=16, max_features='sqrt', n_estimators=650, random_state = 42)
        model.fit(self.X_train, self.y_train)
        
        self.yhat = model.predict(self.X_test)
    
        return f"r**2 = {r2_score(self.y_test, self.yhat)}"
    
    def train_final_model(self, max_depth, max_features, n_estimators,random_state):
        
        '''Returns the definitive model'''
        
        self.X_def = self.df_cleaned.drop(["price"], axis = 1)
        self.y_def = self.df_cleaned["price"]
        
        model = RandomForestRegressor(max_depth = max_depth, max_features = max_features, n_estimators = n_estimators, random_state = random_state)
        model.fit(self.X_def, self.y_def)
        
        return model
    
    def predict(self, array):  
        
        '''Predicts the price given a cleaned array with te features needed'''
        
        price_predicted = y_scaler.inverse_transform([model.predict([array])])
        
        return price_predicted
    
    def save_model(self, name, ext, model):
    
        with open(f"{name}.{ext}", "wb") as file:
            pickle.dump(model)
            
    def load_model(self, name, ext):
        
        with open(f"{name}.{ext}", "rb") as file:
            self.model = pickle.load(file)
            
        
    

In [3]:
madrid = "datasets/madrid.csv"
rome = "datasets/rome.csv"
paris = "datasets/paris.csv"
oslo = "datasets/oslo.csv"
london = "datasets/london.csv"
geneva = "datasets/geneva.csv"
dublin = "datasets/dublin.csv"
barcelona = "datasets/barcelona.csv"
athens = "datasets/athens.csv"
amsterdam = "datasets/amsterdam.csv"

d_csvs, d_names = dict(), dict()

d_csvs["csvs1"] = [madrid, barcelona]
d_csvs["csvs2"] = [madrid, barcelona, london]
d_csvs["csvs3"] = [madrid, barcelona, london, paris]
d_csvs["csvs4"] = [madrid, barcelona, london, paris, dublin]
d_csvs["csvs5"] = [madrid, barcelona, london, paris, dublin, rome]
d_csvs["csvs6"] = [madrid, barcelona, london, paris, dublin, rome, amsterdam]
d_csvs["csvs7"] = [madrid, barcelona,london, paris, dublin, rome, amsterdam, athens]
d_csvs["csvs8"] = [madrid, barcelona,london, paris, dublin, rome, amsterdam, athens, oslo]
d_csvs["csvs9"] = [madrid, barcelona,london, paris, dublin, rome, amsterdam, athens, oslo, geneva]
d_csvs["csvs10"] = [madrid, barcelona, paris, london, amsterdam, rome, dublin, geneva, athens, oslo]

d_names["names1"] = ["madrid", "barcelona"]
d_names["names2"] = ["madrid", "barcelona","london"]
d_names["names3"] = ["madrid", "barcelona","london", "paris"]
d_names["names4"] = ["madrid", "barcelona","london", "paris", "dublin"]
d_names["names5"] = ["madrid", "barcelona","london", "paris", "dublin", "rome"]
d_names["names6"] = ["madrid", "barcelona","london", "paris", "dublin", "rome", "amsterdam"]
d_names["names7"] = ["madrid", "barcelona","london", "paris", "dublin", "rome", "amsterdam", "athens"]
d_names["names8"] = ["madrid", "barcelona","london", "paris", "dublin", "rome", "amsterdam", "athens", "oslo"]
d_names["names9"] = ["madrid", "barcelona","london", "paris", "dublin", "rome", "amsterdam", "athens", "oslo", "geneva"]
d_names["names10"] = ["madrid", "barcelona","paris", "london", "amsterdam", "rome","dublin","geneva","athens","oslo"]
    

In [4]:
d_dfs = dict()

In [5]:
%%time

for i in range(1,10):
    
    d_dfs[f"instance{i}"] = airbnb_city(d_csvs[f"csvs{i}"],d_names[f"names{i}"])

Instance created!
Instance created!
Instance created!
Instance created!
Instance created!
Instance created!
Instance created!
Instance created!
Instance created!
CPU times: user 39 s, sys: 3.22 s, total: 42.2 s
Wall time: 42.8 s


In [6]:
d_dfs.values()

dict_values([<__main__.airbnb_city object at 0x7f84fa0be3d0>, <__main__.airbnb_city object at 0x7f84fa094eb0>, <__main__.airbnb_city object at 0x7f84fa059c10>, <__main__.airbnb_city object at 0x7f84fa08ffd0>, <__main__.airbnb_city object at 0x7f84fa08f4c0>, <__main__.airbnb_city object at 0x7f84f9dcc880>, <__main__.airbnb_city object at 0x7f84fa0c6a90>, <__main__.airbnb_city object at 0x7f84fa0c4cd0>, <__main__.airbnb_city object at 0x7f844d3dbdc0>])

In [7]:
%%time
for instance in d_dfs.values():
    
    instance.clean_tested_columns()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_cleaned["price"] = self.df_cleaned["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_cleaned["price"] = self.df_cleaned["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

CPU times: user 18.8 s, sys: 4.88 s, total: 23.7 s
Wall time: 37.4 s


In [8]:
%%time
l_dfs = list()

for instance in d_dfs.values():
    
    l_dfs.append(instance.return_cleaned())

CPU times: user 8 µs, sys: 2 µs, total: 10 µs
Wall time: 11.9 µs


In [9]:
%%time

for instance in d_dfs.values():
    
    instance.remove_outliers(accommodates=8,
                             bathrooms_min=1,
                             bathrooms_max=2,
                             bedrooms=4,
                             beds_min=1,
                             beds_max=5,
                             minimum_nights=30,
                             maximum_nights=500000,
                             nreviews=300,
                             reviews_pmonth=8,
                             price=400,
                             htlc=500000)

CPU times: user 484 ms, sys: 134 ms, total: 619 ms
Wall time: 758 ms


In [10]:
for instance in d_dfs.values():
    instance.label_encoding()

In [11]:
for instance in d_dfs.values():
    instance.normalize()

In [12]:
for instance in d_dfs.values():
    instance.tts()

X_train: (16675, 24) | y_train: (16675,)
X_test: (4169, 24) | y_test: (4169,)
X_train: (50628, 24) | y_train: (50628,)
X_test: (12658, 24) | y_test: (12658,)
X_train: (74960, 24) | y_train: (74960,)
X_test: (18740, 24) | y_test: (18740,)
X_train: (78921, 24) | y_train: (78921,)
X_test: (19731, 24) | y_test: (19731,)
X_train: (92501, 24) | y_train: (92501,)
X_test: (23126, 24) | y_test: (23126,)
X_train: (96392, 24) | y_train: (96392,)
X_test: (24099, 24) | y_test: (24099,)
X_train: (103094, 24) | y_train: (103094,)
X_test: (25774, 24) | y_test: (25774,)
X_train: (103149, 24) | y_train: (103149,)
X_test: (25788, 24) | y_test: (25788,)
X_train: (104900, 24) | y_train: (104900,)
X_test: (26226, 24) | y_test: (26226,)


In [13]:
%%time

for instance in d_dfs.values():
    
    instance.train_model()
    instance.display_metrics()

Unnamed: 0,model_name,r2,mse,model
0,RandomForestRegressor(),0.693163,0.01222,"(DecisionTreeRegressor(max_features='auto', ra..."


Unnamed: 0,model_name,r2,mse,model
0,RandomForestRegressor(),0.633222,0.014248,"(DecisionTreeRegressor(max_features='auto', ra..."


Unnamed: 0,model_name,r2,mse,model
0,RandomForestRegressor(),0.606742,0.015279,"(DecisionTreeRegressor(max_features='auto', ra..."


Unnamed: 0,model_name,r2,mse,model
0,RandomForestRegressor(),0.592482,0.015747,"(DecisionTreeRegressor(max_features='auto', ra..."


Unnamed: 0,model_name,r2,mse,model
0,RandomForestRegressor(),0.577104,0.015889,"(DecisionTreeRegressor(max_features='auto', ra..."


Unnamed: 0,model_name,r2,mse,model
0,RandomForestRegressor(),0.581902,0.016138,"(DecisionTreeRegressor(max_features='auto', ra..."


Unnamed: 0,model_name,r2,mse,model
0,RandomForestRegressor(),0.580679,0.0159,"(DecisionTreeRegressor(max_features='auto', ra..."


Unnamed: 0,model_name,r2,mse,model
0,RandomForestRegressor(),0.587691,0.015615,"(DecisionTreeRegressor(max_features='auto', ra..."


Unnamed: 0,model_name,r2,mse,model
0,RandomForestRegressor(),0.579208,0.01599,"(DecisionTreeRegressor(max_features='auto', ra..."


CPU times: user 4min 55s, sys: 4 s, total: 4min 59s
Wall time: 5min


In [14]:
adsfa

NameError: name 'adsfa' is not defined

In [None]:
%%time
d_scores_ft = dict()

for name, instance in d_dfs.items():
    
    d_scores_ft[name] = instance.final_trial_model()
    

In [None]:
d_scores_ft