In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from pprint import pprint
import folium
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.preprocessing import PowerTransformer
from collections import Counter
import json


In [2]:
class airbnb_city:
    
    def __init__(self, csv):
        
        self.csv = csv
                
        self.df_city = pd.read_csv(self.csv)
        
        print("Instance created!")
        
    def clean_columns(self):
        
        # Take only the relevant columns in the dataframe
        
        l_relevant_columns = ["host_is_superhost","neighbourhood_cleansed","neighbourhood_group_cleansed","property_type","room_type","accommodates","bathrooms_text","beds","price","minimum_nights","maximum_nights","availability_30","availability_365","number_of_reviews","instant_bookable", "amenities", "host_verifications"]

        self.df_city = self.df_city[l_relevant_columns]
        
        self.df_city["bathrooms_text"].replace(np.nan, "?", inplace = True)
        
        # Get numbers out of bathroom_text columns
        
        l_nums = [re.findall(r'\d+',i) for i in self.df_city["bathrooms_text"].values]

        l_nums_completed = []

        for i in l_nums:

            if len(i) > 1:

                l_nums_completed.append('.'.join(i))

            elif len(i) == 0:

                l_nums_completed.append('0')

            else:

                l_nums_completed.append(i[0])

        # Separate categories from bathroom_text
        
        l_category = []

        for i in self.df_city["bathrooms_text"].values:

            if "shared" in i:

                l_category.append("Shared")

            elif "private" in i:

                l_category.append("Private")

            else:

                l_category.append("Unknown")
                
        # Create two different columns replacing bathroom_text
        
        self.df_city.drop("bathrooms_text", axis = 1, inplace = True)
                
        self.df_city["num_of_baths"] = l_nums_completed
        
        self.df_city["bath_category"] = l_category
        
        self.df_city["num_of_baths"] = self.df_city["num_of_baths"].astype("float64")

        # Column["prices"]
        
        self.df_city["price"]  = self.df_city["price"] .apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x)
        
        self.df_city["amenities"] = [len(i) for i in self.df_city["amenities"]]

        self.df_city["host_verifications"] = [len(i) for i in self.df_city["host_verifications"]]
        
        
        print("Dataframe sucessfully created!")
    
    def label_encoding(self):
        
        self.df_city.dropna(inplace=True)
        
        # Encoding columns with dummies function
        
        def dummies(data, column):
            return pd.get_dummies(data = data[column], drop_first=True)
        
        self.df_city["host_is_superhost"] = dummies(self.df_city, "host_is_superhost")
        self.df_city["instant_bookable"] = dummies(self.df_city, "instant_bookable")
        
        df_room_type = dummies(self.df_city, "room_type")
        df_bath_category = dummies(self.df_city, "bath_category")
        df_bath_category = df_bath_category.rename(columns={'Shared': 'shared_bath', 'Unknown': 'unknoun_bath'})
        
        self.df_city = pd.concat([self.df_city, df_bath_category], axis = 1)

        self.df_city = pd.concat([self.df_city, df_room_type], axis = 1)

        self.df_city.drop("room_type", axis = 1, inplace = True)

        self.df_city.drop("bath_category", axis = 1, inplace = True)
        
        # Encoding categorical columns with labelEncoding function
        
        l_columns_to_labelEncode = ["neighbourhood_cleansed", "property_type", "neighbourhood_group_cleansed"]
        l_columns_encoded = list()

        for i in l_columns_to_labelEncode:

            # Inicializing object LabelEncoder()
            o_labelEncoding = LabelEncoder()

            # Training it with the column data
            o_labelEncoding.fit(self.df_city[i].values)

            # Transform the column
            l_columns_encoded.append(o_labelEncoding.transform(self.df_city[i].values))

        self.df_city["neighbourhood_cleansed"] = l_columns_encoded[0]
        self.df_city["property_type"] = l_columns_encoded[1]
        self.df_city["neighbourhood_group_cleansed"] = l_columns_encoded[2]
        
        print("Dataframe sucessfully encoded!")

        
    def return_df(self):
    
        return self.df_city
    
    def display_df(self):
    
        display(self.df_city)

In [3]:
madrid = airbnb_city("datasets/Madrid air bnb/listings_detailed.csv")

Instance created!


In [4]:
df = madrid.return_df()

# Markdown reserved

In [5]:
df_columns_chosen = df[["minimum_nights", "amenities","property_type", "room_type", "neighbourhood_group_cleansed","latitude", "longitude", "number_of_reviews", "price","reviews_per_month","beds","calculated_host_listings_count", "accommodates"]]

In [6]:
df_columns_chosen

Unnamed: 0,minimum_nights,amenities,property_type,room_type,neighbourhood_group_cleansed,latitude,longitude,number_of_reviews,price,reviews_per_month,beds,calculated_host_listings_count,accommodates
0,1,"[""Kitchen"", ""Elevator"", ""Extra pillows and bla...",Private room in apartment,Private room,Chamartín,40.45724,-3.67688,78,$60.00,0.58,1.0,1,2
1,4,"[""Bed linens"", ""Refrigerator"", ""Dishes and sil...",Private room in apartment,Private room,Latina,40.40381,-3.74130,33,$31.00,0.42,1.0,2,1
2,15,"[""Pool"", ""Dishwasher"", ""Bed linens"", ""Refriger...",Entire apartment,Entire home/apt,Arganzuela,40.38840,-3.69511,0,$50.00,,5.0,7,6
3,5,"[""Refrigerator"", ""Host greets you"", ""Dishes an...",Entire apartment,Entire home/apt,Centro,40.42183,-3.70529,10,$92.00,0.13,1.0,1,3
4,2,"[""Elevator"", ""Patio or balcony"", ""Bed linens"",...",Private room in house,Private room,Arganzuela,40.38975,-3.69018,149,$26.00,1.12,1.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19613,30,"[""Kitchen"", ""Lock on bedroom door"", ""Oven"", ""D...",Private room in apartment,Private room,Centro,40.40756,-3.69937,0,$23.00,,1.0,8,1
19614,30,"[""Kitchen"", ""Wifi"", ""Cooking basics"", ""Hot wat...",Private room in apartment,Private room,Chamberí,40.43706,-3.71364,0,$21.00,,1.0,8,1
19615,7,"[""Kitchen"", ""Toaster"", ""Oven"", ""Lock on bedroo...",Private room in apartment,Private room,Chamberí,40.43857,-3.70715,0,$22.00,,1.0,8,1
19616,30,"[""Kitchen"", ""Toaster"", ""Lock on bedroom door"",...",Private room in apartment,Private room,Salamanca,40.43027,-3.66759,0,$19.00,,1.0,8,1


In [7]:
df_columns_chosen = df_columns_chosen.dropna()

In [8]:
df_columns_chosen.reset_index(drop = True, inplace = True)

In [9]:
df_columns_chosen["price"] = df_columns_chosen["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_columns_chosen["price"] = df_columns_chosen["price"].apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x).values


In [14]:
len(json.loads(df["amenities"].values[0]))

18

In [11]:
for enum, i in enumerate(df["amenities"].values):
    
    df_columns_chosen["amenities"].values[enum] = json.load(i)

AttributeError: 'str' object has no attribute 'read'

In [None]:
a = df_columns_chosen["amenities"].explode()
a[:] = a.factorize()[0]
df_columns_chosen['amenities'] = a.groupby(level=0).agg(list)

In [None]:
dict_property_types = dict(Counter(df_columns_chosen["property_type"]))

list_others = []
for i in dict_property_types.keys():
    
    if dict_property_types[i] < 300:
        
        list_others.append(i)
        
for i in list_others:
    
    df_columns_chosen["property_type"].replace(i,"Other", inplace = True)

In [None]:
Counter(df_columns_chosen["property_type"])

In [None]:
df_columns_chosen

In [None]:
l_lbencode = ["property_type","neighbourhood_group_cleansed"]
for i in l_lbencode:

    # Inicializing object LabelEncoder()
    o_labelEncoding = LabelEncoder()

    # Training it with the column data
    o_labelEncoding.fit(df_columns_chosen[i].values)

    # Transform the column
    df_columns_chosen[i] = o_labelEncoding.transform(df_columns_chosen[i].values)

In [None]:
df_columns_chosen

In [None]:
df_columns_chosen = df_columns_chosen[df_columns_chosen["price"] < 100]

In [None]:
df_columns_chosen.reset_index(drop=True,inplace=True)

In [None]:
df_columns_chosen

In [None]:
sns.countplot(df["accommodates"])

In [None]:
for i in df["accommodates"].values:
    
    if i > 8:
        
        df["accommodates"].replace(i,9, inplace = True)
        
    elif i == 0:
        
        df["accommodates"].replace(i,1, inplace = True)

        

In [None]:
sns.countplot(df["accommodates"])

In [None]:
def dummies(data, column):
    return pd.get_dummies(data = data[column], drop_first=True)

df_room_type = dummies(df_columns_chosen, "room_type")
df_columns_chosen = pd.concat([df_columns_chosen, df_room_type], axis = 1)
df_columns_chosen.drop("room_type", axis = 1, inplace = True)

In [None]:
df_columns_chosen

# Getting rid of outliers using quantiles

In [None]:
cols = df_columns_chosen.columns

cols = cols.tolist()

cols.remove("amenities")

cols
for col in cols:
    upper_bound = df_columns_chosen[col].quantile(0.95)
    lower_bound = df_columns_chosen[col].quantile(0.05)
    listings = df_columns_chosen[df_columns_chosen[col] < upper_bound]
    listings = df_columns_chosen[df_columns_chosen[col] > lower_bound]

In [None]:
# Plot before transformation
stats.probplot(df_columns_chosen["price"], plot=plt)

# Power Transformer
numeric_cols = list(df_columns_chosen._get_numeric_data().columns)
pt = PowerTransformer(method="yeo-johnson")
df_columns_chosen[numeric_cols] = pt.fit_transform(df_columns_chosen[numeric_cols])

In [None]:
# After transformation
stats.probplot(df_columns_chosen["price"], plot=plt)

## Dividing x & y

In [None]:
X = df_columns_chosen.drop("price", axis = 1)
y = df_columns_chosen[["price"]]

## Scaling data

In [None]:
from sklearn.preprocessing import MinMaxScaler


scaler_x = MinMaxScaler()
scaler_x.fit(X)
X = scaler_x.transform(X)


scaler_y = MinMaxScaler()
scaler_y.fit(y)
y = scaler_y.transform(y)

## Feature importances

In [None]:
df_class = df_columns_chosen["price"].copy()

df_columns_chosen.drop("price", axis = 1, inplace = True)

X = np.asarray(df_columns_chosen)
y = np.asarray(df_class)

In [None]:
X.shape, y.shape

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Build a forest and compute the feature importances
forest = ExtraTreesRegressor(n_estimators = 250,
                              random_state = 0)

forest.fit(X, y)

importances = forest.feature_importances_

std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis = 0)

indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f): %s" % (f + 1, indices[f], importances[indices[f]], df_columns_chosen.columns[f]))

# Plot the feature importances of the forest
plt.figure()

plt.title("Feature importances")

plt.bar(range(X.shape[1]), importances[indices], color = "r", yerr = std[indices], align = "center")

plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

## 1. LinearRegression

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state=42)

print(f"Train data: {X_train.shape, y_train.shape}")
print(f"Test data: {X_test.shape, y_test.shape}")

In [None]:
# Regresión algorithm

regresion_lineal = LinearRegression()
regresion_lineal.fit(X_train, y_train)

# Finding coef

print ("weights:", regresion_lineal.coef_)
print ("w_0:", regresion_lineal.intercept_)

## Predicción

In [None]:
yhat = regresion_lineal.predict(X_test)

for i, j in zip(yhat[:5], y_test[:5]):
    print(f"Predicción:{i} \tValor real:{j}")

## Metrics

In [None]:
# Sklearn tiene las formulas de algunas métricas en funciones.

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Relative Absolute Error
RAE = np.sum(np.abs(np.subtract(y_test, yhat))) / np.sum(np.abs(np.subtract(y_test, np.mean(y_test))))

# Relative Square Error
RSE = np.sum(np.square(np.subtract(y_test, yhat))) / np.sum(np.square(np.subtract(y_test, np.mean(y_test))))

# Adjusted R**2
r2_ajustada = 1 - (1 - regresion_lineal.score(X_test, y_test))*(len(y_test) - 1)/(len(y_test) - X_test.shape[1] - 1)

In [None]:
print(f"MAE:\t {mean_absolute_error(yhat, y_test)}")
print(f"MSE:\t {mean_squared_error(yhat, y_test)}")
print(f"R**2:\t {r2_score(yhat, y_test)}")
print(f"RAE:\t {RAE}")
print(f"RSE:\t {RSE}")
print(f"Adjusted R**2:\t {r2_ajustada}")

In [None]:
# Veamos los valores de yhat, y_test y su diferencia

df_pred = pd.DataFrame()

df_pred["y_test"] = scaler_y.inverse_transform([y_test]).flatten()
df_pred["yhat"] = scaler_y.inverse_transform([yhat]).flatten()

df_pred["diferencia"] = round(abs((df_pred["y_test"] - df_pred["yhat"]) / df_pred["y_test"] * 100), 4)

df_pred = df_pred.sort_values("diferencia")

df_pred.head(20)

In [None]:
df_pred.tail(20)

In [None]:
# Distance btw real and predicted values

plt.figure(figsize = (8, 5))

sns.scatterplot(x = y_test.flatten(), y = yhat.flatten(), alpha = 0.5, color = "blue")

plt.xlabel("Valores Reales (y_train)", size = 18)
plt.ylabel("Predicciones (yhat)", size = 18)

plt.show()

# 2. KNeighborsRegressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor(n_neighbors = 7)

model.fit(X_train, y_train)

yhat = model.predict(X_test)

yhat

In [None]:
print(f"MAE:\t {mean_absolute_error(yhat, y_test)}")
print(f"MSE:\t {mean_squared_error(yhat, y_test)}")
print(f"R**2:\t {r2_score(yhat, y_test)}")
print(f"RAE:\t {RAE}")
print(f"RSE:\t {RSE}")
print(f"Adjusted R**2:\t {r2_ajustada}")

# 3. DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()
model.fit(X_train, y_train)

yhat = model.predict(X_test)

yhat

In [None]:
print(f"MAE:\t {mean_absolute_error(yhat, y_test)}")
print(f"MSE:\t {mean_squared_error(yhat, y_test)}")
print(f"R**2:\t {r2_score(yhat, y_test)}")
print(f"RAE:\t {RAE}")
print(f"RSE:\t {RSE}")
print(f"Adjusted R**2:\t {r2_ajustada}")

# 4. RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_jobs= -1 )
model.fit(X_train, y_train)

yhat = model.predict(X_test)

yhat

In [None]:
print(f"MAE:\t {mean_absolute_error(yhat, y_test)}")
print(f"MSE:\t {mean_squared_error(yhat, y_test)}")
print(f"R**2:\t {r2_score(yhat, y_test)}")
print(f"RAE:\t {RAE}")
print(f"RSE:\t {RSE}")
print(f"Adjusted R**2:\t {r2_ajustada}")



# 4. AdaBoostRegressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor

model = AdaBoostRegressor(n_estimators = 50)
model.fit(X_train, y_train)

yhat = model.predict(X_test)

yhat

In [None]:
print(f"MAE:\t {mean_absolute_error(yhat, y_test)}")
print(f"MSE:\t {mean_squared_error(yhat, y_test)}")
print(f"R**2:\t {r2_score(yhat, y_test)}")
print(f"RAE:\t {RAE}")
print(f"RSE:\t {RSE}")
print(f"Adjusted R**2:\t {r2_ajustada}")