In [370]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [401]:
class airbnb_city:
    
    def __init__(self, csv):
        
        self.csv = csv
                
        self.df_city = pd.read_csv(self.csv)
        
        print("Instance created!")
        
    def clean_columns(self):
        
        # Take only the relevant columns in the dataframe
        
        l_relevant_columns = ["host_is_superhost","neighbourhood_cleansed","neighbourhood_group_cleansed","property_type","room_type","accommodates","bathrooms_text","beds","price","minimum_nights","maximum_nights","availability_30","availability_365","number_of_reviews","instant_bookable", "amenities", "host_verifications"]

        self.df_city = self.df_city[l_relevant_columns]
        
        self.df_city["bathrooms_text"].replace(np.nan, "?", inplace = True)
        
        # Get numbers out of bathroom_text columns
        
        l_nums = [re.findall(r'\d+',i) for i in self.df_city["bathrooms_text"].values]

        l_nums_completed = []

        for i in l_nums:

            if len(i) > 1:

                l_nums_completed.append('.'.join(i))

            elif len(i) == 0:

                l_nums_completed.append('0')

            else:

                l_nums_completed.append(i[0])

        # Separate categories from bathroom_text
        
        l_category = []

        for i in self.df_city["bathrooms_text"].values:

            if "shared" in i:

                l_category.append("Shared")

            elif "private" in i:

                l_category.append("Private")

            else:

                l_category.append("Unknown")
                
        # Create two different columns replacing bathroom_text
        
        self.df_city.drop("bathrooms_text", axis = 1, inplace = True)
                
        self.df_city["num_of_baths"] = l_nums_completed
        
        self.df_city["bath_category"] = l_category
        
        self.df_city["num_of_baths"] = self.df_city["num_of_baths"].astype("float64")

        # Column["prices"]
        
        self.df_city["price"]  = self.df_city["price"] .apply(lambda x: float(x.strip("$").replace(',', '')) if pd.notnull(x) else x)
        
        self.df_city["amenities"] = [len(i) for i in self.df_city["amenities"]]

        self.df_city["host_verifications"] = [len(i) for i in self.df_city["host_verifications"]]
        
        
        print("Dataframe sucessfully created!")
    
    def label_encoding(self):
        
        self.df_city.dropna(inplace=True)
        
        # Encoding columns with dummies function
        
        def dummies(data, column):
            return pd.get_dummies(data = data[column], drop_first=True)
        
        self.df_city["host_is_superhost"] = dummies(self.df_city, "host_is_superhost")
        self.df_city["instant_bookable"] = dummies(self.df_city, "instant_bookable")
        
        df_room_type = dummies(self.df_city, "room_type")
        df_bath_category = dummies(self.df_city, "bath_category")
        df_bath_category = df_bath_category.rename(columns={'Shared': 'shared_bath', 'Unknown': 'unknoun_bath'})
        
        self.df_city = pd.concat([self.df_city, df_bath_category], axis = 1)

        self.df_city = pd.concat([self.df_city, df_room_type], axis = 1)

        self.df_city.drop("room_type", axis = 1, inplace = True)

        self.df_city.drop("bath_category", axis = 1, inplace = True)
        
        # Encoding categorical columns with labelEncoding function
        
        l_columns_to_labelEncode = ["neighbourhood_cleansed", "property_type", "neighbourhood_group_cleansed"]
        l_columns_encoded = list()

        for i in l_columns_to_labelEncode:

            # Inicializing object LabelEncoder()
            o_labelEncoding = LabelEncoder()

            # Training it with the column data
            o_labelEncoding.fit(self.df_city[i].values)

            # Transform the column
            l_columns_encoded.append(o_labelEncoding.transform(self.df_city[i].values))

        self.df_city["neighbourhood_cleansed"] = l_columns_encoded[0]
        self.df_city["property_type"] = l_columns_encoded[1]
        self.df_city["neighbourhood_group_cleansed"] = l_columns_encoded[2]
        
        print("Dataframe sucessfully encoded!")

        
    def return_df(self):
    
        return self.df_city
    
    def display_df(self):
    
        display(self.df_city)

In [402]:
madrid = airbnb_city("/home/hack/Curso/Temario/Repositorios/Coisigna/dsb-p2-ml/datasets/Madrid air bnb/listings_detailed.csv")

Instance created!


In [403]:
madrid.clean_columns()

Dataframe sucessfully created!


In [404]:
madrid.label_encoding()

Dataframe sucessfully encoded!


In [405]:
madrid.display_df()

Unnamed: 0,host_is_superhost,neighbourhood_cleansed,neighbourhood_group_cleansed,property_type,accommodates,beds,price,minimum_nights,maximum_nights,availability_30,...,number_of_reviews,instant_bookable,amenities,host_verifications,num_of_baths,shared_bath,unknoun_bath,Hotel room,Private room,Shared room
0,1,60,4,23,2,1.0,60.0,1,1125,30,...,78,0,272,80,1.0,1,0,0,1,0
1,0,44,9,23,1,1.0,31.0,4,40,29,...,33,0,444,98,1.0,0,1,0,1,0
2,0,67,0,4,6,5.0,50.0,15,730,1,...,0,0,631,109,2.0,0,1,0,0,0
3,0,117,3,4,3,1.0,92.0,5,730,12,...,10,1,460,18,1.0,0,1,0,0,0
4,0,67,0,38,1,1.0,26.0,2,1125,30,...,149,0,215,55,1.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19612,0,98,9,4,2,2.0,40.0,30,1125,30,...,0,1,152,18,1.0,0,1,0,0,0
19613,0,50,3,23,1,1.0,23.0,30,1125,14,...,0,0,229,98,2.0,1,0,0,1,0
19614,0,56,5,23,1,1.0,21.0,30,1125,15,...,0,0,136,98,3.5,1,0,0,1,0
19615,0,13,5,23,1,1.0,22.0,7,1125,29,...,0,0,327,98,3.0,0,1,0,1,0


In [406]:
df = madrid.return_df()

In [387]:
sevilla = airbnb_city("/home/hack/Curso/Temario/Repositorios/Coisigna/dsb-p2-ml/datasets/Madrid air bnb/listings_sevilla.csv")

Instance created!


In [388]:
sevilla.clean_columns()

Dataframe sucessfully created!


In [389]:
sevilla.label_encoding()

Dataframe sucessfully encoded!


In [390]:
sevilla.display_df()

Unnamed: 0,host_is_superhost,neighbourhood_cleansed,neighbourhood_group_cleansed,property_type,accommodates,beds,price,minimum_nights,maximum_nights,availability_30,...,number_of_reviews,instant_bookable,amenities,host_verifications,num_of_baths,shared_bath,unknoun_bath,Hotel room,Private room,Shared room
0,0,87,0,10,4,2.0,99.0,2,120,0,...,168,1,275,18,1,0,1,0,0,0
1,0,83,0,10,3,1.0,75.0,3,1125,6,...,42,1,851,32,1,0,1,0,0,0
2,0,75,0,24,2,1.0,79.0,2,365,25,...,86,0,348,9,1,1,0,0,1,0
3,1,83,0,10,6,3.0,84.0,2,360,15,...,160,0,1081,18,1,0,1,0,0,0
4,0,83,0,10,6,4.0,85.0,3,365,0,...,103,1,812,18,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6489,0,97,10,10,2,1.0,56.0,2,365,10,...,0,0,796,18,1,0,1,0,0,0
6490,0,82,0,29,2,2.0,60.0,1,365,10,...,0,1,387,18,1,1,0,0,1,0
6491,0,97,10,10,4,3.0,83.0,1,365,27,...,0,1,160,32,1,0,1,0,0,0
6492,0,57,1,29,1,1.0,32.0,1,365,30,...,0,1,208,9,1,1,0,0,1,0


In [391]:
berlin = airbnb_city("/home/hack/Curso/Temario/Repositorios/Coisigna/dsb-p2-ml/datasets/Madrid air bnb/listings_berlin.csv")

Instance created!


In [392]:
berlin.clean_columns()
berlin.label_encoding()
berlin.display_df()

Dataframe sucessfully created!
Dataframe sucessfully encoded!


Unnamed: 0,host_is_superhost,neighbourhood_cleansed,neighbourhood_group_cleansed,property_type,accommodates,beds,price,minimum_nights,maximum_nights,availability_30,...,number_of_reviews,instant_bookable,amenities,host_verifications,num_of_baths,shared_bath,unknoun_bath,Hotel room,Private room,Shared room
0,0,64,10,18,5,3.0,88.0,1,1125,3,...,0,1,495,32,1,0,1,0,0,0
1,1,135,9,15,2,2.0,36.0,8,100,2,...,29,0,438,18,1,0,1,0,0,0
2,1,67,3,15,2,1.0,60.0,2,120,1,...,126,1,433,18,1,0,1,0,0,0
3,0,26,9,8,6,2.0,120.0,1,365,0,...,0,0,143,18,2,0,1,0,0,0
4,0,100,6,15,4,2.0,150.0,92,1125,15,...,147,0,279,18,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16675,0,101,11,34,3,3.0,99.0,1,1125,16,...,0,0,299,18,1,0,1,0,1,0
16676,1,77,11,15,4,2.0,160.0,4,1125,2,...,23,0,890,18,1,0,1,0,0,0
16677,0,101,11,34,6,4.0,195.0,1,1125,30,...,0,0,434,18,1,0,1,0,1,0
16678,0,101,11,15,2,1.0,60.0,90,365,0,...,0,1,674,18,1,0,1,0,0,0
