In [1]:
import pandas as pd
import numpy as np
import openpyxl
import re
from re import sub
from decimal import Decimal
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from numpy import unique, where
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.metrics import r2_score, silhouette_samples, silhouette_score , completeness_score , homogeneity_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from IPython.display import display, HTML, display_html
import pickle

In [9]:
class amenities_processer():
    def __init__(self):
        
        pass
    
    def drop_outliers_IQR(self,df):
        self.df = df
        iqr = self.df['price'].quantile(0.75) - self.df['price'].quantile(0.25)
        lower_limit = self.df['price'].quantile(0.25) - 1.5*iqr
        upper_limit = self.df['price'].quantile(0.75) + 1.5*iqr
        self.final_df = self.df[(self.df['price']>=lower_limit) & (self.df['price']<=upper_limit)]

        return self.final_df

    def process_airbnb_data(self,df,amenities_universe):
        self.df = df
        self.amenities_universe = amenities_universe
#         display(self.df.head())
        ###################################################################################################
        # Initial cleaning of Airbnb data
        ###################################################################################################
        self.df['price'] = self.df['price'].apply(lambda x: float(Decimal(sub(r'[^\d.]', '', str(x)))))
        self.df['neighbourhood_cleansed'] = self.df['neighbourhood_cleansed'].apply(lambda x: str(x).lower())
        self.df['amenities'] = self.df['amenities'].apply(lambda x: str(x).lower())
        self.df['property_type'] = self.df['property_type'].apply(lambda x: str(x).lower())
        self.df['room_type'] = self.df['room_type'].apply(lambda x: str(x).lower())
        self.df['bathrooms_text'] = self.df['bathrooms_text'].fillna(0)
        self.df['bathrooms_text'] = self.df['bathrooms_text'].apply(lambda x: str(x).lower())
        self.df['bedrooms'] = self.df['bedrooms'].fillna(0)
        self.df['beds'] = self.df['beds'].fillna(0)
        self.df['latitude'] = np.round(self.df['latitude'],5)
        self.df['longitude'] = np.round(self.df['longitude'],5)
        
        ###################################################################################################
        # AMENITIES COUNT AND VECTORIZER
        ###################################################################################################        
        # Use regex to convert the string of amenities into individual string object to check if the object is in the amenities universe
        self.pattern = r'\"(.*?)\"'
        # This step ignores the idea that there might be more than 1 amenities
        self.df['amenities_clean'] = self.df['amenities'].apply(lambda x: [i for i in re.findall(self.pattern,x) if i in self.amenities_universe])
        # Convert list of amenities for each property for subsequent use of Vectoriser
        self.df['amenities_clean_vec'] = self.df['amenities_clean'].apply(lambda x: '"'.join(x))
        # Count the number of amenities listed for the property
        self.df['amenities_count'] = self.df['amenities_clean'].apply(lambda x: len(x))
        

        # Count vectorizer - This step will naturally collate the full list of amenities based on the detail dataset
        #                    which is a subset of the amenities_universe
        # Use regex to tokenize the string for count vectorizer
        self.vectorizer_count = CountVectorizer(token_pattern = self.pattern)
        self.property_amenities_list = [i for i in self.df['amenities_clean_vec']]

        self.property_features_count_vectorized = self.vectorizer_count.fit_transform(self.property_amenities_list)
        self.amenities_feature_count_name_clean = self.vectorizer_count.get_feature_names_out()
        self.amenities_feature_count_data_clean = self.property_features_count_vectorized.toarray()
        self.amenities_feature_count_df = pd.DataFrame(self.amenities_feature_count_data_clean,columns = self.amenities_feature_count_name_clean)
        
        ###################################################################################################
        # BATHROOM
        ###################################################################################################        
        # Bathroom
        # 3 new columns:
        #   - bathroom_count
        #   - bathroom_type: Assumed to be "private" if "shared" is not mentioned 
        #   - bathrooms_type_code: shared=0 & private=1
        self.float_pattern = r"[-+]?(?:\d*\.*\d+)"

        self.df['bathrooms_count'] = self.df['bathrooms_text'].apply(lambda x: re.findall(self.float_pattern,str(x)) if x != 'half' else [0.5])
        self.df['bathrooms_count'] = self.df['bathrooms_count'].apply(lambda x: x[0] if len(x)>0 else 0)
        self.df['bathrooms_type'] = self.df['bathrooms_text'].apply(lambda x: 'shared' if 'shared' in str(x) else 'private')
        self.df['bathrooms_type_code'] = np.where(self.df['bathrooms_type']=='shared',0,1)

        ###################################################################################################
        # PROPERTY AND ROOM TYPE
        ###################################################################################################        
        # 3 new columns:
        #   - property_type_clean: Remove qualitative words in the "prohibitedwords" list to identify unique property type
        #   - property_type_code: Assign a number to each property type
        #   - room_type_code: Assign a number to each room type (This offers qualitative description for the property type i.e. "private","shared")
        self.prohibitedWords = ['private', 'shared', ' in ', 'entire', 'room'] # added spaces in front and behind 'in' to ensure accuracy
        self.big_regex = re.compile('|'.join(map(re.escape, self.prohibitedWords)))
        self.df['property_type_clean'] =  self.df['property_type'].apply(lambda x: self.big_regex.sub("", x).strip())
        self.df['property_type_clean'] = np.where(self.df['property_type_clean']=='',self.df['room_type'],self.df['property_type_clean'])

        # create dictionary for property type code
        self.property_type_list = list(np.unique(self.df['property_type_clean']))
        self.property_type_dict = dict(zip(self.property_type_list,np.arange(len(self.property_type_list))))

        # create dictionary for room type code
        self.room_type_list = list(np.unique(self.df['room_type']))
        self.room_type_dict = dict(zip(self.room_type_list,np.arange(len(self.room_type_list))))

        # Assign property type code
        self.df['property_type_code'] = self.df['property_type_clean'].apply(lambda x: self.property_type_dict[x])

        # Assign room type code
        self.df['room_type_code'] = self.df['room_type'].apply(lambda x: self.room_type_dict[x])
        
        ###################################################################################################
        # NEIGHBOURHOOD CLEANSED
        ###################################################################################################  
        # 1 new column:
        #    - neighbourhood_cleansed_code: index the neighbourhood_cleansed column to further segregate the property according to the location

        self.neighbourhood_cleansed_dict = dict(zip(np.unique(self.df['neighbourhood_cleansed']),np.arange(len(np.unique(self.df['neighbourhood_cleansed'])))))
        self.df['neighbourhood_cleansed_code'] = self.df['neighbourhood_cleansed'].apply(lambda x: self.neighbourhood_cleansed_dict[x])
        
        ###################################################################################################
        # FINAL COMBINED DATAFRAME
        ###################################################################################################        
        # Recontruct the final dataframe for analysis
        self.required_column_list = ['property_type','room_type', 'neighbourhood_cleansed',
                                'bedrooms', 'beds','bathrooms_type_code','bathrooms_count','minimum_nights', 'maximum_nights',
                                'amenities_count',]

        self.final_df = self.df[self.required_column_list].join(self.amenities_feature_count_df)
        
        return self.final_df      

In [10]:
airbnb_la_listing  = r'../../../Data/LA_data/listings_detailed.csv'

amenities_file = open(r"./amenities_universe.txt", "r",encoding='unicode escape')
amenities_universe = amenities_file.read().split('\n')


In [11]:
processor = amenities_processer()

final_df = processor.process_airbnb_data(airbnb_la_listing,amenities_universe)

TypeError: string indices must be integers

In [8]:
airbnb_la_listing

'../../../Data/LA_data/listings_detailed.csv'

In [49]:
final_df

Unnamed: 0,price,latitude,longitude,property_type_code,room_type_code,neighbourhood_cleansed_code,bedrooms,beds,bathrooms_type_code,bathrooms_count,...,tennis court,terrace,toaster,tv,washer,waterfront,wifi,window guards,wine cooler,wine glasses
0,300.0,34.02438,-118.38374,22,2,52,1.0,1.0,1,2,...,0,0,0,0,0,0,0,0,0,0
1,46.0,34.10420,-118.34748,14,2,104,1.0,2.0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,140.0,34.00985,-118.40798,21,0,52,1.0,1.0,1,1,...,0,0,0,1,0,0,0,0,0,0
3,340.0,34.05303,-118.39449,42,0,169,3.0,5.0,1,3,...,0,0,0,0,1,0,0,0,0,0
4,115.0,33.98301,-118.38607,14,0,52,2.0,3.0,1,2,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40433,128.0,34.09101,-118.34462,22,0,250,0.0,1.0,1,1,...,0,0,0,0,1,0,0,0,0,0
40434,180.0,33.97551,-118.43066,34,0,172,2.0,3.0,1,2,...,0,0,0,1,0,0,0,0,0,0
40435,104.0,34.01423,-118.32089,22,0,127,2.0,2.0,1,1.5,...,0,0,0,0,1,0,0,0,0,0
40436,167.0,33.98519,-118.39341,34,0,52,2.0,2.0,1,1,...,0,0,1,0,1,0,0,0,0,1
