In [31]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

def load_and_process(url_or_path_to_csv_file):

    # Method Chain 1 (Load data and deal with missing data)
    df1 = (
          pd.read_csv(url_or_path_to_csv_file)
          .loc[:, [
              "id",
                    "neighbourhood_cleansed",
                    "host_response_time",
                    "host_is_superhost",
                    "property_type",
                    "room_type",
                    "bathrooms_text",
                    "bedrooms",
                    "beds",
                    "amenities",
                    "price",
                    "availability_365",
                    "minimum_nights",
                    "number_of_reviews",
                    "review_scores_accuracy",
                    "review_scores_cleanliness",
                    "review_scores_communication",
                    "review_scores_value",
                    "calculated_host_listings_count"
          ]]
          .rename(columns={
                    "id": "Airbnb_Id",
                    "neighbourhood_cleansed": "Neighbourhood",
                    "host_response_time": "Host_ResponseTime",
                    "host_is_superhost": "Superhost",
                    "property_type": "Property_Type",
                    "room_type": "Room_Type",
                    "bathrooms_text": "Num_Baths_Detailed",
                    "bedrooms": "Num_Bedrooms",
                    "beds": "Num_Bedrooms",
                    "price": "Price_per_Night",
                    "amenities": "Amenities",
                    "minimum_nights": "Minimum_Nights",
                    "availability_365": "Availablility_365Days",
                    "number_of_reviews": "Num_Reviews",
                    "review_scores_accuracy": "Accuray_ReviewScore",
                    "review_scores_cleanliness": "Cleanliness_ReviewScore",
                    "review_scores_communication": "Communication_ReviewScore",
                    "review_scores_value": "Value_ReviewScore",
                    "calculated_host_listings_count": "Num_Host_Listings"
                    }, errors= "raise")
            .dropna()
      )
    
    
    # Method Chain 2 (Create new columns, drop others, and do processing)
    df2 = ( df1
           .assign(Superhost = df1["Superhost"].apply(lambda x: x == "t"))
           .assign(Num_Baths = df1["Num_Baths_Detailed"].apply(lambda x: (str(x)[0])))
           .assign(Price_per_Night = df1["Price_per_Night"].apply(lambda x: float(x[1:].replace(",",""))))
           .reset_index(drop=True)
          )

    # Shifts the columns around to the right position
    first_column = df2.pop('Num_Baths')
    df2.insert(0, 'Num_Baths', first_column)
    return df2

In [33]:
# EDA

df = load_and_process("../../../data/raw/Vancouver.csv")

df.shape

df.head()

df.describe()

Unnamed: 0,Num_Baths,Airbnb_Id,Neighbourhood,Host_ResponseTime,Superhost,Property_Type,Room_Type,Num_Baths_Detailed,Num_Bedrooms,Num_Bedrooms.1,Amenities,Price_per_Night,Availablility_365Days,Minimum_Nights,Num_Reviews,Accuray_ReviewScore,Cleanliness_ReviewScore,Communication_ReviewScore,Value_ReviewScore,Num_Host_Listings
0,2,10080,Downtown,within an hour,False,Entire condominium,Entire home/apt,2 baths,2.0,2.0,"[""Iron"", ""Dryer"", ""Bed linens"", ""Cooking basic...",150.0,346,90,16,9.0,9.0,9.0,9.0,40
1,1,13358,West End,within a day,True,Entire condominium,Entire home/apt,1 bath,1.0,1.0,"[""Iron"", ""Dryer"", ""Bed linens"", ""Cooking basic...",85.0,98,30,430,9.0,10.0,10.0,9.0,1
2,1,13490,Kensington-Cedar Cottage,within an hour,True,Entire apartment,Entire home/apt,1 bath,1.0,1.0,"[""Iron"", ""Outdoor dining area"", ""Ethernet conn...",145.0,203,30,88,10.0,10.0,10.0,10.0,1
3,1,14267,Kensington-Cedar Cottage,within a few hours,False,Entire house,Entire home/apt,1 bath,1.0,2.0,"[""Iron"", ""Dryer"", ""Cooking basics"", ""Hot water...",140.0,123,3,33,10.0,9.0,9.0,9.0,1
4,1,16611,Grandview-Woodland,a few days or more,False,Entire house,Entire home/apt,1 bath,3.0,4.0,"[""Heating"", ""Essentials"", ""Kitchen"", ""Iron"", ""...",100.0,89,30,3,8.0,6.0,9.0,7.0,5
