# AirBnB Optimal Price - EDA Notebook

## First Steps - Imports & Reading In Data

In [1]:
# Importing Libraries
import pandas as pd
pd.options.display.max_colwidth = 750   # Need it so I can better view amenities column
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import numpy as np
import pandarallel
from sklearn.preprocessing import StandardScaler, Normalizer, Binarizer, OrdinalEncoder, LabelEncoder
from category_encoders import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import keras
import tensorflow as tf

In [2]:
# Assigning filepaths of our airbnb data from Kaggle to variables
filename = "./train.csv"
filename2 = "./test.csv"

In [3]:
# Reading our CSV files into Pandas DataFrames
train = pd.read_csv(filename)
test = pd.read_csv(filename2)

## Combining Datasets

In [4]:
# Concatenating the two datasets into one. It's unnecessary to have pre-split training and test data. This is only useful for the holdout method.
# With cross validation and Keras' validation_split method, this pre-splitting of data is totally unnecessary. Besides, I want an overview of ALL
# of the data at once.
df = pd.concat([train, test])

In [5]:
# Sanity Check for confirming the concatenation worked out
train_num = train.shape[0]
print("Number of Observations for Pre-Split Training DataFrame: ", train_num)
print(" ")
test_num = test.shape[0]
print("Number of Observations for Pre-Split Testing DataFrame: ", test_num)
print(" ")
complete_num = train_num + test_num
print("Number of Observations for Concatenated DataFrame: ", complete_num)
print(" ")
if df.shape[0] == complete_num:
    print("Sanity Check -  Succesful!")
else:
    ("Sanity Check - Unsuccessful!")

Number of Observations for Pre-Split Training DataFrame:  74111
 
Number of Observations for Pre-Split Testing DataFrame:  25458
 
Number of Observations for Concatenated DataFrame:  99569
 
Sanity Check -  Succesful!


## Initial EDA & Data Clean-Up

In [6]:
df.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitchen,Heating,""Family/kid friendly"",Essentials,""Hair dryer"",Iron,""translation missing: en.hosting_amenity_50""}",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c034-459c-bc82-6522c957627c.jpg?aki_policy=small,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitchen,Heating,""Family/kid friendly"",Washer,Dryer,""Smoke detector"",""Fire extinguisher"",Essentials,Shampoo,Hangers,""Hair dryer"",Iron,""translation missing: en.hosting_amenity_50""}",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4b65-452a-b48a-bfecb3b58a66.jpg?aki_policy=small,10019.0,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air conditioning"",Kitchen,Breakfast,""Buzzer/wireless intercom"",Heating,""Family/kid friendly"",""Smoke detector"",""Carbon monoxide detector"",""Fire extinguisher"",Essentials,Shampoo,Hangers,""Hair dryer"",Iron,""Laptop friendly workspace"",""translation missing: en.hosting_amenity_50""}",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9e3a-4fa9-aa54-bbd5ea26538d.jpg?aki_policy=small,10027.0,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Kitchen,""Indoor fireplace"",""Buzzer/wireless intercom"",Heating,Washer,Dryer,""Smoke detector"",""Carbon monoxide detector"",""First aid kit"",""Fire extinguisher"",Essentials}",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9c86-41ea-a735-43d933111063.jpg?aki_policy=small,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditioning"",Kitchen,""Elevator in building"",Heating,""Smoke detector"",""Carbon monoxide detector"",""Fire extinguisher"",Essentials,Shampoo}",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0


In [7]:
df.nunique()
# Every observation has its own unique ID, so I'll both set and sort the DataFrame by the 'id' column.

id                        99569
log_price                   767
property_type                35
room_type                     3
amenities                 88651
accommodates                 16
bathrooms                    18
bed_type                      5
cancellation_policy           6
cleaning_fee                  2
city                          6
description               98683
first_review               2643
host_has_profile_pic          2
host_identity_verified        2
host_response_rate           80
host_since                 3118
instant_bookable              2
last_review                1457
latitude                  99569
longitude                 99569
name                      98653
neighbourhood               627
number_of_reviews           394
review_scores_rating         56
thumbnail_url             88468
zipcode                     801
bedrooms                     12
beds                         19
dtype: int64

In [8]:
# Confirming the datatypes in our DataFrame
df.dtypes
# We have some booleans here. They don't go well with ML models. Going to need to convert them to binary.

id                          int64
log_price                 float64
property_type              object
room_type                  object
amenities                  object
accommodates                int64
bathrooms                 float64
bed_type                   object
cancellation_policy        object
cleaning_fee                 bool
city                       object
description                object
first_review               object
host_has_profile_pic       object
host_identity_verified     object
host_response_rate         object
host_since                 object
instant_bookable           object
last_review                object
latitude                  float64
longitude                 float64
name                       object
neighbourhood              object
number_of_reviews           int64
review_scores_rating      float64
thumbnail_url              object
zipcode                    object
bedrooms                  float64
beds                      float64
dtype: object

In [9]:
# Sorting DataFrame by our index before setting it as such
df.sort_values(by=["host_since"], inplace=True, ascending=True)   # host_since is the closest thing to a date or date_time column we have
df.set_index("host_since", inplace=True)

In [10]:
# Dealing with null values
df.isnull().sum().sum()
# We're going to drop a lot of rows. Luckily, we have enough.
# If we kept these, converting the NaNs to zeroes would have given us an imbalanced dataset.
# Not worth dealing with that.

137833

In [11]:
# Removing null values. Can't interpolate them considering we lack a timeseries variable. Not every sample is worth keeping.
df.dropna(axis=0, inplace=True)

In [12]:
# Converting bools to binary ints
df.loc[(df["cleaning_fee"] == False), 'cleaning_fee'] = 0
df.loc[(df["cleaning_fee"] == True), 'cleaning_fee'] = 1

df.loc[(df["instant_bookable"] == 'f'), 'instant_bookable'] = 0
df.loc[(df["instant_bookable"] == 't'), 'instant_bookable'] = 1

df.loc[(df["host_identity_verified"] == 'f'), 'host_identity_verified'] = 0
df.loc[(df["host_identity_verified"] == 't'), 'host_identity_verified'] = 1

## Feature Engineering and Exploring our Target

In [13]:
# Inverting 'price_log' to get original price values WIP
df["price"] = np.exp(df["log_price"])

##### Need to round the decimal places to the second place!####


In [14]:
# Confirming the average prices WIP
price_mean = df["price"].mean()
price_sorted = df["price"].sort_values()

print("Average AirBnB Price Listings: ", price_mean)
print(" ")
print("Sorted AirbnB Price Listings: ", price_sorted)

Average AirBnB Price Listings:  146.1157082748948
 
Sorted AirbnB Price Listings:  host_since
2017-07-20       1.0
2013-08-14       5.0
2013-07-24      10.0
2013-10-20      10.0
2014-12-09      10.0
               ...  
2014-05-19    1950.0
2014-05-19    1950.0
2013-02-01    1950.0
2013-04-18    1995.0
2017-08-26    1999.0
Name: price, Length: 38502, dtype: float64


In [15]:
df.price.describe

<bound method NDFrame.describe of host_since
2008-03-03    175.0
2008-06-27    122.0
2008-07-31    120.0
2008-08-16     40.0
2008-08-16    150.0
              ...  
2017-09-21    140.0
2017-09-21    186.0
2017-09-22    100.0
2017-09-22     45.0
2017-09-25     81.0
Name: price, Length: 38502, dtype: float64>

In [16]:
# Confirming the absolute value of the correlations of our features with our target, 'price'.
# Its interesting that the logarithm of 'price' is only 85% correlated
abs(df.corr()["price"]).sort_values(ascending=False)

price                   1.000000
log_price               0.855574
accommodates            0.583134
bedrooms                0.550352
beds                    0.488349
bathrooms               0.459425
longitude               0.060459
review_scores_rating    0.054083
number_of_reviews       0.052034
latitude                0.012907
id                      0.005358
Name: price, dtype: float64

In [17]:
# Create heatmap


## TODO: 
---
- Do visualization of 'price' by 'zipcode'. Why? Because when one first searches for a place to stay on airbnb's site/app, they see a geomap with the prices for the various properties within that map.
- 

## Feature Engineering: Binarizing Amenities

In [18]:
df["amenities"].iloc[0]

'{TV,"Cable TV",Internet,"Wireless Internet",Heating,Washer,Dryer,"Smoke detector","Carbon monoxide detector","First aid kit","Fire extinguisher",Essentials,Shampoo,"24-hour check-in",Hangers,"Hair dryer",Iron,"Laptop friendly workspace","translation missing: en.hosting_amenity_49","translation missing: en.hosting_amenity_50","Hot water","Bed linens"}'

In [19]:
# Will use OHE to accomplish this

#### WIP ####

# Dropping high cardinality and otherwise useless columns
---
- Will need to drop 'log_price' to avoid target/data leakage

In [20]:
# Assigning high cardinal columns to a variable
cutoff = 100
HCC_cols = [col for col in df.select_dtypes('object').columns
                 if df[col].nunique() > cutoff]

In [21]:
# Confirming what the HCC columns are
HCC_cols

['amenities',
 'description',
 'first_review',
 'last_review',
 'name',
 'neighbourhood',
 'thumbnail_url',
 'zipcode']

In [22]:
# Creating a dataframe out of these HCC columns (excluding those that I know are important)
# If there is enough time for a stretch goal, I'll remove description from this list and vectorize it for sentiment analysis and include it in our
# deployed web app
df_HCC = df[[
             'description',
             'first_review',
             'last_review',
             'name',
             'thumbnail_url'
            ]]

In [23]:
df_HCC.head(3)

Unnamed: 0_level_0,description,first_review,last_review,name,thumbnail_url
host_since,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-03-03,"Our guest unit is a great way for couples or those on business to experience San Francisco. We put extra effort into making the accommodation comfortable and providing recommendations for how to have the most memorable trip. The space is an in-law unit located below the main house (where we live); it is completely private. You will have your own entrance through the backyard garden. The space consists of a bedroom with king-size bed, a very small living room, and a small bathroom; there is no kitchen available! The space is ideal for one or two adults (no children please); total of 350 square feet. There is a comfortable chair and desk if you need a space to work. There is a good natural light in the bedroom, but not in the living room ...",2014-07-24,2017-09-21,The Emerald Garden,https://a0.muscache.com/im/pictures/44267707/a7c477c9_original.jpg?aki_policy=small
2008-06-27,"*** Unit upgraded with new bamboo flooring, brand new Ultra HD 50"" Sony TV, new paint, new lighting, new mattresses, ultra fast cable Internet connection, Apple TV, (SENSITIVE CONTENTS HIDDEN) Chromecast. *** Gorgeous and Elegant Furnished Apartment in front of Culver City Fox Hills Park. Upper corner unit, total silence protected by trees. Short walk to the new Westfield Mall. Tennis courts, heated pool and jacuzzi hot tub. *** Unit upgraded with new bamboo flooring, brand new Ultra HD 50"" Sony TV, new paint, new lighting, new mattresses, ultra fast cable Internet connection. *** Gorgeous and Elegant Furnished Apartment in front of Culver City Fox Hills Park. Upper corner unit, total silence protected by trees. Short walk to the new ...",2011-08-15,2016-05-15,Amazing bright elegant condo park front *UPGRADED*,https://a0.muscache.com/im/pictures/4321499/1da9892a_original.jpg?aki_policy=small
2008-07-31,"Enjoy a fully furnished, charming one-bedroom apartment built in the 1930’s that is located in lovely residential neighborhood near the LA County Museum of Art, Peterson Auto Museum, Melrose District, Grove Shopping Mall and much more. Apartment appointed with all amenities for a comfortable stay -- full kitchen, dining room, living room, large bedroom, bathroom with separate shower and bathtub. Laundry room on premises. Parking pass supplied. Hotel Luxury, Comfort of Home, Amazing Price!!! • Smokeless Environment • Hardwood Floors • Large Bedroom with Queen Size Bed • Furnished Dining Room & Living Room • Air Mattress Available for Guests • Nicely Appointed Kitchen • Bathroom with Separate Shower & Tub • Fresh Bedding & Linen...",2013-08-30,2017-03-08,Spacious Historic 30's Apartment in Central LA,https://a0.muscache.com/im/pictures/1585/e1232151_original.jpg?aki_policy=small


In [24]:
df.shape

(38502, 29)

In [25]:
df.columns

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'instant_bookable', 'last_review', 'latitude', 'longitude', 'name',
       'neighbourhood', 'number_of_reviews', 'review_scores_rating',
       'thumbnail_url', 'zipcode', 'bedrooms', 'beds', 'price'],
      dtype='object')

In [26]:
df.drop(columns="host_has_profile_pic", inplace=True)  # Why would a profile pic matter? lol
df.drop(columns="host_response_rate", inplace=True)   # Not important. We just need the host to respond is all. The rate at which they do is superfluous
df.drop(columns="number_of_reviews", inplace=True)   # Its not the number of reviews that matter, but the average rating of them
df.drop(columns="log_price", inplace=True)   # This will lead to data leakage
df.drop(columns="description", inplace=True)   # Literally just a written version of our lat, long, and zipcode features
df.drop(columns="first_review", inplace=True)  # Unnecessary
df.drop(columns="last_review", inplace=True)  # Useless
df.drop(columns="thumbnail_url", inplace=True)  # When do URLs ever help ML models?
df.drop(columns="id", inplace=True)   # Since we are IDs necessary? 
df.drop(columns="amenities", inplace=True) # This column is a messed up dictionary. Will be dropping it for now. Will add it back in later when perfecting the model

In [27]:
df.head(3)

Unnamed: 0_level_0,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,host_identity_verified,instant_bookable,latitude,longitude,name,neighbourhood,review_scores_rating,zipcode,bedrooms,beds,price
host_since,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2008-03-03,Guesthouse,Entire home/apt,2,1.0,Real Bed,flexible,1,SF,1,0,37.751263,-122.42606,The Emerald Garden,Noe Valley,97.0,94114,1.0,1.0,175.0
2008-06-27,Condominium,Entire home/apt,6,2.0,Real Bed,strict,1,LA,1,0,33.982095,-118.384935,Amazing bright elegant condo park front *UPGRADED*,Culver City,80.0,90230,2.0,3.0,122.0
2008-07-31,Apartment,Entire home/apt,2,1.0,Real Bed,strict,1,LA,1,0,34.071556,-118.350786,Spacious Historic 30's Apartment in Central LA,Mid-Wilshire,100.0,90036,1.0,1.0,120.0


## Plotly Express - Scatter Mapbox

In [28]:
figure = px.scatter_mapbox(
                           df,
                           lat='latitude',
                           lon='longitude',
                           hover_name='name',
                           hover_data=['price',
                                       'review_scores_rating'],
                           color_discrete_sequence=["goldenrod"],
                           zoom=3, height=350
                           )

figure.update_layout(mapbox_style="open-street-map")

figure.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

figure.show()

# Plotly Express - Analysis: 
---

We only have data on Los Angeles, San Francisco, Chicago, D.C., NYC & Boston. The Continental USA. Considering that I reversed the natural logarithm of our 'log_price' variable, it seems that those price values are all in USD.

Given this, I believe training my model on just these cities alone (with ample regularization tuning) will be enough to account for other major cities around the US.

In [29]:
# # Doing this here after the big Plotly Dash map
df.drop(columns="name", inplace=True)   # Names are purely nominal
df.drop(columns="latitude", inplace=True)    # Won't work with web app
df.drop(columns="longitude", inplace=True)
df.drop(columns="neighbourhood", inplace=True)   # Redudant when we already have zipcode

## Automated EDA 

In [53]:
Report = ProfileReport(df, explorative=True).to_notebook_iframe()
Report

Summarize dataset:   0%|          | 0/28 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Given Spearman's p, the features most correlated with price are accommodates, beds, bedrooms, and bathrooms (in that order). Review_scores_rating barely correlates at all. Which is rather surprising.  

## FINAL SELECTION OF FEATURES
---
- save for 'price' as its our target

In [None]:
# df.columns

In [None]:
# Amenities For Loop to 
# df.amenities

In [None]:
# df

In [None]:
# Not going to use ordinal, OHE, or dummy encoding. It creates far too many features. We'll run into the curse of dimensionality if I do this.
# Gonna need to embed our categorical columns. Might just use difference encoding for the converted binary columns though.
# df_dummy = pd.get_dummies(df)
# df_dummy

## Data Preprocessing

Creating both standardized and normalized versions of this dataset for our predictive model. Which preprocessing technique is best for our model? We'll just have to experiment with both!

- Will need to vectorize whichever string/object columns I end up using after dropping HCC and other useless columns up above

In [31]:
df_encoded = df.copy()

In [32]:
df_encoded.head()

Unnamed: 0_level_0,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,host_identity_verified,instant_bookable,review_scores_rating,zipcode,bedrooms,beds,price
host_since,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2008-03-03,Guesthouse,Entire home/apt,2,1.0,Real Bed,flexible,1,SF,1,0,97.0,94114,1.0,1.0,175.0
2008-06-27,Condominium,Entire home/apt,6,2.0,Real Bed,strict,1,LA,1,0,80.0,90230,2.0,3.0,122.0
2008-07-31,Apartment,Entire home/apt,2,1.0,Real Bed,strict,1,LA,1,0,100.0,90036,1.0,1.0,120.0
2008-08-16,Apartment,Private room,2,1.0,Real Bed,moderate,1,Chicago,1,0,89.0,60615,1.0,1.0,40.0
2008-08-16,Other,Entire home/apt,10,1.0,Real Bed,strict,1,Chicago,1,0,71.0,60615,4.0,4.0,150.0


In [33]:
def unique(list1):
 
    # intilize a null list
    unique_list = []
     
    # traverse for all elements
    for x in list1:
        # check if exists in unique_list or not
        if x not in unique_list:
            unique_list.append(x)
    # print list
    for x in unique_list:
        print (x,)

In [34]:
unique(df.city)

SF
LA
Chicago
NYC
Boston
DC


In [36]:
unique(df.property_type)

Guesthouse
Condominium
Apartment
Other
House
Townhouse
Loft
Cabin
Bungalow
Bed & Breakfast
Dorm
Guest suite
Chalet
Villa
Boat
Vacation home
Camper/RV
In-law
Treehouse
Tipi
Hostel
Serviced apartment
Yurt
Earth House
Boutique hotel
Timeshare
Tent
Castle
Cave
Train
Hut


In [37]:
unique(df.bed_type)

Real Bed
Pull-out Sofa
Airbed
Couch
Futon


In [38]:
unique(df.room_type)

Entire home/apt
Private room
Shared room


In [39]:
unique(df.cancellation_policy)

flexible
strict
moderate
super_strict_30
super_strict_60


In [40]:
unique(df.zipcode)

94114
90230
90036
60615
11221
60637
10026
10065
11223
11216
90405
90029
11106
10001.0
11365
90046
11373.0
91604
10038
60622
11206.0
02115
10024
10009
60616
20011
20020
20002
02128
94131
20003
20017
20009
20001
20010
20019
20007
10031
11105
90019
10036
94110
10016
11215
10025
10002
10002.0
10023
90066
90026
11206
11217
02119
10014
10027
10019
90292
94103
10013
90016
11211.0
11249.0
11231.0
91367
10069
11201
94115
60614
10304
02131
11222
10029.0
90291
20037
20016
02135
11218
10003.0
90042
20018
11225
11225.0
60657
11216.0
94134
90039
11233
94127
90211
90404
90021
94117.0
10009.0
02116
02114
02210
20005
10039
91601
94117
90012
02108
90045
90028
02120
11434
90802
11237
11238
90277
94118
90048
11103
20012
90302
11226.0
02134
10011.0
02113
90212
94122
90068
11205
02130
10304.0
20008
90254
90004
90024
11101
90266
91030
10034
94121
11205.0
60647
11207
11230
10017
10010
91101
11693
10128
10018
91607
60618
60605
10162
94115.0
11231
11102
11226
11692
10012
11228.0
11238.0
91106
60611
02125
02122


In [None]:
# List comprehension version that returns numpy arrays 
# label_encoder = LabelEncoder()

# objects = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city', 'name', 'neighbourhood']

# [label_encoder.fit_transform(df[obj]).astype(int) for obj in objects]

In [41]:
# USING LABEL ENCODING IN A FOR LOOP! THIS IS SO MUCH EASIER!! WE KEEP THE SAME NUMBER OF FEATURES! 
# OHE is better used to binarize categories within a feature, turning them into their own individual features
# Attempting this for loop with a OrdinalEncoder doesn't work. 
# Got this error back - ValueError: Expected 2D array, got 1D array instead:
#                       array=['Condominium' 'Other' 'Apartment' ... 'Apartment' 'Condominium' 'Guesthouse']. Reshape your data either using
# array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample
label_encoder = LabelEncoder()

objects = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city']

for obj in objects:
  df_encoded[obj] = label_encoder.fit_transform(df_encoded[obj]).astype(int)

In [42]:
Profile = ProfileReport(df_encoded, explorative=True).to_notebook_iframe()
Profile

Summarize dataset:   0%|          | 0/29 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Want to refactor code to grab all object dtypes and place them in this empty list
# dtype_obj = []

# for obj in df.dtypes:
#     return dtype_obj.append()

In [43]:
# I want to refactor this into a conditional statement
df['cleaning_fee'] = df['cleaning_fee'].astype(int)
df['host_identity_verified'] = df['host_identity_verified'].astype(int)
df['instant_bookable'] = df['instant_bookable'].astype(int)

In [None]:
# df.zipcode.nunique()

In [None]:
# This column is entirely screwed up! I'll have to drop it until I clean it up. Much like the amenities column.
# df['zipcode'] = df['zipcode'].astype(int)

In [44]:
df_encoded.drop(columns='zipcode', inplace=True)
df.drop(columns='zipcode', inplace=True)

In [None]:
df_encoded.head()

In [None]:
list(df_encoded.city)

In [47]:
unique(df_encoded.city)

5
3
1
4
0
2


In [48]:
unique(df_encoded.property_type)

14
10
0
20
16
25
19
5
4
1
11
13
9
29
2
28
6
18
27
24
15
21
30
12
3
23
22
7
8
26
17


In [49]:
unique(df_encoded.bed_type)

4
3
0
1
2


In [50]:
unique(df_encoded.room_type)

0
1
2


In [51]:
unique(df_encoded.cancellation_policy)

0
2
1
3
4


In [52]:
unique(df_encoded.zipcode)

94114
90230
90036
60615
11221
60637
10026
10065
11223
11216
90405
90029
11106
10001.0
11365
90046
11373.0
91604
10038
60622
11206.0
02115
10024
10009
60616
20011
20020
20002
02128
94131
20003
20017
20009
20001
20010
20019
20007
10031
11105
90019
10036
94110
10016
11215
10025
10002
10002.0
10023
90066
90026
11206
11217
02119
10014
10027
10019
90292
94103
10013
90016
11211.0
11249.0
11231.0
91367
10069
11201
94115
60614
10304
02131
11222
10029.0
90291
20037
20016
02135
11218
10003.0
90042
20018
11225
11225.0
60657
11216.0
94134
90039
11233
94127
90211
90404
90021
94117.0
10009.0
02116
02114
02210
20005
10039
91601
94117
90012
02108
90045
90028
02120
11434
90802
11237
11238
90277
94118
90048
11103
20012
90302
11226.0
02134
10011.0
02113
90212
94122
90068
11205
02130
10304.0
20008
90254
90004
90024
11101
90266
91030
10034
94121
11205.0
60647
11207
11230
10017
10010
91101
11693
10128
10018
91607
60618
60605
10162
94115.0
11231
11102
11226
11692
10012
11228.0
11238.0
91106
60611
02125
02122


In [None]:
# Exporting for use in our Model Notebook
df.to_csv('EDA Notebook.csv')

# Scaling our Code with Functions

## WIP
---

Run all code cells prior to this wrangle function, then comment out the code that I pass in to this in order to perserve the history of my EDA before I cleaned it all up with a scalable function.

In [None]:
# def wrangle_clean_data():
#     """
#     Wrangling our AirBnB historical data that has been collected over ____ period for ____ features.
#     Cleans & feature engineers data into a format that will allow for easy FCFF NN modeling.
#     -----
#     Links to source of data: https://www.kaggle.com/rudymizrahi/airbnb-listings-in-major-us-cities-deloitte-ml
#     Parameters
#     -----
#     None
    
#     Returns
#     -----
#     train: pandas dataframe for fitting our model
#            Contains ____
#     test: pandas dataframe for testing against our trained/fitted model
#            Contains ____
#     """
    