In [None]:
import numpy as np
import pandas as pd
import os
# Show which files are present in the ../input directory before loading anything.
print(os.listdir("../input"))
from re import sub
from decimal import Decimal

# Grabbing Airbnb Listings Data

We chose specific data points from the listings data that we thought were easy for us to categorize during our data cleaning process, and, intuitively, seemed most relevant to predicting the pricing.

Data points:
- host_is_superhost
- neighbourhood_group_cleansed
- property_type
- room_type
- latitude
- longitude
- guests_included
- bathrooms
- bedrooms
- beds
- bed_type
- amenities
- price
- cleaning_fee
- instant_bookable
- cancellation_policy

In [None]:
# Read the complete listings export, then narrow it to the columns used in this notebook.
listings = pd.read_csv('../input/listings.csv')
ld = listings.loc[:, [
    'host_is_superhost',
    'neighbourhood_group_cleansed',
    'property_type',
    'room_type',
    'latitude',
    'longitude',
    'guests_included',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'amenities',
    'price',
    'cleaning_fee',
    'instant_bookable',
    'cancellation_policy',
]]

In [None]:
# Preview the raw `listings` table (every column), not the reduced `ld` subset.
display(listings.head())

In [None]:
# Non-null count per selected column; columns with a lower count contain missing values.
ld.count()

# Removing all the listings with missing values

In this step, we remove every listing that has a missing value in any selected column except `cleaning_fee`. A missing `cleaning_fee` is treated as "no cleaning fee" and is filled with 0 instead of dropping the listing.

In [None]:
# Every selected column except cleaning_fee must be present for a row to be kept.
required_columns = [
    'host_is_superhost',
    'neighbourhood_group_cleansed',
    'property_type',
    'room_type',
    'latitude',
    'longitude',
    'guests_included',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'amenities',
    'price',
    'instant_bookable',
    'cancellation_policy',
]
ld = ld.dropna(subset=required_columns)
# Re-check the per-column non-null counts after the drop.
ld.count()

In [None]:
# A missing cleaning fee is replaced by 0; no other column is touched.
ld = ld.fillna({'cleaning_fee': 0})

In [None]:
# Spot-check the first cleaning_fee values after missing entries were filled with 0.
ld['cleaning_fee'].head()

In [None]:
# Preview the first rows of the cleaned subset.
ld.head()

## Column 1: host_is_superhost
- Boolean declaring whether host fulfills Airbnb's superhost requirements: https://www.airbnb.ca/help/article/829/how-do-i-become-a-superhost
- Convert `'t'` (true) to `1` and `'f'` (false) to `0`

In [None]:
# Recode the 't'/'f' flags as 1/0; rows holding any other value are left as they are.
for flag, code in (('t', 1), ('f', 0)):
    ld.loc[ld['host_is_superhost'] == flag, 'host_is_superhost'] = code

In [None]:
# Preview: host_is_superhost should now show 1/0 in place of 't'/'f'.
ld.head()

## Column 2: neighbourhood_group_cleansed
- neighborhood category string that states which neighbourhood it belongs to

In [None]:
# All distinct neighbourhood groups present in the remaining rows.
set(ld['neighbourhood_group_cleansed'])

In [None]:
# One-hot encode the neighbourhood group. Names are lower-cased and spaces become
# underscores so they make cleaner column names.
# dtype is pinned: pandas >= 2.0 returns bool dummies by default, which would put
# True/False next to the hand-encoded 0/1 columns in the exported CSV.
neighbourhood = pd.get_dummies(
    ld['neighbourhood_group_cleansed'].str.lower().str.replace(' ', '_', regex=False),
    dtype='uint8',
)

In [None]:
# Preview the neighbourhood indicator columns.
neighbourhood.head()

In [None]:
# Attach the neighbourhood indicator columns (aligned on the row index) and drop the
# string column they replace.
ld_1 = pd.merge(ld, neighbourhood, left_index=True, right_index=True)
# Use `columns=`: passing the axis positionally (`drop(name, 1)`) was removed in pandas 2.0.
ld_1 = ld_1.drop(columns='neighbourhood_group_cleansed')

In [None]:
# Preview the frame with the neighbourhood indicators merged in.
ld_1.head()

 ## Column 3: Property type
 
 - Indicates the property type of the listing (e.g. house, apartment).

In [None]:
# One-hot encode the property type.
# dtype is pinned: pandas >= 2.0 returns bool dummies by default, which would put
# True/False next to the hand-encoded 0/1 columns in the exported CSV.
property_type = pd.get_dummies(ld_1['property_type'], dtype='uint8')

In [None]:
# Preview the property-type indicator columns.
property_type.head()

In [None]:
# Attach the property-type indicator columns (aligned on the row index) and drop the
# string column they replace.
ld_2 = pd.merge(ld_1, property_type, left_index=True, right_index=True)
# Use `columns=`: passing the axis positionally (`drop(name, 1)`) was removed in pandas 2.0.
ld_2 = ld_2.drop(columns='property_type')

In [None]:
# Preview the frame with the property-type indicators merged in.
ld_2.head()

## Column 4: Room type
 
 - Indicates the room type of the listing (e.g. Entire home/apt).

In [None]:
# One-hot encode the room type.
# dtype is pinned: pandas >= 2.0 returns bool dummies by default, which would put
# True/False next to the hand-encoded 0/1 columns in the exported CSV.
room_type = pd.get_dummies(ld_2['room_type'], dtype='uint8')

In [None]:
# Preview the room-type indicator columns.
room_type.head()

In [None]:
# Attach the room-type indicator columns (aligned on the row index) and drop the
# string column they replace.
ld_3 = pd.merge(ld_2, room_type, left_index=True, right_index=True)
# Use `columns=`: passing the axis positionally (`drop(name, 1)`) was removed in pandas 2.0.
ld_3 = ld_3.drop(columns='room_type')

## Column 5: Bed type
 
 - States what kind of bed the listing has (e.g. Real Bed, Futon).

In [None]:
# One-hot encode the bed type.
# dtype is pinned: pandas >= 2.0 returns bool dummies by default, which would put
# True/False next to the hand-encoded 0/1 columns in the exported CSV.
bed_type = pd.get_dummies(ld_3['bed_type'], dtype='uint8')

In [None]:
# Preview the bed-type indicator columns.
bed_type.head()

In [None]:
# Attach the bed-type indicator columns (aligned on the row index) and drop the
# string column they replace.
ld_4 = pd.merge(ld_3, bed_type, left_index=True, right_index=True)
# Use `columns=`: passing the axis positionally (`drop(name, 1)`) was removed in pandas 2.0.
ld_4 = ld_4.drop(columns='bed_type')

## Column 6: Instant Bookable
 
- Boolean declaring whether or not the listing can be instant booked. 
- Convert `'t'` (true) to `1` and `'f'` (false) to `0`

In [None]:
# Recode the 't'/'f' flags as 1/0; rows holding any other value are left as they are.
for flag, code in (('t', 1), ('f', 0)):
    ld_4.loc[ld_4['instant_bookable'] == flag, 'instant_bookable'] = code

In [None]:
# Preview: instant_bookable should now show 1/0 in place of 't'/'f'.
ld_4.head()

## Column 7: Cancellation policy
- Column indicates which standardized cancellation policy the host chose.
- There are three cancellation policies - flexible, moderate and strict 

In [None]:
# One-hot encode the cancellation policy.
# dtype is pinned: pandas >= 2.0 returns bool dummies by default, which would put
# True/False next to the hand-encoded 0/1 columns in the exported CSV.
cancellation = pd.get_dummies(ld_4['cancellation_policy'], dtype='uint8')

In [None]:
# Preview the cancellation-policy indicator columns.
cancellation.head()

In [None]:
# Attach the cancellation-policy indicator columns (aligned on the row index) and drop
# the string column they replace.
ld_5 = pd.merge(ld_4, cancellation, left_index=True, right_index=True)
# Use `columns=`: passing the axis positionally (`drop(name, 1)`) was removed in pandas 2.0.
ld_5 = ld_5.drop(columns='cancellation_policy')

## Column 8: Guests included
- column states the number of guests that can be accommodated by each listing
- we need to normalize the value to 0-1

In [None]:
# Largest guests_included value, i.e. the upper end of the range before scaling.
ld_5['guests_included'].max()

In [None]:
def normalizing(column):
    """Min-max scale ``column`` into the range [0, 1].

    Parameters
    ----------
    column : array-like with ``min``/``max``/``astype`` (e.g. a pandas Series)
        Numeric values to rescale.

    Returns
    -------
    Same kind of object as ``column``, holding ``(column - min) / (max - min)``.
    If every value is identical the range is zero; all zeros are returned in that
    case instead of the NaNs that 0/0 would produce.
    """
    lowest = column.min()
    span = column.max() - lowest
    if span == 0:
        # Constant column: avoid dividing by zero, which would turn the feature into NaN.
        return (column - lowest).astype(float)
    return (column - lowest) / span

In [None]:
# Rescale guests_included to [0, 1]; the column keeps its position in the frame.
ld_5 = ld_5.assign(guests_included=normalizing(ld_5['guests_included']))

In [None]:
# Preview the frame after scaling guests_included.
ld_5.head()

## Columns 9, 10, 11: bathrooms, bedrooms, beds
- column states the number of bathrooms, bedrooms, and beds in each listing
- normalize the value to 0-1

In [None]:
# Rescale bathrooms to [0, 1]; the column keeps its position in the frame.
ld_5 = ld_5.assign(bathrooms=normalizing(ld_5['bathrooms']))

In [None]:
# Rescale bedrooms to [0, 1]; the column keeps its position in the frame.
ld_5 = ld_5.assign(bedrooms=normalizing(ld_5['bedrooms']))

In [None]:
# Rescale beds to [0, 1]; the column keeps its position in the frame.
ld_5 = ld_5.assign(beds=normalizing(ld_5['beds']))

In [None]:
# Preview the frame after scaling bathrooms, bedrooms and beds.
ld_5.head()

## Columns 12, 13: Latitude and Longitude
- column states the longitude and latitude of each listing 
- we map these two values to x, y and z coordinates on a sphere, so that listings that are geographically close to each other are also close to each other in the 3D feature space.
- x = cos(lat) * cos(lon)
- y = cos(lat) * sin(lon), 
- z = sin(lat) 

In [None]:
# Map (latitude, longitude) to Cartesian coordinates on the unit sphere:
#   x = cos(lat) * cos(lon),  y = cos(lat) * sin(lon),  z = sin(lat)
# and min-max scale each axis to [0, 1].
# NumPy's trig functions take radians, so the coordinates are converted first.
# NOTE(review): assumes latitude/longitude are stored in decimal degrees — confirm.
lat_rad = np.radians(ld_5['latitude'])
lon_rad = np.radians(ld_5['longitude'])
x = normalizing(np.cos(lat_rad) * np.cos(lon_rad))
y = normalizing(np.cos(lat_rad) * np.sin(lon_rad))
# z depends on latitude only; it was previously computed from longitude by mistake.
z = normalizing(np.sin(lat_rad))

In [None]:
# Append the three scaled sphere coordinates as new columns, in x, y, z order.
ld_5 = ld_5.assign(location_x=x, location_y=y, location_z=z)

In [None]:
# The raw coordinates are replaced by location_x/y/z, so drop them.
# Use `columns=`: passing the axis positionally (`drop(name, 1)`) was removed in pandas 2.0.
ld_6 = ld_5.drop(columns='latitude')
ld_7 = ld_6.drop(columns='longitude')

In [None]:
# Preview the frame with location_x/y/z in place of latitude/longitude.
ld_7.head()

## Columns 14, 15: price, cleaning fee
- columns contain the average price per night and the cleaning fee for each listing

In [None]:
# Strip "$" and "," from the currency strings, convert to float, then scale to [0, 1].
# Raw string literals: '\$' in a normal string is an invalid escape sequence, which
# newer Python versions warn about. The regex itself is unchanged.
# Non-string entries (the 0 used to fill missing cleaning fees) pass through the
# regex replacement untouched and are converted by astype(float).
ld_7['price'] = ld_7['price'].replace(r'[\$,]', '', regex=True).astype(float)
ld_7['cleaning_fee'] = ld_7['cleaning_fee'].replace(r'[\$,]', '', regex=True).astype(float)
ld_7['price'] = normalizing(ld_7['price'])
ld_7['cleaning_fee'] = normalizing(ld_7['cleaning_fee'])

In [None]:
# Preview: price and cleaning_fee should now be floats in [0, 1].
ld_7.head()

## Column 16: Amenities
- column includes all the amenities for each listing

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn the raw amenities string into a list of normalised tokens per listing:
# lower-case, remove braces and double quotes, spaces -> underscores, split on commas.
# NOTE(review): presumably the raw value looks like '{TV,"Cable TV",Wifi}' — confirm.
# The column is popped from ld_7 up front (ld_7 ends up without it, as before) so the
# mutation is not hidden inside the join() argument.
# regex=False makes the literal replacement explicit instead of relying on a default
# that changed between pandas versions.
amenity_lists = (
    ld_7.pop('amenities')
    .str.lower()
    .str.replace('{', '', regex=False)
    .str.replace('}', '', regex=False)
    .str.replace('"', '', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.split(',')
    # A listing with no amenities ("{}") splits into [''], which would create an
    # indicator column with an empty name; drop empty tokens.
    .apply(lambda tokens: [token for token in tokens if token])
)

# One 0/1 indicator column per distinct amenity, aligned to ld_7's index.
mlb = MultiLabelBinarizer()
final_df = ld_7.join(
    pd.DataFrame(
        mlb.fit_transform(amenity_lists),
        columns=mlb.classes_,
        index=ld_7.index,
    )
)



In [None]:
# Preview the fully encoded frame, including the amenity indicator columns.
final_df.head()

In [None]:
# Export the encoded frame as CSV for later use. The file is written to the current
# working directory; the row index is included because to_csv writes it by default.
final_df.to_csv('aps360_airbnb1.csv')