# Getting the Data Ready
This notebook is for cleaning up data, generate more columns, change data type, etc.
Also playing around with diff type regression


In [121]:
import os
import io
import re
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [87]:
#read data, specify index so it's easier to join and search using loc
path = '../../data/new-york-city-airbnb-open-data/'
listings_csv = os.path.join(path,'selected_columns_listings.csv')
listings = pd.read_csv(listings_csv)

In [88]:
#shape
listings.shape

(50796, 71)

In [118]:
#head
listings.head(1)

Unnamed: 0,id,name,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,city,state,zipcode,market,smart_location,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,2060,Modern NYC,2259,Jenny,2008-08-18,US,a few days or more,22%,50%,f,Washington Heights,0.0,0.0,['reviews'],f,"New York, NY, United States",Manhattan,Washington Heights,New York,NY,10040,New York,"New York, NY",40.85722,-73.9379,t,Other,Private room,2,,1.0,2.0,Real Bed,"{Internet,Wifi}",100.0,,,1,$0.00,1,730,1,1,730,730,1.0,730.0,4 weeks ago,30,60,90,365,1,0,2008-09-22,2008-09-22,80.0,,,,,,,f,flexible,1,0,1,0,0.01,0,0,1,0,0


In [90]:
#to get all columns, set option
pd.set_option('display.max_columns', 107)
#to get text with no truncation
pd.set_option('display.max_colwidth', -1)

In [91]:
#all colums
list(listings.columns.values)

['id',
 'name',
 'host_id',
 'host_name',
 'host_since',
 'host_location',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'price',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'calendar_updated',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'numbe

In [92]:
#convert values of host_verifications to list
host_verifications = listings['host_verifications'].values.tolist()

In [93]:
#host verivication split
#host_verification_ = host_verifications[0:10] -- for testing
host_verification_unique = []

for row in host_verifications:
    row_clean = row.replace('[','').replace(']','')
    methods = row_clean.split(',')
    for method in methods:
        method_clean = method.replace("'","").strip()
        if method_clean not in host_verification_unique:
            host_verification_unique.append(method_clean)

print(host_verification_unique)

['reviews', 'email', 'phone', 'offline_government_id', 'kba', 'selfie', 'government_id', 'identity_manual', 'work_email', 'jumio', 'facebook', 'google', 'manual_online', 'sent_id', 'manual_offline', 'weibo', 'None', 'sesame', 'sesame_offline', 'zhima_selfie', '']


In [94]:
#convert values of amenities to list
amenities = listings['amenities'].values.tolist()

In [95]:
#amenities split
amenities_unique = []

for row in amenities:
    row_string = str(row)
    row_string_clean = row_string.replace('{','').replace('}','')
    amenities_rows = row_string_clean.split(',')
    for row in amenities_rows:
        row_clean = row.replace('"','')
        if row_clean not in amenities_unique:
            amenities_unique.append(row_clean)
            
print(amenities_unique)

['Internet', 'Wifi', 'TV', 'Air conditioning', 'Kitchen', 'Paid parking off premises', 'Free street parking', 'Heating', 'Family/kid friendly', 'Smoke detector', 'Carbon monoxide detector', 'Fire extinguisher', 'Essentials', 'Lock on bedroom door', 'Hair dryer', 'Iron', 'Laptop friendly workspace', 'Self check-in', 'Keypad', 'Private living room', 'Bathtub', 'Hot water', 'Bed linens', 'Extra pillows and blankets', 'Ethernet connection', 'Coffee maker', 'Refrigerator', 'Dishes and silverware', 'Cooking basics', 'Oven', 'Stove', 'Luggage dropoff allowed', 'Long term stays allowed', 'Cleaning before checkout', 'Cable TV', 'Pets allowed', 'Shampoo', '24-hour check-in', 'Hangers', 'Lockbox', 'High chair', 'Stair gates', 'Children’s books and toys', 'Pack ’n Play/travel crib', 'Microwave', 'Buzzer/wireless intercom', 'First aid kit', 'translation missing: en.hosting_amenity_49', 'translation missing: en.hosting_amenity_50', 'Host greets you', 'Elevator', 'Washer', 'Dryer', 'Pets live on this

In [137]:
#cleaning up price
listings['price'] = listings['price'].str.replace(',', '')
listings['price'] = listings['price'].str.replace('$', '')
listings['price'] = listings['price'].astype(float)
listings['price'].head()

In [None]:
#cleaning up extra people
listings['extra_people'] = listings['extra_people'].str.replace(',', '')
listings['extra_people'] = listings['extra_people'].str.replace('$', '')
listings['extra_people'] = listings['extra_people'].astype(float)
listings['extra_people'].head()

In [98]:
listings['reviews_per_month'].isna().sum()

10453

In [147]:
temp = listings.groupby('property_type')['property_type'].count()
temp.head(40)

property_type
Aparthotel                17   
Apartment                 39632
Barn                      2    
Bed and breakfast         51   
Boat                      8    
Boutique hotel            422  
Bungalow                  27   
Bus                       2    
Cabin                     2    
Camper/RV                 15   
Casa particular (Cuba)    3    
Castle                    3    
Cave                      4    
Condominium               1716 
Cottage                   8    
Dome house                2    
Dorm                      1    
Earth house               7    
Farm stay                 1    
Guest suite               425  
Guesthouse                81   
Hostel                    46   
Hotel                     298  
House                     4164 
Houseboat                 3    
In-law                    1    
Island                    2    
Lighthouse                2    
Loft                      1376 
Other                     101  
Resort                    

In [99]:
#review per month
filt = listings['reviews_per_month'].isnull()
listings[['reviews_per_month', 'last_review']][filt].count()

reviews_per_month    0
last_review          0
dtype: int64

In [134]:
#fill with 0
listings['reviews_per_month'] = listings['reviews_per_month'].replace(np.nan, 0)
listings['bathrooms'] = listings['bathrooms'].replace(np.nan, 0)
listings['bedrooms'] = listings['bedrooms'].replace(np.nan, 0)

In [None]:
#hosting since - drop na then create new column based on hosting since
#filt = listings['host_since'].isnull()
#listings[filt]
#listings['host_since'].isnull().sum()
listings = listings.dropna(subset=['host_since'])
listings['host_since'] = pd.to_datetime(listings['host_since'])
listings['today'] = pd.datetime.now()
listings['host_since_calc'] = (listings['today'] - listings['host_since']).dt.days
#see new column
listings['host_since_calc'].head()

In [101]:
# Get one hot encoding of column neighbourhood_group_cleansed
one_hot = pd.get_dummies(listings['neighbourhood_group_cleansed'])
# Drop original column as it is now encoded
listings = listings.drop('neighbourhood_group_cleansed',axis = 1)
# Join the encoded df
listings = listings.join(one_hot)

In [75]:
#test model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#not sure how to use this one
from sklearn.model_selection import cross_val_score

In [139]:
#exclude for now
X = listings[['Bronx', 'Bronx', 'Bronx', 'Bronx', 'Queens', 'Staten Island', 'reviews_per_month', 'host_since_calc', 'bathrooms', 'bedrooms', 'extra_people']]
y = listings['price']

# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize our algorithm
lr = LogisticRegression(random_state=1, solver='liblinear')

# Fit the training data to the model
log_reg = lr.fit(X_train, y_train)

#Score
log_reg.score(X_test, y_test)

0.05551727532237425

In [140]:
from sklearn import linear_model

clf = linear_model.Lasso(alpha=0.1)
log_clf = clf.fit(X_train, y_train)
log_clf.score(X_test, y_test)

0.03301977847634596

In [141]:
from sklearn.linear_model import Ridge

rr = Ridge(alpha=0.01)
log_rr = rr.fit(X_train, y_train) 
log_rr.score(X_test, y_test)


0.03293357452870738