# Exploring the Data
This notebook uses the New York City data. Inside Airbnb data appears to use the same format for every city, so the notebook should be reusable for other cities.


In [56]:
import os
import io
import re
import numpy as np;
import pandas as pd;
import matplotlib;
import matplotlib.pyplot as plt;
import seaborn as sns;

import warnings
# Ignore all warnings from here on, which keeps cell output free of warning text.
warnings.filterwarnings('ignore')

In [57]:
# Load the listings export. The 'id' column is used as the index so rows can
# be joined and looked up directly with .loc.
path = '../data/new-york-city-airbnb-open-data/'
listings_csv = os.path.join(path, 'listings.csv')
listings = pd.read_csv(
    listings_csv,
    index_col='id',
)

In [58]:
#shape of the loaded listings frame as (rows, columns)
listings.shape

(50796, 105)

In [59]:
# Display configuration for inspecting the wide listings frame.
# Allow up to 107 columns to be shown (the frame has 105 when loaded).
pd.set_option('display.max_columns', 107)
# Show cell text with no truncation. Newer pandas releases spell "no limit"
# as None and reject the legacy -1 sentinel, while older releases only accept
# an integer here, so try None first and fall back to -1.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [60]:
# Capture the verbose DataFrame.info() report (one row per column, including
# non-null counts) in memory, then persist it to listings_info.txt.
# NOTE(review): newer pandas renames `null_counts` to `show_counts` - confirm
# the pinned pandas version before upgrading.
buffer = io.StringIO()
listings.info(verbose=True, null_counts=True, buf=buffer)
s = buffer.getvalue()

with open('listings_info.txt', mode='w', encoding="utf-8") as f:
    f.write(s)

In [61]:
#clean up exported file from above so it can be imported to dataframe easily
# Once runs of spaces are collapsed, a per-column row of the info report reads
# "<name> <count> non-null <dtype>". Header and footer lines never contain the
# "<count> non-null" pair, so rows are selected by that pattern instead of by
# fixed line numbers (which only hold for a 105-column frame).
# NOTE(review): newer pandas prefixes each row with a positional index, which
# would shift the field positions read downstream - confirm the pandas version.
column_row = re.compile(r' \d+ non-null ')

# Read and write with the same encoding the report was written with.
with open('listings_info.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
with open('listings_info_clean.txt', 'w', encoding='utf-8') as f:
    for line in lines:
        # Collapse space runs so a single space can be used as the delimiter.
        line1 = re.sub(' +', ' ', line)
        if column_row.search(line1):
            f.write(line1)

In [65]:
# Load the cleaned report into a frame so it can be queried later: fields are
# separated by a single space and there is no header row, so the columns get
# integer labels.
listing_columns = pd.read_csv(
    "listings_info_clean.txt",
    sep=' ',
    header=None,
)

In [66]:
# Preview the parsed report: column 0 is the listings column name, column 1
# its non-null count, column 2 the literal 'non-null' and column 3 the dtype.
listing_columns.head()

Unnamed: 0,0,1,2,3
0,listing_url,50796,non-null,object
1,scrape_id,50796,non-null,int64
2,last_scraped,50796,non-null,object
3,name,50779,non-null,object
4,summary,48341,non-null,object


In [67]:
#filter columns with more than 50% blank
# Column 0 of listing_columns holds the listings column name and column 1 its
# non-null count. The cut-off is derived from the actual number of rows rather
# than a fixed 25000, so the same 50% rule applies to a dataset of any size.
min_non_null = 0.5 * len(listings)
filt = (listing_columns[1] < min_non_null)
remove_columns = listing_columns.loc[filt, 0].tolist()
remove_columns

['notes',
 'thumbnail_url',
 'medium_url',
 'xl_picture_url',
 'square_feet',
 'weekly_price',
 'monthly_price',
 'license',
 'jurisdiction_names']

In [68]:
# Drop the sparse columns. Rebinding the name (instead of inplace=True)
# together with errors='ignore' lets this cell be re-run without a KeyError
# once the columns are already gone.
listings = listings.drop(columns=remove_columns, errors='ignore')
#check if columns are removed
listings.shape

(50796, 96)

In [None]:
#show all columns for one listing, looked up by its 'id' index label
listings.loc[702825]

In [None]:
# Numeric view of the listings: every column whose dtype is not 'object'.
listings_int_float = listings.select_dtypes(exclude=['object'])
listings_int_float.head()

In [None]:
# Hand-picked subset of columns for the pair plot: host size, neighbourhood,
# capacity, stay limits, short-term availability and review counts.
# Identifiers, coordinates, the min/max-night variants, 90/365-day
# availability, review scores and calculated host listing counts are left out.
select_columns = [
    'host_total_listings_count',
    'neighbourhood_cleansed',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'guests_included',
    'minimum_nights',
    'maximum_nights',
    'availability_30',
    'availability_60',
    'number_of_reviews',
    'number_of_reviews_ltm',
]
listings_select = listings[select_columns]



In [None]:
# Using the Seaborn Pairplot to get cross plots between different attributes
# of listings_select. The trailing semicolon suppresses the returned object's repr.
sns.pairplot(listings_select);

In [None]:
# Correlation heatmap across every column of listings_int_float, drawn on an
# explicitly created 12x10 figure.
correlations = listings_int_float.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlations, cmap='coolwarm', ax=ax);