# CS 109A Data Science: AirBnB Pricing Predictions - Milestone #3
**Harvard University**<br>
**Fall 2016**<br>
**Instructors: W. Pan, P. Protopapas, K. Rader**<br>
**Due Date:** Wednesday, November 5th, 2016 at 11:59pm

#### By Ayo Opeyemi, Chase Davis, & Dino Rodriguez

### Import Libraries:

In [None]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import mode
from sklearn import linear_model
import matplotlib
import matplotlib.pyplot as plt
from sklearn import discriminant_analysis
from sklearn.decomposition import PCA
from sklearn import preprocessing
from collections import Counter
%matplotlib inline

### Step 1: Import listings, clean data, and extract features
<p> We begin by importing the data. A visual examination reveals missing values across several columns.</p>
* TODO: Clean data and fill in missing values
* TODO: Extract appropriate features

In [None]:
# Load the listings CSV (comma-separated) into a DataFrame
listings = pd.read_csv('listings.csv', sep=',')

# Print the figure label followed by the array of column names
print('Fig 1')
print('')
print(listings.columns.values)

In [None]:
# Preview the first three rows to see what kind of values each column holds
print('Fig 2')
print('')
listings.head(3)

### Step 2: Explore Data

In [None]:
# Parse the price column into floats: drop the first character of each string
# (presumably the '$' sign -- confirm against the data) and remove the commas
prices = listings['price'].map(lambda text: float(text[1:].replace(',', '')))

print('Fig 3')
print('')
print(prices.shape)
prices.head(5)

In [None]:
# Histogram of pricing data
def price_hist(n, titles, ranges):
    """Draw ``n`` vertically stacked, normalised 60-bin histograms and show them.

    Parameters
    ----------
    n : int
        Number of subplots to draw.
    titles : sequence of str
        Title of each subplot; entries 0..n-1 are used.
    ranges : sequence of array-like
        Values to histogram in each subplot; entries 0..n-1 are used.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # squeeze=False always returns a 2-D array of axes.  With the default,
    # plt.subplots(1) returns a bare Axes object and indexing it fails, so
    # the function could not be used for a single histogram.
    fig, axes = plt.subplots(n, squeeze=False, figsize=(8, 7.5))
    for i, axis in enumerate(axes[:, 0]):
        # NOTE(review): `normed` is the legacy spelling of `density`; newer
        # Matplotlib releases accept only `density=True` -- switch when the
        # environment is upgraded.
        axis.hist(ranges[i], 60, normed=1, facecolor='green', alpha=0.75)
        axis.set_title(titles[i])
        axis.set_xlabel("Value")
        axis.set_ylabel("Frequency")
    plt.tight_layout()
    plt.show()

In [None]:
print('Fig 4, 5, 6')
print('')
# Raw strings keep the backslash that escapes '$' in the plot titles without
# relying on Python passing the unrecognised escape '\$' through unchanged.
# Each subset's upper bound matches the range named in its title (the third
# subset previously used `< 250` under a "$0 - $200" title).
price_hist(3,
           [r'Prices Histogram: All Data',
            r'Prices Histogram: \$0 - \$1000',
            r'Prices Histogram: \$0 - \$200'],
           [prices, prices[prices <= 1000], prices[prices <= 200]])

In [None]:
# Variation of method at https://www.mapr.com/blog/predicting-airbnb-listing-prices-scikit-learn-and-apache-spark

# Tally the number of listings in each neighbourhood
nb_counts = Counter(listings.neighbourhood_cleansed)

print('Figure 7')
print('')
print('%s %d' % ('Number of Neighborhoods: ', len(nb_counts)))

# One row per neighbourhood, ordered from fewest to most listings
tdf = pd.DataFrame.from_dict(nb_counts, orient='index')
tdf = tdf.sort_values(by=0)

# Very wide figure so that every neighbourhood gets its own bar
ax = tdf.plot(kind='bar', figsize=(50, 4))
ax.set_ylabel("# of Listings")
ax.set_xlabel("Neighborhood")
ax.set_title("Neighborhoods by House #")
plt.show()

##### Double click the above plot to view suppressed output!

In [None]:
# Count listings per neighbourhood, then keep only neighbourhoods with at
# least 100 listings.  A new Counter is built instead of deleting keys while
# looping over nb_counts.keys(): that only works when keys() returns a list
# copy (Python 2) and raises RuntimeError when it returns a live view.
nb_counts = Counter(listings.neighbourhood_cleansed)
nb_counts = Counter({name: count for name, count in nb_counts.items()
                     if count >= 100})

print('Figure 8')
print('')
print('%s %d' % ('Number of Neighborhoods: ', len(nb_counts)))

# One row per remaining neighbourhood, ordered from fewest to most listings
tdf = pd.DataFrame.from_dict(nb_counts, orient='index').sort_values(by=0)
ax = tdf.plot(kind='bar', figsize=(22, 4))
# The number in the title follows the data instead of a hard-coded 48
ax.set_title("Neighborhoods by House # (Top %d)" % len(nb_counts))
ax.set_xlabel("Neighborhood")
ax.set_ylabel("# of Listings")

plt.show()

In [None]:
# Convert the price strings to floats by dropping the first character
# (presumably '$' -- confirm against the data) and removing the commas.
# The dtype guard makes this cell safe to re-run: once converted, the column
# holds floats and slicing them would raise.
if listings['price'].dtype == object:
    listings['price'] = listings['price'].apply(
        lambda s: float(s[1:].replace(',', '')))

# Mean price per neighbourhood in a single grouped pass.  groupby sorts the
# group keys, so the bars come out in a deterministic order rather than the
# arbitrary order of a set.
mean_price = listings.groupby('neighbourhood_cleansed')['price'].mean()
neighborhoods = mean_price.index.tolist()
p = mean_price.tolist()

data = zip(p, neighborhoods)

# NOTE(review): the preceding figure is also labelled 'Figure 8' -- confirm
# the intended numbering.
print('Figure 8')
print('')

# Very wide figure so that every neighbourhood label stays legible
plt.figure(figsize=(186, 8))
plt.bar(np.arange(len(p)), p, align='center', width=1)
plt.xticks(np.arange(len(p)), neighborhoods, rotation=90)
plt.title("Avg. Price of House in Neighborhood")
# Neighbourhoods run along the x axis and average prices up the y axis
# (the two labels were previously swapped).
plt.xlabel("Neighborhood")
plt.ylabel("Avg. Price")
plt.show()

##### Double click the above plot to view suppressed output!

In [None]:
price = listings['price'].tolist()
price_sum = sum(price)  # not used below; kept in case later cells reference it
price_max = max(price)
# Scale every price by the maximum so the most expensive listing maps to 1.0.
# This equals the earlier "divide by the sum, then by the max of the result"
# up to floating-point rounding, without re-evaluating max() for each element.
norm = [float(value) / price_max for value in price]

print('Figure 9')
print('')

# Plot listings on scatter, coloured by normalised price.  Longitude goes on
# the x axis and latitude on the y axis so the data matches the axis labels
# (latitude was previously plotted on the axis labelled 'Longitude').
plt.figure(figsize=(10, 8))
plt.scatter(listings['longitude'], listings['latitude'], alpha=.5, c=norm,
            label='Listing')
plt.title('New York City - AirBnB Listings')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(loc='best')

plt.colorbar()
plt.show()

In [None]:
print('Figure 10')
print('')

# Pairwise correlations between price and the selected listing columns
correlation_matrix = listings[[
    'price',
    'latitude',
    'longitude',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'square_feet',
    'guests_included',
    'minimum_nights',
    'maximum_nights',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    'number_of_reviews',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'host_listing_count',
]].corr()

correlation_matrix

In [None]:
print('Figure 11.1')
print('')

plt.figure(figsize=(7, 7))

# Pin the colour scale to the full correlation range [-1, 1] so the neutral
# midpoint of the diverging "RdBu" map corresponds to zero correlation.
# Without explicit limits the scale spans only the observed min..max, which
# shifts the midpoint away from zero.
plt.pcolor(correlation_matrix, cmap="RdBu", vmin=-1, vmax=1)
plt.xlabel("predictor number")
plt.ylabel("predictor number")
plt.title("Correlation Heat Map")
plt.colorbar()

plt.show()

In [None]:
print('Figure 11.2')
print('')

# Legend mapping each heat-map index to its predictor name.  The names are
# read from the correlation matrix itself rather than from a second
# hard-coded copy of the column list, so the numbering cannot drift from
# what was plotted (for example if .corr() left out a column).
# `columns=` fixes the display order: number first, then name.
key = pd.DataFrame(
    {"Predictor Number": list(range(len(correlation_matrix.columns))),
     "Name": list(correlation_matrix.columns)},
    columns=["Predictor Number", "Name"])

key