# AirBNB In SanFran

In [1]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
def get_config():
    config_file = open('./data/config_airbnb.tsv')
    config = {}
    for line in config_file:
        config[line.split()[0]] = line.split()[1]
    config_file.close()
    return config

In [3]:
config = get_config()

In [4]:
engine = create_engine("postgresql+psycopg2://{}:{}@{}/{}".format(config['usr'],
                                                                  config['pwd'],
                                                                  config['url'],
                                                                  config['db']))

## Exploratory Data Analysis
Load the full tables _sfo_listings_, _sfo_calendar_, and _sfo_reviews_ in `DataFrame`s and view their structure.

In [5]:
query = '''
SELECT
    *
FROM
    sfo_listings;
'''

In [6]:
sfo_listings = pd.read_sql_query(query, con=engine)
sfo_listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6633 entries, 0 to 6632
Data columns (total 16 columns):
id                                6633 non-null int64
name                              6633 non-null object
host_id                           6633 non-null int64
host_name                         6633 non-null object
neighbourhood_group               0 non-null object
neighbourhood                     6633 non-null object
latitude                          6633 non-null object
longitude                         6633 non-null object
room_type                         6633 non-null object
price                             6633 non-null float64
minimum_nights                    6633 non-null int64
number_of_reviews                 6633 non-null int64
last_review                       5491 non-null object
reviews_per_month                 5491 non-null float64
calculated_host_listings_count    6633 non-null int64
availability_365                  6633 non-null object
dtypes: float64(2),

In [7]:
query = '''
SELECT
    *
FROM
    sfo_calendar;
'''

In [8]:
sfo_calendar = pd.read_sql_query(query, con=engine)
sfo_calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2383085 entries, 0 to 2383084
Data columns (total 4 columns):
listing_id       int64
calender_date    object
available        object
price            object
dtypes: int64(1), object(3)
memory usage: 72.7+ MB


In [9]:
available_types = list(np.unique(sfo_calendar.available.values))
print('The values for sfo_calendar.available are: {}'.format(available_types))

The values for sfo_calendar.available are: ['f', 't']


In [10]:
taken = len(sfo_calendar[sfo_calendar.available == 't'])
free = len(sfo_calendar[sfo_calendar.available == 'f'])
print('The total number of taken properties is {}.'.format(taken))
print('The total number of available properties is {}.'.format(free))

The total number of taken properties is 1037179.
The total number of available properties is 1345906.


In [11]:
query = '''
SELECT
    *
FROM
    sfo_reviews;
'''

In [12]:
sfo_reviews = pd.read_sql_query(query, con=engine)
sfo_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280574 entries, 0 to 280573
Data columns (total 6 columns):
listing_id       280574 non-null int64
id               280574 non-null int64
review_date      280574 non-null object
reviewer_id      280574 non-null int64
reviewer_name    280574 non-null object
comments         280574 non-null object
dtypes: int64(3), object(3)
memory usage: 12.8+ MB


## Question 1:
What's the most expensive listing? What else can you tell me about the listing?

Steps:
1. Find the most expensive listing and additional features of this listing, such as name, neighborhood, price, etc.
2. Gather seasonal information by including months derived from the _sfo_calendar_ table.
3. Restrict query to only include properties that have been booked and the most expensive property.

In [13]:
query = '''
SELECT
    sfo_listings.name,
    sfo_listings.price,
    sfo_listings.neighbourhood,
    sfo_listings.room_type,
    sfo_listings.minimum_nights,
    sfo_listings.number_of_reviews,
    DATE_PART('month', sfo_calendar.calender_date) AS month,
    COUNT(*) num_stays
FROM
    sfo_listings
JOIN
    sfo_calendar
ON
    sfo_listings.id = sfo_calendar.listing_id
WHERE
    (sfo_listings.price = 
        (SELECT
            MAX(price)
          FROM
            sfo_listings)
    AND
        sfo_calendar.available = 't'
    )
GROUP BY
    1, 2, 3, 4, 5, 6, 7
;
'''

In [14]:
df = pd.read_sql_query(query, con=engine)
df.head(n=50)

Unnamed: 0,name,price,neighbourhood,room_type,minimum_nights,number_of_reviews,month,num_stays
0,"Full House Victorian: 7500 SqFt, 4 Floors, Hot...",10000.0,Western Addition,Entire home/apt,2,3,9.0,11
1,"Full House Victorian: 7500 SqFt, 4 Floors, Hot...",10000.0,Western Addition,Entire home/apt,2,3,10.0,4
2,"Full House Victorian: 7500 SqFt, 4 Floors, Hot...",10000.0,Western Addition,Entire home/apt,2,3,11.0,5
3,"Full House Victorian: 7500 SqFt, 4 Floors, Hot...",10000.0,Western Addition,Entire home/apt,2,3,12.0,8


## Question 2: 
What neighborhoods seem to be the most popular?

Steps:
1. Find the booking counts for all of the properties that have been booked.
2. Narrow query by finding the maximum booking count and only returning the row with this maximum value.

In [15]:
query = '''
WITH
    booking_counts
AS (
    SELECT
        sfo_listings.neighbourhood,
        COUNT(*) booking_count
    FROM
        sfo_listings
    JOIN
        sfo_calendar
    ON
        sfo_listings.id = sfo_calendar.listing_id
    WHERE
        sfo_calendar.available = 't'
    GROUP BY
        1
    ORDER BY
        booking_count DESC
)
SELECT
    neighbourhood,
    booking_count
FROM
    booking_counts
WHERE 
    booking_count = 
        (SELECT 
            MAX(booking_count)
         FROM
             booking_counts)
;
'''

In [16]:
df = pd.read_sql_query(query, con=engine)
df.head()

Unnamed: 0,neighbourhood,booking_count
0,Mission,89156


## Question 3a:
What time of year is the cheapest time to go to San Francisco?

Steps:
1. Find the average price per month.
2. Restrict to properties that have been booked.
3. Order by the average price to show cheapest at top.

In [17]:
query = '''
SELECT
    DATE_PART('month', sfo_calendar.calender_date) AS month,
    AVG(sfo_listings.price) mean_price
FROM
    sfo_calendar
JOIN
    sfo_listings
ON
     sfo_calendar.listing_id = sfo_listings.id
WHERE
    sfo_calendar.available = 't'
GROUP BY
    month
ORDER BY
    mean_price
;
'''

In [18]:
df = pd.read_sql_query(query, con=engine)
df.head(n=20)

Unnamed: 0,month,mean_price
0,4.0,197.07995
1,3.0,197.934214
2,1.0,198.431316
3,5.0,198.689699
4,11.0,200.599025
5,12.0,200.759216
6,2.0,200.874781
7,6.0,201.572516
8,8.0,202.472754
9,7.0,203.080884


## Question 3b:
What time of year is the busiest time to go to San Francisco?

Steps:
1. Count the number of bookings by month.
2. Only include properties that have been booked.
3. Order by descending booking count to show busiest time of year at the top.

In [19]:
query = '''
SELECT
    COUNT(*) booking_count,
    DATE_PART('month', sfo_calendar.calender_date) AS month
FROM
    sfo_calendar
JOIN
    sfo_listings
ON
     sfo_calendar.listing_id = sfo_listings.id
WHERE
    sfo_calendar.available = 't'
GROUP BY
    month
ORDER BY
    booking_count DESC, 
    month;
'''

In [20]:
df = pd.read_sql_query(query, con=engine)
df.head(n=12)

Unnamed: 0,booking_count,month
0,96674,1.0
1,94814,11.0
2,92452,12.0
3,91751,2.0
4,82464,3.0
5,80712,5.0
6,76469,8.0
7,76373,4.0
8,75503,7.0
9,73294,6.0
