# Analysis of Historical NYC Sales Data 2003-2018

### Import required libraries

In [4]:
# Import required analysis libraries
import numpy as np
import pandas as pd
import csv

In [41]:
# Import required visualization libraries
%matplotlib inline
import hvplot.pandas
import plotly.express as px
from panel.interact import interact

### Import borough sales data, pre-processed using SQL

In [150]:
# import data from four boroughs using pandas
boroughs_path = 'four_boroughs.csv'
boroughs_df = pd.read_csv(boroughs_path)

# check several rows of the data frame
boroughs_df.head()

Unnamed: 0,NEIGHBORHOOD,BUILDING CLASS CATEGORY,BLOCK,LOT,ZIPCODE,SALE PRICE,SALE DATE
0,BATHGATE,01 ONE FAMILY HOMES,2907,23,10457.0,0.0,2003-03-24 00:00:00
1,BATHGATE,01 ONE FAMILY HOMES,2917,15,10457.0,130000.0,2003-05-27 00:00:00
2,BATHGATE,01 ONE FAMILY HOMES,3028,25,10457.0,204000.0,2003-04-07 00:00:00
3,BATHGATE,01 ONE FAMILY HOMES,3030,55,10457.0,235000.0,2003-07-24 00:00:00
4,BATHGATE,01 ONE FAMILY HOMES,3035,2,10457.0,125500.0,2003-05-02 00:00:00


### Clean up data and add required columns

In [151]:
# rename SALE DATE column and create new Year column using the first four elements of SALEDATE string
boroughs_df.rename(columns={'SALE DATE':'SALEDATE'}, inplace=True)
boroughs_df['Year'] = boroughs_df.SALEDATE.str[:4]
boroughs_df['Year']=boroughs_df['Year']

# strip whitespace off of NEIGHBORHOOD COLUMN
boroughs_df['NEIGHBORHOOD']=boroughs_df['NEIGHBORHOOD'].str.strip()

# drop irrelevant columns
boroughs_df.drop(columns=['BLOCK','LOT','ZIPCODE','SALEDATE'], inplace=True)

# filter out transfers by removing transactions under $5000
boroughs_sales_df = boroughs_df[boroughs_df['SALE PRICE']>5000]

# check the stats for the remaining dataframe and check several rows of resulting dataframe
print(boroughs_sales_df.describe())
boroughs_sales_df.head()


         SALE PRICE
count  9.461990e+05
mean   1.277961e+06
std    1.459841e+07
min    5.001000e+03
25%    2.850000e+05
50%    4.950000e+05
75%    8.170000e+05
max    4.111112e+09


Unnamed: 0,NEIGHBORHOOD,BUILDING CLASS CATEGORY,SALE PRICE,Year
1,BATHGATE,01 ONE FAMILY HOMES,130000.0,2003
2,BATHGATE,01 ONE FAMILY HOMES,204000.0,2003
3,BATHGATE,01 ONE FAMILY HOMES,235000.0,2003
4,BATHGATE,01 ONE FAMILY HOMES,125500.0,2003
5,BATHGATE,01 ONE FAMILY HOMES,215000.0,2003


### Define New Dataframe with Sales by Neighborhood & Year

In [153]:
# add together all the sales in a particular neighborhood for a particular year
all_neigh_sales = boroughs_sales_df.groupby(['NEIGHBORHOOD','Year']).sum().reset_index()

# check several rows of new dataframe
all_neigh_sales.head()


Unnamed: 0,NEIGHBORHOOD,Year,SALE PRICE
0,3004,2006,681408.0
1,AIRPORT JFK,2006,12177408.0
2,AIRPORT JFK,2016,7800000.0
3,AIRPORT LA GUARDIA,2003,2556890.0
4,AIRPORT LA GUARDIA,2004,3634000.0


### Export Sales Data by Neighborhood & Year for Dashboard

In [162]:
# use 'to_csv' function to export dataframe to csv for use in combined dashboard
all_neigh_sales.to_csv('dash_sales.csv')

## NYC Real Estate Sales by Neighbhorhood (2003-2018)

### Define list of unique neighborhoods

In [154]:
# slice the neighborhood series
all_neighs=all_neigh_sales['NEIGHBORHOOD']

# drop all nulls in neighborhood series
all_neighs.dropna(inplace=True)

# use 'set' function to get a set of unique neighborhoods
# use list function to turn the set into a list, so that it's subscriptable
all_neighborhoods = list(set(all_neighs))

### Plot Real Estate Sales by Neighborhood to observe trends 

In [155]:
# define function to slice dataframe and return plot of an individual neighborhood's sales over time
def all_neigh_sales_plot(Neighborhood):
    
    # slice dataframe by neighborhood
    all_slice = all_neigh_sales[all_neigh_sales['NEIGHBORHOOD'].isin([Neighborhood])]
    
    # return hvplot of sale price vs year
    return all_slice.hvplot.line(
        x='Year',
        y='SALE PRICE',
        title='NYC Real Estate Sales'
        ).opts(xlabel='Year', ylabel='Total Sales',yformatter="%.0f")

# use interact to make an interactive plot where the plot responds to neighborhood dropdown
interact(all_neigh_sales_plot, Neighborhood=all_neighborhoods)

## Rate of Change and Other Metrics by Neighborhood

### Define function to calculate metrics

In [159]:
# define function to calculate metrics based on neighborhood
def neighborhood_stats(Neighborhood):
    
    # slice dataframe per neighborhood
    all_slice = all_neigh_sales[all_neigh_sales['NEIGHBORHOOD'].isin([Neighborhood])]
    
    # calculate rate of change using pct_change function
    returns = all_slice['SALE PRICE'].pct_change()
    
    # calculate avg yearly sales across all years
    present_mean = all_slice['SALE PRICE'].mean()
    
    # calculate avg rate of change across all years
    avg_pct_change = round(returns.mean(),4)
    
    # calculate avg rate of change for the last 3 years
    pct_change_3yr = round(returns[-3:].mean(),4)
    
    # calculate difference between avg change and 3 year change
    comparison_3yr = round(pct_change_3yr - avg_pct_change,4)
    
    # calcualte avg rate of change for the last 5 years
    pct_change_5yr = round(returns[-5:].mean(),4)
    
    # calculate difference between avg change and 5 year change
    comparison_5yr = round(pct_change_5yr - avg_pct_change,4)
    
    # return a list of the yearly avg, avg change, and other metrics
    return [present_mean, avg_pct_change, pct_change_3yr, comparison_3yr, pct_change_5yr, comparison_5yr]


### Generate a dictionary that contains Metrics by Neighborhood

In [160]:
# initialize sales_stats dictionary, later to be converted to a dataframe
sales_stats = {}

# use a for loop to iterate over the list of neighborhoods, defined above
for Neighborhood in manhattan_neighborhoods:
    
    # call the neighborhood_stats function per neighborhood and write results into sales_stats dictionary
    sales_stats[Neighborhood] = neighborhood_stats(Neighborhood)

# check the length of the resulting dictionary
len(sales_stats)

204

### Convert the dictionary into a dataframe

In [158]:
# convert dictionary to dataframe
stats_df = pd.DataFrame(sales_stats)

# define new column names
new_columns = ['avg_yearly_sale','avg_pct_change','pct_change_3yr','comparison_3yr','pct_change_5yr','comparison_5yr']

# transpose the dataframe to neighborhood is in rows
sales_df = stats_df.transpose()

# add new column names
sales_df.columns=new_columns

# drop nulls
sales_df.dropna(inplace=True)

# sort values by the average yearly sales
sales_df.sort_values(['avg_yearly_sale'], ascending=False, inplace=True)

# check the top 50 rows of the dataframe
sales_df.head(50)

Unnamed: 0,avg_yearly_sale,avg_pct_change,pct_change_3yr,comparison_3yr,pct_change_5yr,comparison_5yr
MIDTOWN WEST,6054405000.0,0.3036,0.0389,-0.2647,0.1308,-0.1728
UPPER EAST SIDE (59-79),4235901000.0,0.0831,-0.0774,-0.1605,0.0112,-0.0719
MIDTOWN CBD,3705496000.0,0.2531,-0.0645,-0.3176,-0.0884,-0.3415
UPPER WEST SIDE (59-79),3279761000.0,0.1025,0.0332,-0.0693,0.0963,-0.0062
UPPER EAST SIDE (79-96),2897718000.0,0.0642,-0.0251,-0.0893,0.0421,-0.0221
CHELSEA,2410984000.0,0.23,0.2658,0.0358,0.2615,0.0315
FINANCIAL,1891710000.0,0.4328,-0.1515,-0.5843,-0.1384,-0.5712
MIDTOWN EAST,1811546000.0,0.0513,-0.0668,-0.1181,0.0181,-0.0332
FLATIRON,1720011000.0,0.4997,-0.1332,-0.6329,0.3567,-0.143
FASHION,1656225000.0,0.4381,0.3915,-0.0466,0.2498,-0.1883


### Export dataframe as csv to feed into Joint_Analysis.ipynb

In [None]:
# use 'to_csv' function to export as csv
sales_df.to_csv('sales_stats.csv')