# Bootstrapping Historical Sales Price from King County Property Sales

## Load necessary modules

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

## Load necessary data

In [None]:
# Read the King County sales extract. low_memory=False makes pandas parse the
# file in one pass, so each column's dtype is inferred from the whole column
# instead of chunk by chunk.
sales_csv_path = "write_data/king_county_sales.csv"
king_county_sales = pd.read_csv(sales_csv_path, low_memory=False)

# Last expression of the cell: preview the first rows.
king_county_sales.head()

## Preprocess the `documentdate` to transform it from a `string` to a `datetime` object

In [None]:
# Parse the month/day/four-digit-year strings in `documentdate` once, then
# store both the parsed timestamps and the calendar year derived from them.
parsed_dates = pd.to_datetime(king_county_sales["documentdate"], format="%m/%d/%Y")
king_county_sales["clean_date"] = parsed_dates
king_county_sales["year"] = parsed_dates.dt.year

## Subset the 2019 sales prices of properties in two zipcodes: 98122 and 98105

In [None]:
# Row masks: sales recorded in zipcode 98122, and sales dated in 2019.
# NOTE(review): the zipcode is compared against a string; if pandas parsed
# `zipcode` as integers this mask would be all False -- confirm the dtype.
in_98122 = king_county_sales["zipcode"] == "98122"
sold_in_2019 = king_county_sales["year"] == 2019

# Keep only the sale price of the rows matching both conditions.
only_98122 = king_county_sales.loc[in_98122 & sold_in_2019, "saleprice"]

# Last expression of the cell: preview the first five prices.
only_98122[:5]

In [None]:
# Row masks: sales recorded in zipcode 98105, and sales dated in 2019.
# NOTE(review): the zipcode is compared against a string; if pandas parsed
# `zipcode` as integers this mask would be all False -- confirm the dtype.
in_98105 = king_county_sales["zipcode"] == "98105"
sold_in_2019 = king_county_sales["year"] == 2019

# Keep only the sale price of the rows matching both conditions.
only_98105 = king_county_sales.loc[in_98105 & sold_in_2019, "saleprice"]

# Last expression of the cell: preview the first five prices.
only_98105[:5]

## Check the number of observations in both objects

In [None]:
# Number of 2019 sale prices collected for zipcode 98122.
len(only_98122)

In [None]:
# Number of 2019 sale prices collected for zipcode 98105.
len(only_98105)

## Create a function that returns the sample mean of each bootstrapped sample

In [None]:
# Default number of bootstrap resamples used throughout the notebook.
n_bootstraps = 10**4

def bootstrap(data, n_bootstraps=n_bootstraps):
    """Return the sample mean of each bootstrap resample of `data`.

    Every resample draws ``len(data)`` observations from `data` uniformly at
    random with replacement, using NumPy's global random state (so calling
    ``np.random.seed`` beforehand makes the result reproducible).

    Parameters
    ----------
    data : 1-D array-like of numbers
        Observed values to resample (list, NumPy array, pandas Series, ...).
    n_bootstraps : int, optional
        Number of resamples to draw. Defaults to the value the module-level
        ``n_bootstraps`` had when this function was defined.

    Returns
    -------
    numpy.ndarray
        Float array of length `n_bootstraps`; entry ``i`` is the mean of the
        ``i``-th resample.

    Raises
    ------
    ValueError
        If `data` is empty: the mean of zero observations is undefined, and
        silently returning NaNs would hide the problem from later steps.
    """
    # Convert once up front instead of on every np.random.choice call.
    values = np.asarray(data)
    n = len(values)
    if n == 0:
        raise ValueError("cannot bootstrap an empty data set")

    output = np.zeros(n_bootstraps)
    # One resample per iteration keeps memory at O(n) rather than
    # O(n * n_bootstraps).
    for i in range(n_bootstraps):
        output[i] = np.random.choice(values, size=n, replace=True).mean()
    return output

## Use `bootstrap()` on the sales price for each zipcode

In [None]:
# Bootstrap the mean sale price of each zipcode. 98122 is resampled first and
# 98105 second, so the global NumPy random stream is consumed in a fixed order.
only_98122_dist, only_98105_dist = (
    bootstrap(sale_prices, n_bootstraps)
    for sale_prices in (only_98122, only_98105)
)

## Visualize the two bootstrapped distributions

In [None]:
# Overlay the two bootstrap distributions on one set of axes; alpha=.5 keeps
# both histograms visible where they overlap.
fig, ax = plt.subplots()
ax.hist(only_98105_dist, alpha=.5, label="Zipcode 98105")
ax.hist(only_98122_dist, alpha=.5, label="Zipcode 98122")
ax.legend()
# The plotted values are bootstrapped means of `saleprice`, not percentages,
# so the axis is labelled as a mean sale price.
ax.set_xlabel("Bootstrapped mean property sale price")
ax.set_ylabel("Number of bootstrap samples")
plt.show()

## Estimate the probability that the bootstrapped mean sale price in 98122 exceeds the bootstrapped mean sale price in 98105 for 2019

In [None]:
# Pair the i-th bootstrap mean of each zipcode and flag the pairs in which
# 98122 came out higher than 98105.
is_98122_higher = only_98122_dist > only_98105_dist

# Share of the n_bootstraps pairs in which 98122 had the higher mean.
is_98122_higher.sum() / n_bootstraps