In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import geopandas
import geopy
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import QuantileTransformer
import sklearn
from sklearn.neighbors import NearestNeighbors

In [2]:
data = pd.read_csv('data/kc_house_data.csv')

In [3]:
data['price_sqft'] = data['price'] / data['sqft_living']
data['price_sqft_transformed'] = data['price'] / data['sqft_living']

data['price']
std = data.price.std()
mean = data.price.mean()
std_15 = mean + (1.5*std)
std_15m = mean - (1.5*std)
# removing outliers
# focusing on data within 1.5 standard deviations from the mean
data = data.loc[(data['price']<std_15) & (data['price']>std_15m)]

In [4]:
drop = ['id','date', 'yr_built', 'bedrooms', 'bathrooms','sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'sqft_living15', 'sqft_lot15']
data = data.drop(columns = drop, axis=1)

In [5]:
data.isnull().sum()

price                     0
sqft_living               0
grade                     0
lat                       0
long                      0
price_sqft                0
price_sqft_transformed    0
dtype: int64

#### Adding park locations

In [6]:
# for entire data scraping process, please see other notebook

# reading the csv file
king_parks = pd.read_csv('data/ParkAddresses_Revised_wLatLong.csv', index_col='ID')

In [7]:
#create function to find distances between all points in DF and return matrix
def find_distance(dataframe):
    dist = sklearn.neighbors.DistanceMetric.get_metric('haversine')
    
    #convert lat and long to radians
    dataframe[['lat_radians','long_radians']] = (np.radians(dataframe.loc[:,['Lat','Long']]))
    
    #create list matrix (results in km)          
    dist_matrix = (dist.pairwise
    (data[['lat_radians_A','long_radians_A']],
     dataframe[['lat_radians','long_radians']])*6371)
                                                 
    #return a matrix DataFrame
    return pd.DataFrame(dist_matrix)

In [8]:
#convert lat and long to radians in housing data
data[['lat_radians_A','long_radians_A']] = (np.radians(data.loc[:,['lat','long']]))

In [9]:
park_matrix = find_distance(king_parks)

In [10]:
#find min distance in each row
park_min_matrix = park_matrix.where(park_matrix.values == park_matrix.min(
    axis=1)[:,None]).drop_duplicates()

In [11]:
#create a new column with only min distance and remove the rest 
park_min_matrix['min_dist_park'] = park_min_matrix[park_min_matrix.columns[0:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1)
nearest_park = park_min_matrix['min_dist_park']

In [12]:
nearest_park

0          2.038307293948517
1         5.6653668000626025
2          1.337990461344532
3          2.448557143643891
4          3.723027946782503
                ...         
20361    0.22072098764999543
20362     5.3240353166635686
20365    0.16836104215013137
20366      5.811244676660208
20368      5.779031635163857
Name: min_dist_park, Length: 19612, dtype: object

In [13]:
data = data.join(nearest_park, how='left')
data['min_dist_park']= data['min_dist_park'].astype('float64')
data.head()

Unnamed: 0,price,sqft_living,grade,lat,long,price_sqft,price_sqft_transformed,lat_radians_A,long_radians_A,min_dist_park
0,221900.0,1180,7,47.5112,-122.257,188.050847,188.050847,0.829227,-2.133787,2.038307
1,538000.0,2570,7,47.721,-122.319,209.338521,209.338521,0.832889,-2.134869,5.665367
2,180000.0,770,6,47.7379,-122.233,233.766234,233.766234,0.833184,-2.133368,1.33799
3,604000.0,1960,7,47.5208,-122.393,308.163265,308.163265,0.829394,-2.136161,2.448557
4,510000.0,1680,8,47.6168,-122.045,303.571429,303.571429,0.83107,-2.130087,3.723028


#### Adding top school locations

In [14]:
# importing school data
# for entire data obtaining process, please see other notebook

# reading the csv file
top_schools_df = pd.read_csv('data/top_schools.csv')
# previewing the DataFrame
top_schools_df.head()

Unnamed: 0,Column1,year,ncessch,school_name,state_name,lea_name,zip_location,latitude,longitude,county_code,school_level,school_type
0,43,2015,530039000000.0,Ardmore Elementary School,Washington,Bellevue School District,98008,47.639591,-122.117467,53033,Primary,Regular school
1,44,2015,530039000000.0,Bellevue High School,Washington,Bellevue School District,98004,47.604668,-122.197234,53033,High,Regular school
2,45,2015,530039000000.0,Bennett Elementary School,Washington,Bellevue School District,98008,47.624703,-122.101974,53033,Primary,Regular school
3,46,2015,530039000000.0,Cherry Crest Elementary School,Washington,Bellevue School District,98005,47.63944,-122.173711,53033,Primary,Regular school
4,47,2015,530039000000.0,Chinook Middle School,Washington,Bellevue School District,98004,47.62785,-122.210751,53033,Middle,Regular school


In [15]:
top_schools_df.drop(columns = 'Column1', axis=1, inplace=True)

In [16]:
import haversine as hs 
#geographic distance calculator
#function that identifies the distance between a point of interest and house 
def distance_to(point_of_interest):
    distance = data[['lat','long']].apply(lambda x: hs.haversine(x.tolist(), point_of_interest), axis=1)
    return distance

In [17]:
top_school_coordinates = []
x = round(top_schools_df.latitude, 2)
y = round(top_schools_df.longitude, 2)
top_school_coordinates = list(zip(x,y))

In [18]:
for i in range(len(top_school_coordinates)):
    data[f'top_school_{i}'] = distance_to(top_school_coordinates[i])

top_school_cols = []
for i in range(len(top_school_coordinates)):
    top_school_cols.append(f'top_school_{i}')
    data['closest_distance_to_top_school'] = data[top_school_cols].min(axis=1)

In [19]:
data.drop(columns = top_school_cols, axis=1, inplace=True)
rad_cols = ['lat_radians_A', 'long_radians_A']
data.drop(columns=rad_cols, axis=1, inplace=True)
data.head()

Unnamed: 0,price,sqft_living,grade,lat,long,price_sqft,price_sqft_transformed,min_dist_park,closest_distance_to_top_school
0,221900.0,1180,7,47.5112,-122.257,188.050847,188.050847,2.038307,0.261867
1,538000.0,2570,7,47.721,-122.319,209.338521,209.338521,5.665367,0.682377
2,180000.0,770,6,47.7379,-122.233,233.766234,233.766234,1.33799,2.003
3,604000.0,1960,7,47.5208,-122.393,308.163265,308.163265,2.448557,1.729431
4,510000.0,1680,8,47.6168,-122.045,303.571429,303.571429,3.723028,1.179255


#### Adding Scientolgy locations

In [20]:
#locations pulled from scientology-seattle.org
church_of_scientology_mission = (47.818100, -122.315430)
church_of_scientology_washington = (47.622380, -122.361020)
church_of_scientology_life_improvement_center = (47.615060, -122.327580)

In [21]:
#function that identifies the distance between a point of interest and house 
def distance_to(point_of_interest):
    distance = data[['lat','long']].apply(lambda x: hs.haversine(x.tolist(), point_of_interest), axis=1)
    return distance

In [22]:
#creating new columns of distances from houses to point of interest
data['distance_to_scientology_m'] = distance_to(church_of_scientology_mission)
data['distance_to_scientology_w'] = distance_to(church_of_scientology_washington)
data['distance_to_scientology_l'] = distance_to(church_of_scientology_life_improvement_center)
data['closest_distance_to_scientology'] = data[['distance_to_scientology_m',
                                                        'distance_to_scientology_w',
                                                        'distance_to_scientology_l']].min(axis=1)

In [23]:
sci_cols = ['distance_to_scientology_m', 'distance_to_scientology_w',
           'distance_to_scientology_l']
data.drop(columns = sci_cols, axis=1, inplace=True)

#### Add great coffee shop locations

def get_keys(path):
    with open(path) as f:
        return json.load(f)

keys = get_keys("/Users/dtunnicliffe/.secret/yelp_api.json")
api_key = keys['api_key']

term = 'coffee'
location = 'King County, WA'
SEARCH_LIMIT = 10
espresso = pd.DataFrame([])
def yelp(term, location, SEARCH_LIMIT):
    global espresso
    url = 'https://api.yelp.com/v3/businesses/search'
    headers = {
    'Authorization': f'Bearer {api_key}',
    }
    url_params = {
    'term': term.replace(' ', '+'),
    'location': location.replace(' ', '+'),
    'limit': SEARCH_LIMIT,
    'sort_by': 'rating'
    }
    response = requests.get(url, headers=headers, params=url_params)
    df_temp = pd.DataFrame.from_dict(response.json()['businesses'])
    espresso = espresso.append(df_temp)
    return espresso

espresso = yelp(term, location, SEARCH_LIMIT)

great_coffee_coordinates = []
x = [round(coordinate['latitude'], 2) for coordinate in espresso['coordinates']]
y = [round(coordinate['longitude'], 2) for coordinate in espresso['coordinates']]
great_coffee_coordinates = list(zip(x,y))

for i in range(len(great_coffee_coordinates)):
    data[f'great_coffee_{i}'] = distance_to(great_coffee_coordinates[i])

great_coffee_cols = []
for i in range(len(great_coffee_coordinates)):
    great_coffee_cols.append(f'great_coffee_{i}')
    data['closest_distance_to_great_coffee'] = data[great_coffee_cols].min(axis=1)

#dropping unnecessary columns
data = data.drop(columns = great_coffee_cols, axis=1)
data.head()

#### Use quantile transformation for all features 

In [26]:
qt = QuantileTransformer(output_distribution='normal')
to_transform= ['sqft_living', 'min_dist_park', 'closest_distance_to_top_school', 'closest_distance_to_scientology', 'price_sqft_transformed']
data[to_transform] = qt.fit_transform(data[to_transform])

In [27]:
data.to_csv('data/ master_data_wo_coffee.csv')

In [None]:
data.shape

In [None]:
sns.distplot(data['price_sqft']);
print('Kurtosis: ' + str(data['price_sqft'].kurt()))
print('Skewness: ' + str(data['price_sqft'].skew()))

In [None]:
sns.distplot(data['closest_distance_to_top_school'], color = 'blue');
sns.distplot(data['closest_distance_to_great_coffee'], color = 'yellow');
sns.distplot(data['closest_distance_to_scientology'], color = 'green');
sns.distplot(data['closest_distance_to_great_coffee'], color = 'orange');
sns.distplot(data['min_dist_park_rv'], color = 'red');

In [None]:
data['min_dist_park_rv']
std = data.min_dist_park_rv.std()
mean = data.min_dist_park_rv.mean()
std_15 = mean + (1.5*std)
std_15m = mean - (1.5*std)
# removing outliers
# focusing on data within 1.5 standard deviations from the mean
data = data.loc[(data['min_dist_park_rv']<std_15) & (data['min_dist_park_rv']>std_15m)]

In [None]:
sns.distplot(data['min_dist_park_rv'], color = 'red');