In [80]:
import pandas as pd
import numpy as np
from scipy import stats

# Notebook configuration: input file, columns to parse as dates, float display format.
path = './datasets/kc_house_data.csv'
date_columns = ['date', 'yr_built', 'yr_renovated']
pd.set_option('display.float_format', '{:,.2f}'.format)


def mydateparser(x):
    """Convert ``x`` to datetime; values that cannot be parsed become NaT."""
    return pd.to_datetime(x, errors='coerce')

In [102]:
# FUNCTION DECLARATIONS
def load_data(path, date_columns, date_parser):
    """Read a CSV file and convert the given columns to datetimes.

    The ``date_parser`` keyword of ``pd.read_csv`` is deprecated, so the date
    columns are read as text and converted after loading instead.

    Args:
        path: File path or file-like object accepted by ``pd.read_csv``.
        date_columns: Labels of the columns to convert.
        date_parser: Callable that takes a Series of strings and returns the
            parsed datetimes (e.g. a wrapper around ``pd.to_datetime``).

    Returns:
        DataFrame with every column in ``date_columns`` replaced by the
        result of ``date_parser``.

    Raises:
        ValueError: If a requested date column is not present in the file.
    """
    date_columns = list(date_columns)
    # Force text dtype: handing integer years (e.g. 1955) straight to
    # pd.to_datetime would interpret them as nanoseconds since the epoch.
    data = pd.read_csv(path, dtype={column: str for column in date_columns})

    missing = [column for column in date_columns if column not in data.columns]
    if missing:
        raise ValueError(f"Missing column provided to 'date_columns': {missing}")

    for column in date_columns:
        data[column] = date_parser(data[column])
    return data

def convert_sqft_to_m2(data):
    """Return a copy of ``data`` with square-foot areas converted to square metres.

    Each ``sqft_<name>`` column is replaced by an ``m2_<name>`` column, and a
    ``price_m2`` column (``price`` divided by ``m2_lot``) is appended. The
    input frame is left untouched.

    Args:
        data: DataFrame containing ``price`` and the columns ``sqft_lot``,
            ``sqft_above``, ``sqft_basement``, ``sqft_living``,
            ``sqft_living15`` and ``sqft_lot15``.

    Returns:
        A new DataFrame with the ``m2_*`` and ``price_m2`` columns and without
        the ``sqft_*`` columns.
    """
    sqft_to_m2 = 0.092903
    # Order matters: it fixes the column order of the resulting frame.
    area_names = ['lot', 'above', 'basement', 'living', 'living15', 'lot15']

    # Work on a copy so the caller's frame does not gain the m2_* columns.
    converted = data.copy()
    for name in area_names:
        converted[f'm2_{name}'] = converted[f'sqft_{name}'] * sqft_to_m2
    converted = converted.drop(columns=[f'sqft_{name}' for name in area_names])

    # Price per square metre of the lot (not of the living area).
    converted['price_m2'] = converted['price'] / converted['m2_lot']
    return converted

def remove_outliers(data, threshold=3.0, columns=None):
    """Drop rows whose z-score exceeds ``threshold`` in any inspected column.

    The previous expression, ``data - data.mean()/data.std()``, only shifted
    every value by the ratio mean/std (operator precedence) and never removed
    a row. The z-score is ``(value - mean) / std`` using the sample standard
    deviation (pandas default, ``ddof=1``).

    Args:
        data: DataFrame to filter.
        threshold: Largest absolute z-score a value may have for its row to
            be kept.
        columns: Optional column labels to inspect. By default every numeric
            column is inspected; pass a subset to keep identifiers or binary
            flags out of the check.

    Returns:
        The rows of ``data`` (all columns, original values) that are not
        outliers in any inspected column.
    """
    if columns is None:
        inspected = data.select_dtypes(include='number')
    else:
        inspected = data[list(columns)]

    z_scores = (inspected - inspected.mean()) / inspected.std()
    # NaN z-scores (constant columns, missing values) compare False here, so
    # they never mark a row as an outlier.
    is_outlier = (z_scores.abs() > threshold).any(axis=1)
    return data.loc[~is_outlier]


In [103]:
# Load the CSV and convert its square-foot columns to square metres in one step.
data = convert_sqft_to_m2(load_data(path, date_columns, mydateparser))

In [106]:
# Report the (rows, columns) shape of the frame.
# NOTE(review): the output recorded under this cell shows two shapes and a
# table, which this single print cannot produce; it looks stale. Re-run to refresh.
print(data.shape)

(21613, 22)
(21613, 19)


Unnamed: 0,id,price,bedrooms,bathrooms,floors,waterfront,view,condition,grade,zipcode,lat,long,m2_lot,m2_above,m2_basement,m2_living,m2_living15,m2_lot15,price_m2
0,7129300518.41,221898.53,-0.62,-1.75,-1.77,-0.09,-0.31,-2.24,0.49,96344.94,-295.72,745.56,524.54,107.47,-0.66,107.36,121.59,524.43,421.80
1,6414100190.41,537998.53,-0.62,-0.50,-0.77,-0.09,-0.31,-2.24,0.49,96291.94,-295.51,745.50,672.44,199.44,36.50,236.50,154.11,709.22,798.69
2,5631500398.41,179998.53,-1.62,-1.75,-1.77,-0.09,-0.31,-2.24,-0.51,96194.94,-295.50,745.59,928.67,69.38,-0.66,69.27,249.80,748.52,192.80
3,2487200873.41,603998.53,0.38,0.25,-1.77,-0.09,-0.31,-0.24,0.49,96302.94,-295.72,745.43,464.15,95.39,83.88,179.83,123.45,464.05,1299.34
4,1954400508.41,509998.53,-0.62,-0.75,-1.77,-0.09,-0.31,-2.24,1.49,96240.94,-295.62,745.78,750.29,153.92,-0.66,153.81,164.33,696.58,678.46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000016.41,359998.53,-0.62,-0.25,0.23,-0.09,-0.31,-2.24,1.49,96269.94,-295.54,745.48,104.71,139.98,-0.66,139.88,139.24,139.72,3425.23
21609,6600060118.41,399998.53,0.38,-0.25,-0.77,-0.09,-0.31,-2.24,1.49,96312.94,-295.73,745.46,539.68,212.45,-0.66,212.34,167.11,668.43,739.73
21610,1523300139.41,402099.53,-1.62,-2.00,-0.77,-0.09,-0.31,-2.24,0.49,96310.94,-295.64,745.52,125.05,92.60,-0.66,92.50,91.86,185.99,3205.11
21611,291310098.41,399998.53,-0.62,-0.25,-0.77,-0.09,-0.31,-2.24,1.49,96193.94,-295.70,745.75,221.49,146.49,-0.66,146.38,128.09,119.10,1802.05


In [78]:
# H1: Properties with a water view are, on average, 30% more expensive.
# h1 is the ratio of the two mean prices; a value of 1.30 would correspond to
# "30% more expensive".
waterfront_mean = data.loc[data['waterfront'] == 1, 'price'].mean()
no_waterfront_mean = data.loc[data['waterfront'] == 0, 'price'].mean()
h1 = waterfront_mean / no_waterfront_mean
h1


3.126391696351833