# House Rocket Project
The aim of the project is to define the best trade opportunities within the House Rocket portfolio. To find what houses should be bought, two approach were taken. First, a general comparison between houses with same condition and same region was done. The second approach takes the effects of the features of the houses to refine the analysis trying to find high proftable opportunities.

In [69]:
import pandas as pd
import numpy as np
import plotly.express as px
import folium
from math import sin, cos, sqrt, atan2, radians
data = pd.read_csv('dataset/kc_house_data.csv')

In [70]:
def remove_outliers(data, column):
        
    # Defining the quartiles
    Q1 = np.quantile(data[column], 0.25, interpolation = 'midpoint')
    # Q2 = np.quantile(data[column], 0.50, interpolation = 'midpoint')
    Q3 = np.quantile(data[column], 0.75, interpolation = 'midpoint') 
    FIQ = Q3 - Q1
    
    # Removing lowers outliers
    data = data.loc[data[column] > ( Q1 - 1.5*FIQ )]

    # Removing highers outliers
    data = data.loc[data[column] < ( Q3 + 1.5*FIQ )]
    
    data.reset_index(inplace = True)
    data.drop(columns = 'index', axis = 1, inplace= True)
    return data

def take_houses(data, house_data,H):
    sample = pd.DataFrame()
    data_aux = pd.DataFrame()
    for dado in house_data:
        data_aux = data.loc[data['id'] == dado[3]]
        sample = pd.concat([sample, data_aux])
        
    print(len(sample), ' houses attend the hypothesis ', H)
    sample.reset_index(inplace = True)
    for i in range(len(sample)):
        for dado in house_data:
            if sample.loc[i, 'id'] == dado[3]:
                sample.loc[i,'median_price'] = dado[2]
                sample.loc[i,'estimate_profit'] = dado[0]
                sample.loc[i,'estimate_profit%'] = '%.1f' %dado[1]
                sample.loc[i,'x%_lower'] = '%.1f' %(100*(1 - (sample.loc[i,'price']/dado[2])))
            
            
    sample.dropna(axis = 0, inplace = True)
    sample.reset_index(inplace = True)
    sample['hypothesis'] = H
    sample.drop(['index'], axis = 1, inplace = True)
    return sample

def show_profit_estimation(data):
    data['estimate_profit%'] = data['estimate_profit%'].astype(float)
    print('Profit estimation')
    print('Minimal estimated profit, %.1f' %data['estimate_profit%'].min(),'%')
    print('Maximum estimated profit, %.1f' %data['estimate_profit%'].max(),'%')
    print('Maximum estimated profit, %.1f' %data['estimate_profit%'].mean(),'%')

def calculate_distance(lat1, lon1, lat2, lon2):
    # approximate radius of earth in m
    R = 6371000.0

    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return distance

data = remove_outliers(data, 'price')

### Data Over view

In [71]:
data['date'] = pd.to_datetime(data['date'])
data['condition'] = data['condition'].astype(str)
print(data.shape)
data.head()

(20467, 21)


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180.0,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170.0,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770.0,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050.0,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680.0,0,1987,0,98074,47.6168,-122.045,1800,7503


#### After outliers removo, there are 20467 houses. The houses has 21 features.

#### Meidan prices per condition

In [72]:
def mediam_price_per_condition(data):
    """
    mostra a mediana dos preços dos imóveis para cada uma das condições.
    """
    
    price_per_condition = data[['price', 'condition']].groupby('condition').median().reset_index()
    price_per_condition.columns=['condition', 'median_price']
    
    fig = px.line(price_per_condition, x = 'condition', y = 'median_price', title = 'Averaged price per condition')
    fig.update_layout(
        font_size = 20,
        title = "Houses price as function of houses condition",
        xaxis_title = 'Houses condition',
        yaxis_title = 'Media price (USD)'            
    )
    fig.show()
mediam_price_per_condition(data)

#### Amount of houses per condition

In [73]:
def houses_per_condition(data):
    condition = data[['id', 'condition']].groupby('condition').count().reset_index()
    condition.columns = ['condition', 'number of houses per condition']
    condition['% of houses'] = 100*condition['number of houses per condition']/20467
    
    fig = px.bar(condition, x = 'condition', y = 'number of houses per condition', title = 'Amount of houses per condition')
    fig.update_layout(
        font_size = 20,
        title = "Number of houses per houses condition",
        xaxis_title = 'Houses condition',
        yaxis_title = 'Number of houses'            
    )
    fig.show()
    print(condition.head())
    
houses_per_condition(data)

  condition  number of houses per condition  % of houses
0         1                              29     0.141692
1         2                             170     0.830605
2         3                           13284    64.904480
3         4                            5423    26.496311
4         5                            1561     7.626912


#### Defining status and new features
##### Status
Defines if a house should be bought or not. One house should be bought if the house price is lower than the median price for houses in the same condition and region.
##### x% lower
Shows the discount, in other words, how much the house's price is lower the than median price for a given condition and region.

In [74]:
def median_price_per_condition_and_zipcode(data):
    data_zip = data[['price', 'condition', 'zipcode']].groupby(['condition', 'zipcode']).median().reset_index()
    data_zip.columns = ['condition', 'zipcode', 'median_price']
    return data_zip
    
def define_status(data, data_zip):
       
    data['median_price'] = 0
    data['status'] = 'Buy'
    data['x% lower'] = 0
    
    for i in range(len(data_zip)):
        data.loc[(data['condition'] == data_zip.loc[i ,'condition']) & (data['zipcode'] == data_zip.loc[i, 'zipcode']), 'median_price'] = data_zip.loc[i, 'median_price']
        data.loc[(data['condition'] == data_zip.loc[i ,'condition']) & (data['price'] > data_zip.loc[i, 'median_price']) & (data['zipcode'] == data_zip.loc[i, 'zipcode']), 'status'] = 'Not Buy'
    
    for i in range(len(data_zip)):
        data.loc[(data['condition'] == data_zip.loc[i ,'condition']) & (data['zipcode'] == data_zip.loc[i, 'zipcode']), 'estimate_profit'] = data_zip.loc[i, 'median_price'] - data['price']
        data.loc[(data['condition'] == data_zip.loc[i ,'condition']) & (data['zipcode'] == data_zip.loc[i, 'zipcode']), 'estimate_profit%'] = 100*(data_zip.loc[i, 'median_price']/data['price'] - 1)
    
    data.loc[(data['price'] < data['median_price']), 'x% lower'] = 100*(1 - data['price']/data['median_price'])
    data['x% lower'] = data['x% lower'].astype(float)
    data.round({'x% lower': 2})
           
define_status(data, median_price_per_condition_and_zipcode(data))
data[['price', 'median_price', 'estimate_profit','estimate_profit%','x% lower', 'status']].head(10)

Unnamed: 0,price,median_price,estimate_profit,estimate_profit%,x% lower,status
0,221900.0,267000.0,45100.0,20.32447,16.891386,Buy
1,538000.0,416000.0,-122000.0,-22.67658,0.0,Not Buy
2,180000.0,449950.0,269950.0,149.972222,59.995555,Buy
3,604000.0,560000.0,-44000.0,-7.284768,0.0,Not Buy
4,510000.0,645000.0,135000.0,26.470588,20.930233,Buy
5,257500.0,275900.0,18400.0,7.145631,6.669083,Buy
6,291850.0,261000.0,-30850.0,-10.570499,0.0,Not Buy
7,229500.0,271875.0,42375.0,18.464052,15.586207,Buy
8,323000.0,350000.0,27000.0,8.359133,7.714286,Buy
9,662500.0,552000.0,-110500.0,-16.679245,0.0,Not Buy


#### Mapping the best opportunities.
Distribuiton of houses with status By.

In [75]:
def best_opportunities(data):
    best = []
    for i in range(len(data)):
        if data.loc[i, 'x% lower'] > 0:
            best.append(data.loc[i, 'x% lower'])

    best_df = pd.DataFrame({'x% lower': best})
    fig = px.histogram(best_df, x = 'x% lower', nbins = 60)
    fig.update_layout(
        font_size = 20,
        title = "Discount distribution.",
        xaxis_title = 'Discount (%)',
        yaxis_title = 'Number of houses'            
    )
    fig.show()
    print(best_df.describe())
  
    for i in range(len(data)):
        if (data.loc[i, 'x% lower'] > 27):
            data.loc[i, 'status'] = 'Buy_SRP'
    
    sample_general = data.loc[data['status'] != 'Not Buy'].reset_index()
    sample_general.drop('index', axis = 1, inplace = True)
    sample_general.sort_values('estimate_profit%', ascending = False)
    return sample_general

define_status(data, median_price_per_condition_and_zipcode(data))
best_opportunities(data).to_csv('dataset/sample_general.csv')

           x% lower
count  10113.000000
mean      18.912551
std       13.557148
min        0.000111
25%        8.104560
50%       16.428571
75%       27.184466
max       79.545455


#### Updating the status
For houses with discounto higher than 27% the status will be updated to Buy_SRP (Strong Recommendation Purchase)

In [76]:
print(str(data.loc[data['status'] == 'Buy_SRP', 'id'].count()), ' houses found for the best opportunities.')
print('In the first approach a general view of the houses was done and all houses with price bellow the median price was considered a trade opportunitie.')
print(' - The min profit for the best opportunitires is: %.0f ' %data.loc[data['status'] == 'Buy_SRP', 'estimate_profit%'].min(),'%')
print(' - The max profit for the best opportunitires is: %.0f ' %data.loc[data['status'] == 'Buy_SRP', 'estimate_profit%'].max(),'%')
print(' - The averaged profit for the best opportunitires is: %.0f ' %data.loc[data['status'] == 'Buy_SRP', 'estimate_profit%'].mean(),'%')

print(str(data.loc[data['status'] == 'Buy', 'id'].count()), ' houses found for the regular opportunities.')
print(' - The min profit for the commum opportunitires is: %.0f' %data.loc[data['status'] == 'Buy', 'estimate_profit%'].min(),'%')
print(' - The max profit for the commum opportunitires is: %.0f' %data.loc[data['status'] == 'Buy', 'estimate_profit%'].max(),'%')
print(' - The averaged profit for the commum opportunitires is: %.0f' %data.loc[data['status'] == 'Buy', 'estimate_profit%'].mean(),'%')

data[data['x% lower'] > 0][['price', 'median_price', 'estimate_profit','estimate_profit%','x% lower', 'status']].head(10)

2566  houses found for the best opportunities.
In the first approach a general view of the houses was done and all houses with price bellow the median price was considered a trade opportunitie.
 - The min profit for the best opportunitires is: 37  %
 - The max profit for the best opportunitires is: 389  %
 - The averaged profit for the best opportunitires is: 65  %
7789  houses found for the regular opportunities.
 - The min profit for the commum opportunitires is: 0 %
 - The max profit for the commum opportunitires is: 37 %
 - The averaged profit for the commum opportunitires is: 15 %


Unnamed: 0,price,median_price,estimate_profit,estimate_profit%,x% lower,status
0,221900.0,267000.0,45100.0,20.32447,16.891386,Buy
2,180000.0,449950.0,269950.0,149.972222,59.995555,Buy_SRP
4,510000.0,645000.0,135000.0,26.470588,20.930233,Buy
5,257500.0,275900.0,18400.0,7.145631,6.669083,Buy
7,229500.0,271875.0,42375.0,18.464052,15.586207,Buy
8,323000.0,350000.0,27000.0,8.359133,7.714286,Buy
10,468000.0,570000.0,102000.0,21.794872,17.894737,Buy
11,310000.0,425000.0,115000.0,37.096774,27.058824,Buy_SRP
12,400000.0,550000.0,150000.0,37.5,27.272727,Buy_SRP
15,395000.0,414500.0,19500.0,4.936709,4.704463,Buy


### Set of Hypothesis
#### H1 -  Houses with waterfront are, in average, 20% more expensive.


In [77]:
# Select data where the houses have waterfront
# Take the averaged price per zipcode and condition
data_wf = data.loc[data['waterfront'] == 1, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_wf.columns = ['condition', 'zipcode', 'median_price']

# Select data where the houses haven't waterfront
# Take the averaged price per zipcode and condition
data_nwf = data.loc[data['waterfront'] == 0, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_nwf.columns = ['condition', 'zipcode', 'median_price']

# Compare the price for houses with the same condition and region
profit = []
for i in range(len(data_wf)):
    for k in range(len(data_nwf)):
        if (data_wf.loc[i,'zipcode'] == data_nwf.loc[k,'zipcode']) & (data_wf.loc[i,'condition'] == data_nwf.loc[k,'condition']):
            profit.append(100*(data_wf.loc[i,'median_price']/data_nwf.loc[k, 'median_price'] - 1))
            
print('%.1f' %(sum(profit)/len(profit)),'%')

96.6 %


##### H1 confirmed
##### Value found 97%

Actilly the prices of houses with waterfront are, in the average, about 97% higher than prices of houses without waterfront.
Lets check price distribution for houses with waterfront'

In [78]:
data_aux = data.loc[data['waterfront'] == 1]
fig = px.histogram(data_aux, x = 'price')
fig.update_layout(
    font_size = 20,
    title = 'Price distribution for houses with waterfront',
    xaxis_title = 'Price (USD)',
    yaxis_title = 'Number of houses'
)
fig.show()

data_aux.reset_index(inplace = True)
data_aux.drop(columns = 'index', inplace = True)   
house_data = []
for i in range(len(data_wf)):
    for k in range(len(data_aux)):
        if (data_wf.loc[i, 'median_price'] > data_aux.loc[k, 'price']) & (data_wf.loc[i,'zipcode'] == data_aux.loc[k,'zipcode']) & (data_wf.loc[i,'condition'] == data_aux.loc[k,'condition']):
            house_data.append([(data_wf.loc[i,'median_price'] - data_aux.loc[k, 'price']),100*(data_wf.loc[i,'median_price']/data_aux.loc[k, 'price'] - 1), data_wf.loc[i, 'median_price'],data_aux.loc[k, 'id']])



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### Profit Estimation for H1

In [79]:
sample_h1 = take_houses(data, house_data,'H1')
show_profit_estimation(sample_h1)

25  houses attend the hypothesis  H1
Profit estimation
Minimal estimated profit, 2.5 %
Maximum estimated profit, 112.1 %
Maximum estimated profit, 24.7 %


#### H2 - Houses with year built lower than 1950 are, in average, 20% cheaper.

In [80]:
# Take the averaged price per zipcode and condition
data_very_old = data.loc[data['yr_built'] <1950, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_very_old.columns = ['condition', 'zipcode', 'median_price']

# Select data where year built >= 1950
# Take the averaged price per zipcode and condition
data_not_too_old = data.loc[data['yr_built'] >= 1950, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_not_too_old.columns = ['condition', 'zipcode', 'median_price']

# Compare the price for houses with the same condition and region
profit = []
for i in range(len(data_very_old)):
    for k in range(len(data_not_too_old)):
        if (data_very_old.loc[i,'zipcode'] == data_not_too_old.loc[k,'zipcode']) & (data_very_old.loc[i,'condition'] == data_not_too_old.loc[k,'condition']):
            profit.append(100*(1 - data_very_old.loc[i, 'median_price']/data_not_too_old.loc[k, 'median_price']))

print('%.1f' %(sum(profit)/len(profit)),'%')

10.2 %


##### H2 refuted
##### Value found: 10%
Houses with year built lower than 1950 are, in average, about 10% cheaper.

 #### H3 - Houses with basement are, in average, 40% more expensive.

In [81]:
# Select houses with basement
# Take the averaged price per zipcode and condition
data_wb = data.loc[data['sqft_basement'] != 0, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_wb.columns = ['condition', 'zipcode', 'median_price']

# Select houses without basement
# Take the averaged price per zipcode and condition
data_nwb = data.loc[data['sqft_basement'] == 0, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_nwb.columns = ['condition', 'zipcode', 'median_price']

profit = []
for i in range(len(data_wb)):
    for k in range(len(data_nwb)):
        if (data_wb.loc[i,'zipcode'] == data_nwb.loc[k,'zipcode']) & (data_wb.loc[i,'condition'] == data_nwb.loc[k,'condition']):
            profit.append(100*(data_wb.loc[i, 'median_price']/data_nwb.loc[k, 'median_price'] - 1))

print('%.1f' %(sum(profit)/len(profit)),'%')

17.3 %


##### H3 refuted
##### Value found: 17%
House with basement are, in averege, about 18% more expensive than houses without basement.

 #### H4 - The YoY price, for year built, increase, in average, 10%.

In [82]:
# Select data grouped by condition, zipcode and year built
data_yoy = data[['condition', 'zipcode','yr_built', 'price']].groupby(['condition', 'zipcode','yr_built']).median().reset_index()
data_yoy.columns = ['condition', 'zipcode','yr_built', 'median_price']

# Sort data by year built
data_yoy.sort_values('yr_built', inplace = True)
data_yoy.reset_index(inplace = True)

# Take the unique values of year built
year_built = data_yoy['yr_built'].unique().tolist()

profit = []
for year in year_built:
    data_aux_1 = data_yoy.loc[(data_yoy['yr_built'] == year)]
    data_aux_1.reset_index(inplace= True)
    if year + 1 in year_built:
        data_aux_2 = data_yoy.loc[(data_yoy['yr_built'] == year + 1)]
        data_aux_2.reset_index(inplace= True)

        for i in range(len(data_aux_1)):
            for k in range(len(data_aux_2)):
                if(data_aux_1.loc[i,'condition'] == data_aux_2.loc[k,'condition']):
                    if(data_aux_1.loc[i,'zipcode'] == data_aux_2.loc[k,'zipcode']):
                        profit.append(100*(data_aux_2.loc[k,'median_price']/data_aux_1.loc[i,'median_price']-1))

print('%.1f' %(sum(profit)/len(profit)),'%')

5.2 %


##### H4 refuted
##### Value found: 5%
The YoY prices increses about 5%, in average.

#### H5 -  House that have more than one bathroom are, in average, 15% more expensive.

In [83]:
# Select houses with one bathroom
# Take the averaged price per zipcode and condition
data_onebathroom = data.loc[data['bathrooms'] <= 1, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_onebathroom.columns = ['condition', 'zipcode', 'median_price']

# Select houses with more than one bathroo
# Take the averaged price per zipcode and condition
data_m_onebathrooms = data.loc[data['bathrooms'] > 1, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_m_onebathrooms.columns = ['condition', 'zipcode', 'median_price']

profit = []
for i in range(len(data_onebathroom)):
    for k in range(len(data_m_onebathrooms)):
        if (data_onebathroom.loc[i,'zipcode'] == data_m_onebathrooms.loc[k,'zipcode']) & (data_onebathroom.loc[i,'condition'] == data_m_onebathrooms.loc[k,'condition']):
            profit.append(100*(data_m_onebathrooms.loc[k, 'median_price']/data_onebathroom.loc[i, 'median_price'] - 1))
print('%.1f' %(sum(profit)/len(profit)),'%')

42.7 %


##### H5 confirmed
##### Value found: 43%
Houses with more than one bathrooms are, in average, about 43% more expensive'. Lets see how many houses there are in the portifolio for each amount of bathrooms.

In [84]:
bathrooms = data['bathrooms'].unique().tolist()
bath_count = data['bathrooms'].value_counts().to_frame().reset_index()
bath_count.columns = ['bathrooms','count']

fig = px.bar(bath_count, x = 'bathrooms', y = 'count')
fig.update_layout(
    font_size = 20,
    title = 'Amount of houses for each number of bathrooms',
    xaxis_title = 'Number of bathrooms',
    yaxis_title = 'Number of houses'
)
fig.show()

The most part of the houses has one or more bathroomsn and less than 2.5 bathrooms.
##### Lest's select huouses that has these amount of bathrooms and attend the hypothesis H5.

In [85]:
data_m_onebathrooms = data.loc[(data['bathrooms'] >= 1) & (data['bathrooms'] <= 2.5), ['bathrooms','condition','zipcode','price']].groupby(['bathrooms', 'condition', 'zipcode']).median().reset_index()
data_m_onebathrooms.columns = ['bathrooms','condition', 'zipcode', 'median_price']
bathrooms = data_m_onebathrooms['bathrooms'].unique().tolist()

house_data.clear()
for bathroom in bathrooms:
    data_aux_1 = data.loc[data['bathrooms'] == bathroom].reset_index() 
    data_aux_2 = data_m_onebathrooms.loc[data_m_onebathrooms['bathrooms'] == bathroom].reset_index()
    for i in range(len(data_aux_2)):
        for k in range(len(data_aux_1)):
            if (data_aux_2.loc[i, 'median_price'] > data_aux_1.loc[k, 'price']) & (data_aux_2.loc[i,'zipcode'] == data_aux_1.loc[k,'zipcode']) & (data_aux_2.loc[i,'condition'] == data_aux_1.loc[k,'condition']):
                house_data.append([(data_aux_2.loc[i,'median_price'] - data_aux_1.loc[k, 'price']),100*(data_aux_2.loc[i, 'median_price']/data_aux_1.loc[k, 'price'] - 1), data_aux_2.loc[i, 'median_price'],data_aux_1.loc[k, 'id']])


#### Profit Estimation for H5

In [86]:
sample_h5 = take_houses(data, house_data,'H5')
show_profit_estimation(sample_h5)

8462  houses attend the hypothesis  H5
Profit estimation
Minimal estimated profit, 0.0 %
Maximum estimated profit, 299.6 %
Maximum estimated profit, 20.6 %


#### H6 - Houses near to water, but without waterfront, are, in average, 20% cheaper than houses with waterfront.

In [87]:
# Select zipcode of houses with waterfront
# Select houses without waterfront
data_wf = data.loc[data['waterfront'] == 1, ['lat', 'long', 'condition', 'zipcode', 'price']]
data_wf.reset_index(inplace = True)
data_wf.drop(columns = 'index', inplace = True)

data_nwf = data.loc[(data['waterfront'] == 0) & (data['zipcode'].isin(data_wf['zipcode'].unique())) & (data['condition'].isin(data_wf['condition'].unique())), ['id','lat', 'long', 'condition', 'zipcode', 'price']]
data_nwf.reset_index(inplace = True)
data_nwf.drop(columns = 'index', inplace = True)

for i in range(len(data_wf)):
    for k in range(len(data_nwf)):
        if(data_wf.loc[i, 'zipcode'] == data_nwf.loc[k, 'zipcode']):
            if (calculate_distance(data_wf.loc[i, 'lat'], data_wf.loc[i, 'long'], data_nwf.loc[k, 'lat'], data_nwf.loc[k, 'long']) > 100):
                data_nwf.loc[k, 'zipcode'] = np.nan
            else:
                print(calculate_distance(data_wf.loc[i, 'lat'], data_wf.loc[i, 'long'], data_nwf.loc[k, 'lat'], data_nwf.loc[k, 'long']))

data_nwf.dropna(axis = 0, inplace= True)
data_nwf.reset_index(inplace = True)
print(data_nwf['id'].count(), ' houses found.')

0  houses found.


##### H6 refuted
##### There aren't houses that attend the hypothesis six.
See below the houses with water front in the map.

In [88]:
waterfront_map = folium.Map(location = [data_wf['lat'].mean(), data_wf['long'].mean()], zoom_start = 10)
for i in range(len(data_wf)):
    coordinate = [data_wf.loc[i, 'lat'], data_wf.loc[i, 'long']]
    marker = folium.map.Marker(
        coordinate,
        # Create an icon as a text label
        icon=folium.Icon(color='white', icon_color = 'green')   
    )
    waterfront_map.add_child(marker)
waterfront_map

#### H7 - Houses with only one floor are, in average, 20% more expensive. Due to people likes houses without laders.

In [89]:
# Select houses with one floor
# Take the averaged price per zipcode and condition
data_one_floor = data.loc[data['floors'] == 1, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_one_floor.columns = ['condition', 'zipcode', 'median_price']

# Select houses with more than one floor
# Take the averaged price per zipcode and condition
data_m_one_floors = data.loc[data['floors'] > 1, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_m_one_floors.columns = ['condition', 'zipcode', 'median_price']

profit = []
avg_price_ratio = pd.DataFrame()
for i in range(len(data_one_floor)):
    for k in range(len(data_m_one_floors)):
        if (data_one_floor.loc[i,'zipcode'] == data_m_one_floors.loc[k,'zipcode']) & (data_one_floor.loc[i,'condition'] == data_m_one_floors.loc[k,'condition']):
            profit.append(100*(data_one_floor.loc[i, 'median_price']/data_m_one_floors.loc[k, 'median_price'] - 1))

print('%.2f' %(sum(profit)/len(profit)),'%')

-15.44 %


##### H7 refuted
##### Value found: -18%
Actilly, houses with more than one floor are more expensive tha houses with only one floor.

#### H8 - Houses price increase with the increase of the livingroom area.

In [90]:
# Plot the houses price accross livingroom area.
data.sort_values('sqft_living', inplace= True)
fig = px.scatter(data, x = 'sqft_living', y = 'price', size = data['price'], color = data['condition'], trendline = 'ols', trendline_scope = 'overall', trendline_color_override = 'red')
fig.update_layout(
    font_size = 20,
    title = 'Price trend as function of living room area',
    xaxis_title = 'Living room arear (sqft)',
    yaxis_title = 'Houses prices (USD)'
)
fig.show()

# results = px.get_trendline_results(fig)
# results = results.iloc[0]['px_fit_results'].summary()
# results

#### H8 confirmed
From the chart we can see houses with good condition, 3 and 4, below to trendline. These houses are good transactions opportunities.

In [91]:
#H8
data_below_trend_line = data.loc[data['price'] < (data['sqft_living']*167.3602 + 146400), ['id', 'sqft_living','condition', 'zipcode', 'price']].reset_index()
house_data = []
conditions = data['condition'].unique().tolist()
zipcodes = data['zipcode'].unique().tolist()
for condition in conditions:
    for zipcode in zipcodes:
        data_aux = data_below_trend_line.loc[(data_below_trend_line['condition'] == condition) & (data_below_trend_line['zipcode'] == zipcode)].reset_index()
        for i in range(len(data_aux)):
            house_data.append([data_aux.loc[i,'sqft_living']*167.3602 + 146400 - data_aux.loc[i, 'price'], 100*((data_aux.loc[i,'sqft_living']*167.3602 + 146400)/data_aux.loc[i, 'price'] - 1),data_aux.loc[i,'sqft_living']*167.3602 + 146400, data_aux.loc[i, 'id']])   

In [92]:
sample_h8 = take_houses(data, house_data, 'H8')
show_profit_estimation(sample_h8)

11105  houses attend the hypothesis  H8
Profit estimation
Minimal estimated profit, 0.0 %
Maximum estimated profit, 257.9 %
Maximum estimated profit, 43.8 %


#### H9 - Houses with year built higher than 2010 are, in average, 30% more expensive.

In [93]:
# Select data with year built higher than 2010.
# Group the data by condition and zipcode 
data_new_houses = data.loc[data['yr_built'] >= 2010, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_new_houses.columns = ['condition', 'zipcode', 'avg_price']

# Select data with year built lower than 2010.
# Group the data by condition and zipcode 
data_old_houses = data.loc[data['yr_built'] < 2010, ['condition', 'zipcode', 'price']].groupby(['condition', 'zipcode']).median().reset_index()
data_old_houses.columns = ['condition', 'zipcode', 'avg_price']

profit = []
for i in range(len(data_new_houses)):
    for k in range(len(data_old_houses)):
        if(data_new_houses.loc[i, 'condition'] ==  data_old_houses.loc[k, 'condition']) & (data_new_houses.loc[i, 'zipcode'] ==  data_old_houses.loc[k, 'zipcode']):
            profit.append( 100*(data_new_houses.loc[i, 'avg_price']/data_old_houses.loc[k, 'avg_price'] - 1) )

print('%.2f' %(sum(profit)/len(profit)),'%')

28.52 %


#### H9 refuted
#### Value found: 28%
Although the hypothesis was refuted, there are opportunities to be considered, once the averaged prices, for houses built after 2010, are about 28% higher than averaged price for houses built before 2010. Lets visualize it.

In [94]:
fig = px.scatter(
    data,
    x = 'yr_built',
    y = 'price',
    size = 'price',
    color = 'condition',
    title = 'House prices along year built'
    ) 
fig.update_layout(
    font_size = 20,
    title = 'Houses prices along the year built',
    xaxis_title = 'Year built',
    yaxis_title = 'Houses prices (USD)'
)
fig.show()

From the chart is possible see that the condition of house built after 2010 are in major 3, but the prices of these houses can be higher than the price of houses built before 2010, even with better condition. Let's estimate de profit buying houses with year built 2010 and higher.

In [95]:
data_zip = data[['price', 'condition', 'zipcode']].groupby(['condition', 'zipcode']).median().reset_index()
data_zip.columns = ['condition', 'zipcode', 'median_price']
houses_2010 =  data.loc[(data['yr_built'] >= 2010), ['id', 'price', 'condition', 'zipcode']].reset_index()
house_data = []
for i in range(len(houses_2010)):
    for k in range(len(data_zip)):
        if (houses_2010.loc[i, 'condition'] == data_zip.loc[k, 'condition']) & (houses_2010.loc[i, 'zipcode'] == data_zip.loc[k, 'zipcode']) & (houses_2010.loc[i, 'price'] < data_zip.loc[k, 'median_price']):
            house_data.append([data_zip.loc[k, 'median_price'] - houses_2010.loc[i, 'price'],100*(data_zip.loc[k, 'median_price']/houses_2010.loc[i, 'price'] - 1), data_zip.loc[k, 'median_price'], houses_2010.loc[i, 'id'] ])   

In [96]:
# Sample of general data analysis
sample_h9 = take_houses(data, house_data, 'H9')
show_profit_estimation(sample_h9)

286  houses attend the hypothesis  H9
Profit estimation
Minimal estimated profit, 0.0 %
Maximum estimated profit, 169.7 %
Maximum estimated profit, 21.1 %


#### H10 - Houses price increase with the increase of the lot area.

In [97]:
data.sort_values('sqft_lot', inplace= True)
fig = px.scatter(data, x = 'sqft_lot', y = 'price', size = 'price', color = 'condition')
fig.update_layout(
    font_size = 20,
    title = 'Houses price as function of lot area',
    xaxis_title = 'Lot area (Sqft)',
    yaxis_title = 'Houses prices (USD)'
)
fig.show()

#### H10 refuted
#### From the chart we can't see the expected results.