# Create and answer our own hypotheses

In this section, we will create and answer our own hypotheses based on our knowledge of the dataset.

In [92]:
import os
import pandas as pd
from notebooks.get_local_folder import get_local_folder

In [93]:
# Load data
abs_path_to_folder = get_local_folder()
dataset_path = os.path.join(abs_path_to_folder, "../data/cleaned/houses_data.parquet")
houses = pd.read_parquet(dataset_path)
houses_wo = pd.read_parquet(os.path.join(abs_path_to_folder, "../data/without_outliers/houses_without_outliers.parquet"))

## Hypothesis 1: The season when most homes became available for purchase was spring.

#### Answer: Yes, the spring season had 6520 homes available. That may be why the average house price announced in the spring is the highest.

In [94]:
def create_season_column(df: pd.DataFrame):
    """Create a new colunm named "Season", which contains the season based in the "Date" colunm"""
    import pandas as pd
    import numpy as np

    df['Month'] = pd.to_datetime(df['Date']).dt.strftime('%m')
    df['Month'] = df['Month'].astype(np.int64)


    df['Season'] = 'Winter'
    df.loc[(df['Month'] >= 3) & (df['Month'] < 6), 'Season'] = 'Spring'
    df.loc[(df['Month'] >= 6) & (df['Month'] < 9), 'Season'] = 'Summer'
    df.loc[(df['Month'] >= 9) & (df['Month'] < 12), 'Season'] = 'Fall'

    return df.drop(columns="Month")

houses = create_season_column(houses.copy())
houses['Season'].value_counts()

  df['Month'] = pd.to_datetime(df['Date']).dt.strftime('%m')


Summer    6077
Spring    6075
Fall      5317
Winter    4144
Name: Season, dtype: int64

## Hypothesis 2: Houses overlooking the water are more expensive. 

#### Answer: Yes, houses overlooking the sea are 53% more expensive.

In [96]:
waterview_price = houses_wo[['Price','Waterfront']].groupby('Waterfront').mean().reset_index().round(1)

print('Houses with water view are {:.2f}% more expansive.'.format(
( waterview_price.iloc[0,1] * 100 ) / waterview_price.iloc[1,1] ) )

Houses with water view are 53.79% more expansive.


## Hypothesis 3: Only 10% of homes have been renovated.

#### Answer: No, only 4.42% of homes have been renovated.

In [95]:
houses_wo['Renovated'] = 'No' 
houses_wo.loc[ houses_wo['Year Renovated'] != 0, 'Renovated'] = 'Yes'
renovated_count = houses_wo['Renovated'].value_counts()

print('Only {:.2f}% of houses have already been renovated.'.format(
(renovated_count.iloc[1] * 100 ) / renovated_count.iloc[0]))

Only 4.21% of houses have already been renovated.
