In [110]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [81]:
df = pd.read_csv("./data/synthetic_house_prices_20_years.csv") 
neighborhoods = pd.read_csv("./data/vancouver_neighborhoods_coordinates.csv")

In [82]:
df.fillna(method ='ffill', inplace = True)

  df.fillna(method ='ffill', inplace = True)


In [83]:
latitudes = []
longitudes = []
for i, row in df.iterrows():
    neighb = row['Neighborhood']
    latitudes.append(neighborhoods[neighborhoods['Neighborhood'] == neighb]['Latitude'].iloc[0])
    longitudes.append(neighborhoods[neighborhoods['Neighborhood'] == neighb]['Longitude'].iloc[0])
        
        
df['Neighborhood_Lat'] = latitudes
df['Neighborhood_Long'] = longitudes

In [84]:
# neighborhoods.value_counts()
df.head()

Unnamed: 0,Neighborhood,Year,Season,Property Type,Bedrooms,Bathrooms,Year Built,Renovation Year,Garage Type,Square Footage (House),Square Footage (Land),Basement,Legal Units,Market Price,Neighborhood_Lat,Neighborhood_Long
0,Kitsilano,2004,Spring,Triplex,1,3,1954,2000.0,Double,1308,6742,Not Finished,2,1734587.4,49.2643,-123.168
1,Kitsilano,2004,Summer,Condo,3,1,2022,2000.0,Single,3947,2592,Finished,0,951731.62,49.2643,-123.168
2,Kitsilano,2004,Fall,House,4,1,2006,2000.0,Single,919,9519,Not Finished,2,1782016.8,49.2643,-123.168
3,Kitsilano,2004,Winter,Condo,2,2,1960,2000.0,Triple,2352,1555,Not Finished,2,335768.06,49.2643,-123.168
4,Kitsilano,2005,Spring,Duplex,5,2,1960,1992.0,Triple,2474,7031,Not Finished,0,1571528.42,49.2643,-123.168


In [132]:
def timeline_test_train_split(df):
    df = df.sort_values('Year')
    X = df.drop('Market Price', axis=1)
    y = df['Market Price']

    threshold = int(X.shape[0] * 1)
    X_train = X.iloc[0:threshold]
    X_test = X.iloc[threshold::]
    y_train = y.iloc[0:threshold]
    y_test = y.iloc[threshold::]
    return X_train, X_test, y_train, y_test


# Linear Regression
## Linear Regression - Cleaning

In [133]:
drop_columns = ['Neighborhood']
onehot_cols = ['Season', 'Basement']
ordinal1_col = 'Property Type'
ordinal2_col = 'Garage Type'
ordinal_values1 = ['Condo', 'Duplex', 'Triplex', 'Townhouse', 'House']
ordinal_values2 = ['Single', 'Triple', 'Double']
numeric_cols = X_train.columns.difference(drop_columns + onehot_cols + [ordinal1_col, ordinal2_col] )

In [134]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), onehot_cols),
        ('ord1', OrdinalEncoder(categories=[ordinal_values1], dtype=int), [ordinal1_col]),
        ('ord2', OrdinalEncoder(categories=[ordinal_values2], dtype=int), [ordinal2_col]),
        ('num', StandardScaler(), numeric_cols)
    ],
     remainder='drop'
)

In [120]:
df_lr = df

In [121]:
X_train, X_test, y_train, y_test = timeline_test_train_split(df_lr)

In [122]:
X_train_final = preprocessor.fit_transform(X_train)
X_test_final = preprocessor.transform(X_test)

In [131]:
X_train['Year'].value_counts()
X_test['Year'].value_counts()

Year
2020    176
2021    176
2022    176
2023    176
Name: count, dtype: int64

## Linear Regression - Modelling

In [123]:
regr = LinearRegression()

regr.fit(X_train_final, y_train)
print(regr.coef_)

[-2.01078916e+04  4.24678782e+04  3.72887681e+04 -5.96487547e+04
  1.96682482e+04 -1.96682482e+04  2.92097360e+05 -1.74886603e+04
 -6.41662019e+03 -1.24536458e+02  1.18020834e+04 -2.13487640e+04
  1.31706641e+04  3.11216033e+04 -1.47691974e+04 -4.39794341e+03
  3.61365029e+05 -9.24178590e+03]


In [124]:
print(regr.score(X_test_final, y_test))

0.11193902800337718


## Ridge - Modelling

In [125]:
from sklearn.linear_model import Ridge

In [128]:
ridge = Ridge()
ridge.fit(X_train_final, y_train)
print(ridge.coef_)

[-2.00772326e+04  4.24078744e+04  3.72310384e+04 -5.95616801e+04
  1.96575214e+04 -1.96575214e+04  2.92043410e+05 -1.74793633e+04
 -6.41036676e+03 -1.22228622e+02  1.17960999e+04 -2.13413199e+04
  1.31688218e+04  3.11126885e+04 -1.47647446e+04 -4.39551882e+03
  3.61235610e+05 -9.23676581e+03]


In [127]:
print(ridge.score(X_test_final, y_test))

0.11182989642388153


# EXPORTING MODEL

In [None]:
import joblib

In [None]:
joblib.dump(ridge, "model.pkl") 