# 🌰 Base Model with only House data and ZipCodes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor

from helper_funcs import *

In [2]:
RF = get_all()

----pulling Redfin data from Azure storage----
----Done----
----pulling schools data from Azure storage----
----Done----
----merging all data----
----Done----
Shape: (75360, 22)


In [3]:
RF.Prop_Type.value_counts()

Single Family Residential    50143
Condo/Co-op                  16190
Townhouse                     7551
Multi-Family (2-4 Unit)       1227
Ranch                          190
Multi-Family (5+ Unit)          59
Name: Prop_Type, dtype: int64

In [4]:
RF.drop(RF[RF.BEDS.isna() | RF.BATHS.isna()].index, inplace=True)
RF.loc[RF.overallRating.isna(),'overallRating'] = 5

#### For the model to work, we need to incorporate all ZIP codes into both X_train and X_test
* A 0.25 test ratio will be used, so first eliminate all ZIP codes with less than FOUR houses
* Then do train_test_split for every set of ZIPcodes

In [8]:
#pd.set_option('display.max_rows', None)
zip_houses = RF.groupby('zip').agg('count')['PRICE'].to_dict()

In [12]:
RF['houses_perZIP'] = RF.zip.apply(lambda r: zip_houses[r])

In [16]:
RF = RF[RF.houses_perZIP>4]

In [26]:
# Use groupby to split the df into smaller dfs for each ZIP
groups = RF.groupby('zip')
dfs = [groups.get_group(x) for x in groups.groups]

In [85]:
# Use first df to initiate X_train, X_test, etc
for d in dfs[:1]:
    feat = d[['Prop_Type','BEDS','BATHS','SF','Lot_Size','YearBuilt','zip']]
    feat.zip = feat.zip.astype('object')
    y = np.log10(d.PRICE)
    X_train, X_test, y_train, y_test = train_test_split(feat, y)

In [86]:
# train_test_split each df then stack together
for d in dfs[1:]:
    feat = d[['Prop_Type','BEDS','BATHS','SF','Lot_Size','YearBuilt','zip']]
    feat.zip = feat.zip.astype('object')
    y = np.log10(d.PRICE)
    X_trainpiece, X_testpiece, y_trainpiece, y_testpiece = train_test_split(feat, y, test_size=0.25)
    X_train = X_train.append(X_trainpiece)
    X_test = X_test.append(X_testpiece)
    y_train = y_train.append(y_trainpiece)
    y_test = y_test.append(y_testpiece)

In [80]:
# Check rows
print(f'X_train rows: {X_train.shape}')
print(f'X_test rows: {X_test.shape}')

X_train rows: (54839, 7)
X_test rows: (19199, 7)


In [81]:
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

In [82]:
lm = LinearRegression()
lm.fit(X_train, y_train)
print(f'train R2: {lm.score(X_train, y_train)}')
print(f'train R2: {lm.score(X_test, y_test)}')

train R2: 0.7523405437871363
train R2: 0.7332501096474893


## 🐱 Try CatBoost
* Label Encode **Prop_Type & zip**

In [87]:
X_train_le = X_train.copy()
X_test_le = X_test.copy()

le = LabelEncoder()
X_train_le.Prop_Type = le.fit_transform(X_train_le.Prop_Type)
X_test_le.Prop_Type = le.fit_transform(X_test_le.Prop_Type)
X_train_le.zip = le.fit_transform(X_train_le.zip)
X_test_le.zip = le.fit_transform(X_test_le.zip)

In [89]:
cat = CatBoostRegressor(verbose=False)
cat.get_params()

{'loss_function': 'RMSE', 'verbose': False}

In [90]:
cat.fit(X_train_le, y_train)
print(f'Train R2: {cat.score(X_train_le, y_train)}')
print(f'Test R2: {cat.score(X_test_le, y_test)}')

Train R2: 0.8161256670954876
Test R2: 0.7904923768338273
