In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

from helper_funcs import *

In [None]:
# Load the zip-code level table. get_zipdata is not defined in this notebook;
# presumably it is provided by the `from helper_funcs import *` line — TODO confirm.
zipdata = get_zipdata()
# Show (n_rows, n_columns) as a quick sanity check of the load.
zipdata.shape

In [None]:
# Disabled filter, kept for reference: it would keep only rows whose Population is non-zero.
#zipdata = zipdata.loc[zipdata.Population!=0]

In [None]:
# Number of missing values in each column of the zip-code table.
zipdata.isna().sum()

Check Some correlations

In [None]:
# Pairwise correlations, computed separately for the first 14 columns and for
# the remaining ones so that each heatmap stays readable.
left_cols = zipdata.iloc[:, :14]
right_cols = zipdata.iloc[:, 14:]
corr1, corr2 = left_cols.corr(), right_cols.corr()

# Boolean masks covering the upper triangle (diagonal included): that half
# mirrors the lower one, so it is hidden in the plots.
mask1 = np.triu(np.ones(corr1.shape, dtype=bool))
mask2 = np.triu(np.ones(corr2.shape, dtype=bool))

# One panel per correlation matrix, sharing the same colour map.
heatmap_kwargs = dict(annot=False, cmap='mako_r')
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
sns.heatmap(corr1, mask=mask1, ax=ax[0], **heatmap_kwargs)
sns.heatmap(corr2, mask=mask2, ax=ax[1], **heatmap_kwargs)

Create Population Ratios

### 🎚️ Scale data before clustering

In [None]:
# Zip-code columns that are scaled for clustering, grouped by theme.
# AverageHouseValue is intentionally not part of the list (the original note
# said to remove it). An earlier variant of this cell scaled every column
# except AverageHouseValue and school_rating instead of an explicit list.
population_feats = ['Population', 'HouseholdsPerZipCode', 'Blacks_ratio', 'Hispanics_ratio',
                    'Asians_ratio', 'Indians_ratio', 'Others_ratio', 'Male_ratio']
econ_feats = ['IncomePerHousehold', 'NumberOfBusinesses', 'UE_rate', 'BEA_percap_income']
age_feats = ['over_65_ratio', 'MedianAge']
fred_feats = ['HPI', 'Demand_score', 'Supply_score', 'listviews_vs_US',
              'med_days_on_mkt', 'nielson_rank']
to_use = population_feats + econ_feats + age_feats + fred_feats

# Min-max scale the selected columns and wrap the result back into a DataFrame
# that keeps zipdata's row index and the original column names.
mms = MinMaxScaler()
selected = zipdata[to_use]
data_scaled = pd.DataFrame(mms.fit_transform(selected.values),
                           columns=selected.columns,
                           index=selected.index)

### K-means Clustering of ZipCode data to produce desirability Clusters

In [None]:
# Elbow search: fit K-means for k = 1..14 on the scaled FRED housing-market
# columns and record the inertia of every fit.
fred_cols = ['HPI', 'Demand_score', 'Supply_score', 'listviews_vs_US',
             'med_days_on_mkt', 'nielson_rank']

# Fixed seed: K-means starts from randomly chosen centroids, so without it the
# inertias (and any labels later produced by this estimator) change on every run.
kmeans = KMeans(random_state=44)
inertias = {}
for n in range(1, 15):
    kmeans.set_params(n_clusters=n)
    kmeans.fit(data_scaled[fred_cols])
    inertias[n] = kmeans.inertia_

In [None]:
# Elbow plot: inertia of each K-means fit against the number of clusters tried.
# The dict views are materialised as lists and the axes are labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(list(inertias.keys()), list(inertias.values()), marker='o')
ax.set_xlabel('number of clusters (k)')
ax.set_ylabel('K-means inertia')
ax.set_title('Elbow plot for the K-means fits');

#### Try 5 clusters

In [None]:
# Final clustering: 5 clusters on the scaled FRED housing-market columns.
fred_cols = ['HPI', 'Demand_score', 'Supply_score', 'listviews_vs_US',
             'med_days_on_mkt', 'nielson_rank']

# Pin the seed so the labels written into `zipdata` are the same on every run;
# the downstream models use these labels as a feature.
kmeans.set_params(n_clusters=5, random_state=44)

# labels_ is a plain array in the row order of data_scaled, which was built
# from zipdata with the same index, so positional assignment lines up.
zipdata['fred_cluster'] = kmeans.fit(data_scaled[fred_cols]).labels_

In [None]:
# List the columns currently in zipdata (e.g. to check which cluster columns exist).
zipdata.columns

In [None]:
# Per-cluster summary of the zip-code table.
# NOTE(review): this groups by a column named 'cluster', while the visible
# clustering cell writes 'fred_cluster' — confirm that 'cluster' exists in
# zipdata on a fresh kernel.

# Cluster sizes, counted as non-null Population values per cluster.
print(zipdata.groupby('cluster')['Population'].count())

# Select the columns of interest BEFORE aggregating: this avoids averaging
# every column of the table, and keeps non-numeric columns (which make
# `mean` raise in recent pandas versions) out of the aggregation.
summary_cols = ['AverageHouseValue', 'Population', 'over_65_ratio',
                'IncomePerHousehold', 'school_rating', 'Hosp_count',
                'HPI', 'UE_rate', 'Zillow_HVF', 'NumberOfBusinesses']
(zipdata.groupby('cluster')[summary_cols]
        .mean()
        .round(2)
        .sort_values(by='AverageHouseValue'))

## 🎰 Model with Clusters

In [None]:
# Load the house-level listings. get_redfin_csv is not defined in this notebook;
# presumably it is provided by the `from helper_funcs import *` line — TODO confirm.
RF = get_redfin_csv()
# Print (n_rows, n_columns) of the listings table.
print(RF.shape)

In [None]:
# Frequency of each distinct value in the Detached column.
RF.Detached.value_counts()

#### Merge cluster into House data

In [None]:
# Attach the zip-code level columns to every listing: RF['zip'] is matched
# against the index of zipdata, keeping all listings (left join).
#
# NOTE: this cell is not idempotent. Running it twice merges zipdata a second
# time and pandas suffixes the overlapping columns with _x/_y; re-run the
# cell that loads RF first.
print(f'shape before merge: {RF.shape}')

# validate='many_to_one' makes pandas raise a MergeError when a zip code occurs
# more than once in zipdata's index; without it such duplicates would silently
# multiply listing rows.
RF = RF.merge(zipdata, how='left', left_on='zip', right_index=True,
              validate='many_to_one')

# Treat the cluster label as categorical rather than numeric. Bracket access is
# used because attribute-style assignment is not a safe way to write a column.
# NOTE(review): 'cluster' is not created in the visible cells (only
# 'fred_cluster' is) — confirm where it comes from.
RF['cluster'] = RF['cluster'].astype('object')
print(f'shape after merge: {RF.shape}')

In [None]:
# Box plot of log10(PRICE) for each value of the cluster label.
log_price = np.log10(RF['PRICE'])
sns.boxplot(x=RF['cluster'], y=log_price)

## Try Linear Model and CatBoost🐱
* Label Encode **Prop_Type & zip**

In [103]:
# Feature set for this run: house attributes, four cluster labels, school
# rating and Zillow_HVF.
# Earlier experiments, recorded here instead of as commented-out lists:
#   1. house attributes only
#   2. house attributes + 'cluster'
#   3. house attributes + 'cluster' + 'school_rating'
#   4. house attributes + the raw zip-level columns (no cluster labels)
# NOTE(review): 'pop_cluster', 'econ_cluster' and 'age_cluster' are not created
# in the visible cells (only 'fred_cluster' is) — confirm they exist in RF.
house_feats = ['Prop_Type', 'BEDS', 'BATHS', 'SF', 'Lot_Size', 'YearBuilt', 'zip']
cluster_feats = ['pop_cluster', 'econ_cluster', 'age_cluster', 'fred_cluster']
target_feats = house_feats + cluster_feats + ['school_rating', 'Zillow_HVF']

lm = LinearRegression()
cat = CatBoostRegressor(verbose=False)

# R2 of every repetition, per model and per split.
LM_train_R2, LM_test_R2 = [], []
CAT_train_R2, CAT_test_R2 = [], []

# Five repetitions. get_train_test is defined outside this notebook; judging by
# the names it returns dummy-encoded (_dum) and label-encoded (_le) versions of
# the same split — presumably a new split on every call, TODO confirm.
for run in range(1, 6):
    (X_train_dum, X_test_dum,
     X_train_le, X_test_le,
     y_train, y_test) = get_train_test(RF, target_feats)

    # Linear model is fed the dummy-encoded matrices.
    lm.fit(X_train_dum, y_train)
    LM_train_R2.append(lm.score(X_train_dum, y_train))
    LM_test_R2.append(lm.score(X_test_dum, y_test))

    # CatBoost is fed the label-encoded matrices.
    cat.fit(X_train_le, y_train)
    CAT_train_R2.append(cat.score(X_train_le, y_train))
    CAT_test_R2.append(cat.score(X_test_le, y_test))

    print(run)  # progress counter

# Mean R2 over the five repetitions.
print(f'Linear Model train R2: {np.mean(LM_train_R2)}')
print(f'Linear Model test R2: {np.mean(LM_test_R2)}')
print(f'CatBoost train R2: {np.mean(CAT_train_R2)}')
print(f'CatBoost Model test R2: {np.mean(CAT_test_R2)}')

4
5
Linear Model train R2: 0.7564106237398096
Linear Model test R2: 0.6940527804434616
CatBoost train R2: 0.8662031744466588
CatBoost Model test R2: 0.8394565184933434


## 🏕️ Try RandomForest and parameter tuning... manually

In [None]:
# Random forest with default hyper-parameters; n_jobs=-1 lets scikit-learn
# use all available CPU cores.
randoforest = RandomForestRegressor(n_jobs=-1)
# Display the full parameter dictionary as a reference for the tuning cells below.
randoforest.get_params()

In [None]:
# Baseline: the random forest as currently configured, on house attributes only.
target_feats = ['Prop_Type', 'BEDS', 'BATHS', 'SF', 'Lot_Size', 'YearBuilt', 'zip']

FST_train_R2, FST_test_R2 = [], []

# Five repetitions. get_train_test_LE is defined outside this notebook;
# presumably it returns a label-encoded train/test split — TODO confirm.
for run in range(1, 6):
    X_train, X_test, y_train, y_test = get_train_test_LE(RF, target_feats)
    randoforest.fit(X_train, y_train)
    FST_train_R2.append(randoforest.score(X_train, y_train))
    FST_test_R2.append(randoforest.score(X_test, y_test))
    print(run)  # progress counter

# Mean R2 over the five repetitions.
print(f'RandForest train R2: {np.mean(FST_train_R2)}')
print(f'RandForest test R2: {np.mean(FST_test_R2)}')

In [None]:
# Tuning step: shallow forest (max_depth=5, 50 trees) on house attributes only.
#
# A fresh estimator is built with every non-default parameter spelled out.
# Calling set_params on the shared instance made the result depend on which
# cells had run before (e.g. a min_samples_leaf set by a later tuning cell
# would persist when this cell is re-run).
randoforest = RandomForestRegressor(n_jobs=-1, max_depth=5, n_estimators=50)

target_feats = ['Prop_Type', 'BEDS', 'BATHS', 'SF', 'Lot_Size', 'YearBuilt', 'zip']

FST_train_R2 = []
FST_test_R2 = []

# Five repetitions. get_train_test_LE is defined outside this notebook;
# presumably it returns a label-encoded train/test split — TODO confirm.
for i in range(5):
    X_train, X_test, y_train, y_test = get_train_test_LE(RF, target_feats)
    randoforest.fit(X_train, y_train)
    FST_train_R2.append(randoforest.score(X_train, y_train))
    FST_test_R2.append(randoforest.score(X_test, y_test))
    print(i + 1)  # progress counter

# Mean R2 over the five repetitions.
print(f'RandForest train R2: {np.mean(FST_train_R2)}')
print(f'RandForest test R2: {np.mean(FST_test_R2)}')

In [None]:
# Tuning step: slightly deeper forest (max_depth=8, 50 trees) on house attributes only.
#
# The estimator is rebuilt with explicit parameters instead of mutating the
# shared instance through set_params, so the scores do not depend on which
# other tuning cells were executed earlier in the session.
randoforest = RandomForestRegressor(n_jobs=-1, max_depth=8, n_estimators=50)

target_feats = ['Prop_Type', 'BEDS', 'BATHS', 'SF', 'Lot_Size', 'YearBuilt', 'zip']

FST_train_R2 = []
FST_test_R2 = []

# Five repetitions. get_train_test_LE is defined outside this notebook;
# presumably it returns a label-encoded train/test split — TODO confirm.
for i in range(5):
    X_train, X_test, y_train, y_test = get_train_test_LE(RF, target_feats)
    randoforest.fit(X_train, y_train)
    FST_train_R2.append(randoforest.score(X_train, y_train))
    FST_test_R2.append(randoforest.score(X_test, y_test))
    print(i + 1)  # progress counter

# Mean R2 over the five repetitions.
print(f'RandForest train R2: {np.mean(FST_train_R2)}')
print(f'RandForest test R2: {np.mean(FST_test_R2)}')

In [None]:
# Tuning step: deep forest regularised through the leaf size
# (max_depth=25, min_samples_leaf=10, 50 trees) on house attributes only.
#
# The estimator is rebuilt with explicit parameters rather than changed via
# set_params: on the shared instance min_samples_leaf=10 would stay in effect
# for any tuning cell that is re-run afterwards.
randoforest = RandomForestRegressor(n_jobs=-1, max_depth=25,
                                    min_samples_leaf=10, n_estimators=50)

target_feats = ['Prop_Type', 'BEDS', 'BATHS', 'SF', 'Lot_Size', 'YearBuilt', 'zip']

FST_train_R2 = []
FST_test_R2 = []

# Five repetitions. get_train_test_LE is defined outside this notebook;
# presumably it returns a label-encoded train/test split — TODO confirm.
for i in range(5):
    X_train, X_test, y_train, y_test = get_train_test_LE(RF, target_feats)
    randoforest.fit(X_train, y_train)
    FST_train_R2.append(randoforest.score(X_train, y_train))
    FST_test_R2.append(randoforest.score(X_test, y_test))
    print(i + 1)  # progress counter

# Mean R2 over the five repetitions.
print(f'RandForest train R2: {np.mean(FST_train_R2)}')
print(f'RandForest test R2: {np.mean(FST_test_R2)}')

## Minuscule improvements with cluster information

🦏 Try **Full** Model by directly adding ZipCode Level information & feature select with Lasso

In [None]:
# Columns of the merged house table that are excluded when listing the
# candidate model features.
unwanted = [
    'PRICE', 'Sale_Type', 'Sold_Date', 'ADDRESS', 'CITY', 'State', 'LOCATION',
    'Days_on_Mkt', 'px_perSF', 'HOA_perMonth', 'STATUS', 'LATITUDE', 'LONGITUDE',
    'AverageHouseValue',
]

# Show what is left after dropping them (RF itself is not modified).
RF.drop(columns=unwanted).columns

In [None]:
# "Full" model: house attributes plus the raw zip-code level columns, used
# directly instead of the cluster labels. (Earlier variants: house attributes
# only, and house attributes + 'cluster'.)
house_feats = ['Prop_Type', 'BEDS', 'BATHS', 'SF', 'Lot_Size', 'YearBuilt', 'zip']
zip_level_feats = [
    'Population', 'HouseholdsPerZipCode', 'MedianAge', 'NumberOfBusinesses',
    'over_65_ratio', 'school_rating', 'Hosp_count', 'HPI', 'Demand_score',
    'Supply_score', 'listviews_vs_US', 'med_days_on_mkt', 'nielson_rank',
    'UE_rate', 'Zillow_HVF', 'BEA_percap_income', 'Blacks_ratio',
    'Hispanics_ratio', 'Asians_ratio', 'Indians_ratio', 'Others_ratio',
    'Male_ratio',
]
target_feats = house_feats + zip_level_feats

lm = LinearRegression()
cat = CatBoostRegressor(verbose=False)

# R2 of every repetition, per model and per split.
LM_train_R2, LM_test_R2 = [], []
CAT_train_R2, CAT_test_R2 = [], []

# Five repetitions. The split variables and the fitted `cat` from the LAST
# repetition stay in the namespace and are reused by the Lasso and
# feature-importance cells further down.
# get_train_test is defined outside this notebook; judging by the names it
# returns dummy-encoded (_dum) and label-encoded (_le) versions of the same
# split — TODO confirm.
for run in range(1, 6):
    (X_train_dum, X_test_dum,
     X_train_le, X_test_le,
     y_train, y_test) = get_train_test(RF, target_feats)

    # Linear model is fed the dummy-encoded matrices.
    lm.fit(X_train_dum, y_train)
    LM_train_R2.append(lm.score(X_train_dum, y_train))
    LM_test_R2.append(lm.score(X_test_dum, y_test))

    # CatBoost is fed the label-encoded matrices.
    cat.fit(X_train_le, y_train)
    CAT_train_R2.append(cat.score(X_train_le, y_train))
    CAT_test_R2.append(cat.score(X_test_le, y_test))

    print(run)  # progress counter

# Mean R2 over the five repetitions.
print(f'Linear Model train R2: {np.mean(LM_train_R2)}')
print(f'Linear Model test R2: {np.mean(LM_test_R2)}')
print(f'CatBoost train R2: {np.mean(CAT_train_R2)}')
print(f'CatBoost Model test R2: {np.mean(CAT_test_R2)}')

In [None]:
# Lasso estimator used for the alpha sweep below.
# NOTE: the `normalize` option was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this cell needs an older scikit-learn. The alpha grid used below
# was chosen with normalize=True in effect.
lasso = Lasso(normalize=True, max_iter=10000, random_state=44)
# Display the resulting parameter dictionary.
lasso.get_params()

In [None]:
# Sweep 20 evenly spaced alpha values and record, for each one, the fitted
# coefficients and the train/test R2.
# X_train_dum / X_test_dum / y_train / y_test are whatever the most recently
# executed model cell left in the namespace (its last split).
alphas = np.linspace(0.00001, 0.0005, 20)
coefs_lasso, R2_train, R2_test = [], [], []

for alpha in alphas:
    # set_params returns the estimator itself, so the fit can be chained.
    lasso.set_params(alpha=alpha).fit(X_train_dum, y_train)
    coefs_lasso.append(lasso.coef_)
    R2_train.append(lasso.score(X_train_dum, y_train))
    R2_test.append(lasso.score(X_test_dum, y_test))

In [None]:
# Turn the sweep results into DataFrames indexed by alpha:
# one row of coefficients per alpha (columns = dummy-encoded feature names) ...
coefs_lasso = pd.DataFrame(
    coefs_lasso,
    index=alphas,
    columns=X_train_dum.columns,
)
# ... and the train/test R2 for the same alphas.
R2_lasso = pd.DataFrame(dict(train=R2_train, test=R2_test), index=alphas)

# Peek at the coefficients of the two smallest alphas.
coefs_lasso.head(2)

In [None]:
# Two panels: coefficient paths (left) and train/test R2 (right), both against
# the regularisation strength (scikit-learn's `alpha`, labelled lambda here).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14,6))
fig.suptitle('Lasso Regression results')

# Left panel: one line per feature. The coefficients are drawn as 10**coef —
# presumably because the target is on a log10 scale; TODO confirm.
for _, coef_path in coefs_lasso.items():
    ax1.plot(coefs_lasso.index, 10**coef_path)
ax1.set_xlabel(r'hyperparameter $\lambda$')
ax1.set_ylabel(r'slope values')

# Right panel: one line per split ('train' / 'test').
for split_name, r2_path in R2_lasso.items():
    ax2.plot(R2_lasso.index, r2_path, label=split_name)
ax2.set_xlabel(r'hyperparameter $\lambda$')
ax2.legend(loc=1)

In [None]:
# Feature importances of the fitted CatBoost model, labelled with the
# label-encoded feature names and ordered from most to least important.
feat_importances = (
    pd.DataFrame({'feat_importance': cat.get_feature_importance()},
                 index=X_train_le.columns)
      .sort_values(by='feat_importance', ascending=False)
)

# Horizontal bar chart, drawn on the axes created here.
f, ax = plt.subplots(figsize=(7, 7))
sns.barplot(x='feat_importance', y=feat_importances.index,
            data=feat_importances, color="teal", ax=ax);

In [None]:
# Retrain CatBoost on the `num` most important features, for num = 3 up to
# ALL features, and record the test R2 of each fit.
# Caveat: the same test split is used to pick the feature count, so the best
# score on this curve is an optimistic estimate.
Cat_scores = []
ranked_feats = list(feat_importances.index)

# The upper bound is len(ranked_feats) + 1 so that the last point of the curve
# is the full feature set (the previous bound stopped one feature short).
for num in range(3, len(ranked_feats) + 1):
    print(num)  # progress counter
    top_feats = ranked_feats[:num]
    X_train_new = X_train_le[top_feats]
    X_test_new = X_test_le[top_feats]

    # Use a fresh estimator per subset. Refitting the shared `cat` left it
    # trained on a truncated feature list, which breaks a re-run of the
    # feature-importance cell (importance count != number of columns).
    cat_subset = CatBoostRegressor(verbose=False)
    cat_subset.fit(X_train_new, y_train)
    Cat_scores.append(cat_subset.score(X_test_new, y_test))

In [None]:
# Test R2 against the number of top-ranked features used.
# The sweep that fills Cat_scores starts at 3 features, so the x values start
# at 3 as well; plotting the bare list would label the first point as 0.
n_feats = range(3, 3 + len(Cat_scores))
fig, ax = plt.subplots()
ax.plot(n_feats, Cat_scores, marker='o')
ax.set_xlabel('number of top-ranked features')
ax.set_ylabel('CatBoost test R2')
ax.set_title('CatBoost test score vs. number of features');