In [1]:
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, OneHotEncoder

from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.svm import SVR

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


## Import Dataset

In [16]:
df = pd.read_csv('C:/Users/Gerardo/Documents/Projects/ds4a/datasets/current_final_datset.csv')

cols_to_scale = ['Community_Spending','Unexpected_Housing_Spending','Govt_Direct_Expenditure','MedianIncome','Number_Interest_Groups','Property_Rights']
cols_to_onehot = ['No_Discrimination_Laws','Private_Fair_Housing',
                  'Public_Fair_Housing','Urban_Fair_Housing',
                  'Banned_Discrimination_Public_Housing',
                  'Banned_Discrimination_Private_Housing',
                  'Legislation_Public_Housing','Rent_Control',
                  'State_Aid_Allowed','Federal_Aid_Allowed',
                  'Prohibit_Rent_Control','Metro']
possible_labels = ['FMR0','Rent50','MedianIncome','FMRRentPercentInc','Rent50PercInc',
                   'Income_Adjusted_FMR0','Income_Adjusted_Rent50','Income_Adjusted_HousingPrices',
                   'Affordability_Price_Point','Housing_Prices_Quarter']

label_col = ['Income_Adjusted_FMR0']

cols_to_think_about = ['State','Year','County','Is_FMR0_Affordable','Is_Rent_Affordable']

y_df = df[possible_labels]
X_df = df[cols_to_scale + cols_to_onehot]

print('number of continuous features: ' + str(len(cols_to_scale)))
print('number of categorical features: ' + str(len(cols_to_onehot)))



number of continuous features: 6
number of categorical features: 12


In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
class ExtractColumns(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y = None):
        return self
    
    def transform(self,X):
        return X[self.attribute_names].values

In [19]:
def my_train_test_split(X,y,test_size = .2):
    tot_size = len(X)
    test_size = int(tot_size * test_size)
    

    test_indx = random.sample(range(tot_size),test_size)
    train_indx = np.setdiff1d(range(tot_size),test_indx)
   
    X_train = X.iloc[train_indx,:]
    X_test = X.iloc[test_indx,:]
   
    y_train = y.iloc[train_indx,:]
    y_test = y.iloc[test_indx,:]
    
    return X_train,X_test,y_train,y_test

In [20]:
label_pipeline = Pipeline([
    ('get_cols',ExtractColumns(label_col)),
    ('minmaxer',MinMaxScaler())
])

num_pipeline = Pipeline([
    ('get_cols',ExtractColumns(cols_to_scale)),
    ('scaler',StandardScaler())
])
cat_pipeline = Pipeline([
    ('get_cols',ExtractColumns(cols_to_onehot)),
    ('onehot',OneHotEncoder())
])

my_pipeline = ColumnTransformer([
    ('numerical',num_pipeline,cols_to_scale),
    ('categorical',cat_pipeline,cols_to_onehot),
    ('poly',PolynomialFeatures(degree=2),cols_to_onehot + cols_to_scale)
])



In [24]:
X_train, X_test, y_train, y_test = my_train_test_split(X_df,y_df,test_size = .3) #70% training
X_val, X_test, y_val,y_test = my_train_test_split(X_test,y_test,test_size = .5) #15% validation

y_train = label_pipeline.fit_transform(y_train) #only fit the scaling to the test dataset
X_train = my_pipeline.fit_transform(X_train)

y_test = label_pipeline.transform(y_test) #transform but don't fit the scaling to the test/validation dataset
X_test = my_pipeline.transform(X_test)
y_val = label_pipeline.transform(y_val)
X_val = my_pipeline.transform(X_val)



In [25]:
def train_model(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    
    score = np.sqrt(mean_squared_error(y_test,pred))
    plt.scatter(pred,y_test)
    plt.plot([0,pred.max()],[0,pred.max()],color ='red');
    plt.title('score: ' + str(score))
    
    return score,model

In [26]:
for_reg = RandomForestRegressor(n_estimators = 500, max_depth = 8,min_samples_split = 16)
svr_reg = SVR(kernel='poly',degree=2,C=1E3)
sgd_reg = SGDRegressor(penalty= 'elasticnet',l1_ratio = .5,random_state= 42,learning_rate = 'constant',eta0 = 1E-7)
ridge_reg = Ridge(alpha = .9)

ensemble = VotingRegressor(estimators = [
    ('random_forest',for_reg),
    ('svr',svr_reg),
    ('sgd',sgd_reg),
    ('ridge',ridge_reg)
], verbose = True)

train_model(ensemble,X_train,X_test,y_train,y_test)

  y = column_or_1d(y, warn=True)


[Voting] ............ (1 of 4) Processing random_forest, total= 4.9min
