In [28]:
# Library imports
import os

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

from sklearn.linear_model import LinearRegression as lm
from sklearn.neighbors import KNeighborsRegressor as knn
from sklearn.tree import DecisionTreeRegressor as dt
from sklearn.ensemble import RandomForestRegressor as rf, GradientBoostingRegressor as gbf
from sklearn.svm import SVR as svr
from sklearn.neural_network import MLPRegressor as mlp

from sklearn.metrics import max_error as me, mean_absolute_error as mae, r2_score, mean_squared_error as mse # ,mean_absolute_percentage_error as mape
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

In [29]:
# Import data set
# The folder holding the CSV defaults to the original author's Downloads directory.
# On another computer, set the HOUSING_DATA_DIR environment variable instead of
# editing this cell.
data_dir = os.environ.get("HOUSING_DATA_DIR", r"C:\Users\6260GHEREJ\Downloads")
path = os.path.join(data_dir, "Consolidated Data.csv")

# Clean data: drop the zip_code column and sort the remaining columns by label.
# Written as one chained expression (no inplace=True) so `data` is always derived
# straight from the file.
data = (
    pd.read_csv(path)
    .drop(columns=['zip_code'])
    .sort_index(axis=1)
)
data.head()

Unnamed: 0,gdp,has_garage,house_age,in_city,median_income_nat,median_income_reg,mi_to_airport,mi_to_grocery_store,mi_to_hospital,mortgage_rate,...,num_of_beds,population,price,recession,snd500,square_foot,unemployment_rate_nat,unemployment_rate_reg,waterfront,year_sold
0,3471828,1,46.1,1,64499.7,105256.26,38.4,7.6,16.4,0.038,...,5,10065,4347351,0,147.38,5575,0.0746,0.0514,1,2012
1,4904994,1,46.1,0,95366.82,123812.43,28.1,5.4,9.0,0.0543,...,6,10944,4187963,0,211.34,12511,0.0557,0.0527,0,2020
2,2156511,1,47.6,1,36898.52,107563.5,39.2,5.1,11.3,0.044,...,8,60224,3850155,1,88.86,6090,0.102,0.0743,0,2007
3,6037751,0,48.5,1,85816.71,100445.78,40.0,4.8,11.1,0.0537,...,8,24720,3488730,0,186.75,9504,0.0563,0.0609,0,2019
4,3937991,1,48.8,1,58384.56,76393.73,46.2,4.3,10.9,0.0416,...,8,12332,2820629,0,148.88,14049,0.0684,0.0577,0,2010


In [30]:
def cust_mape(actual, pred):
    """Mean absolute percentage error, returned as a fraction (0.25 == 25%).

    Observations whose actual value is 0 are left out, because the percentage
    error is undefined for them. Pairs whose error evaluates to NaN are skipped
    as well.

    Values are compared strictly by position. Any pandas index on the inputs is
    ignored, so a Series of actuals and a Series of predictions with different
    indexes are still paired element by element instead of being aligned on
    their labels.

    Parameters
    ----------
    actual : array-like
        Observed values.
    pred : array-like
        Predicted values, same number of elements as ``actual``.

    Returns
    -------
    float
        Mean of ``|actual - pred| / |actual|`` over the usable observations,
        or NaN when there are none.

    Raises
    ------
    ValueError
        If ``actual`` and ``pred`` do not contain the same number of elements.
    """
    act = np.asarray(actual, dtype=float).ravel()
    est = np.asarray(pred, dtype=float).ravel()
    if act.shape != est.shape:
        raise ValueError(
            "actual and pred must have the same length, got %d and %d"
            % (act.size, est.size)
        )

    # Percentage error is undefined where the actual value is zero
    keep = act != 0
    ape = np.abs((act[keep] - est[keep]) / act[keep])

    # Ignore NaN errors rather than letting one poison the mean
    ape = ape[~np.isnan(ape)]
    if ape.size == 0:
        return np.nan
    return ape.mean()

def _regression_stats(actual, pred):
    """Return [Correl, R^2, MAE, RMSE, MAPE, Max Err] for one actual/predicted pair."""
    rho, _pval = pearsonr(actual, pred)
    return [
        rho,                        # Correl
        r2_score(actual, pred),     # R^2
        mae(actual, pred),          # MAE
        mse(actual, pred) ** .5,    # RMSE
        cust_mape(actual, pred),    # MAPE
        me(actual, pred),           # Max Err
    ]


def evaluate_regression(tr_act, tr_pred, te_act = None, te_pred = None):
    """Summarise regression fit quality on the training set and, optionally, a test set.

    Side effect: sets ``pd.options.display.float_format`` so floats are displayed
    with four decimals for the rest of the session.

    Parameters
    ----------
    tr_act, tr_pred : array-like
        Actual and predicted target values for the training set.
    te_act, te_pred : array-like, optional
        Actual and predicted target values for the test set. The 'test' column
        is only produced when ``te_act`` is given.

    Returns
    -------
    pandas.DataFrame
        One row per metric (Correl, R^2, MAE, RMSE, MAPE, Max Err) with a
        'metric' column, a 'train' column and, when test data is supplied,
        a 'test' column.

    Raises
    ------
    ValueError
        If ``te_act`` is supplied without ``te_pred``.
    """
    if te_act is not None and te_pred is None:
        raise ValueError("te_pred is required when te_act is given")

    pd.options.display.float_format = '{:.4f}'.format

    metrics = ['Correl','R^2', 'MAE', 'RMSE', 'MAPE', 'Max Err']

    # Training set
    summary = pd.DataFrame({'metric':metrics,'train':_regression_stats(tr_act, tr_pred)})

    # Test set
    if te_act is not None:
        summary['test'] = _regression_stats(te_act, te_pred)

    return summary

In [31]:
# Column the models will predict
target = 'price'

# Feature columns: everything except the target
cols = data.columns.tolist()
cols.remove(target)

# Independent (X) and dependent (y) variables
X = data[cols]
y = data[target]

# Hold out 30% of the rows as a test set before any modelling
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state = 16, test_size = 0.3)

# Export the training rows (target first, then the features), without the index
train_full = pd.concat([y_tr, x_tr], axis=1)
train_full.to_csv("training.csv", index=False)

In [32]:
# Linear Regression
# No hyperparameters for this model, so instantiate with defaults and train in one step
lm_model = lm().fit(x_tr, y_tr)

# Predictions for the training and test splits
lm_te = lm_model.predict(x_te)
lm_tr = lm_model.predict(x_tr)

# Train vs. test metrics
evaluate_regression(y_tr, lm_tr, y_te, lm_te)

Unnamed: 0,metric,train,test
0,Correl,0.8381,0.8441
1,R^2,0.7023,0.7125
2,MAE,76517.829,75370.0084
3,RMSE,133309.3608,128820.7333
4,MAPE,0.4251,0.4217
5,Max Err,4066995.5179,3089747.8672


In [33]:
# K nearest neighbors
# KNN requires the independent variables to be scaled.
# The scaler is fitted on the training split only, then applied to the test split.
scaler = StandardScaler()
x_tr_sc = scaler.fit_transform(x_tr)
x_te_sc = scaler.transform(x_te)

# Hyperparameters:
#   n_neighbors: integer
#   weights: one of {'uniform', 'distance'}
# Instantiate and train on the scaled training features
knn_model = knn(weights = 'uniform', n_neighbors = 5).fit(x_tr_sc, y_tr)

# Predictions for the training and test splits
knn_te = knn_model.predict(x_te_sc)
knn_tr = knn_model.predict(x_tr_sc)

# Train vs. test metrics
evaluate_regression(y_tr, knn_tr, y_te, knn_te)

Unnamed: 0,metric,train,test
0,Correl,0.9034,0.8532
1,R^2,0.8112,0.7266
2,MAE,54582.728,65884.9062
3,RMSE,106173.6406,125610.5315
4,MAPE,0.2299,0.2802
5,Max Err,3042133.6,2865817.4


In [34]:
# Gradient boosting
# Instantiate
    # Hyperparameters:
        # n_estimators: integer
        # max_depth: integer
        # max_features: 'sqrt', 'log2', an int/float, or None
        # random_state: integer seed. With max_features = 'sqrt' only a random subset
        #               of the features is considered at each split, so without a
        #               seed the metrics below change on every run.
gbf_model = gbf(n_estimators = 100, max_depth = 4, max_features = 'sqrt', random_state = 16)

# Train the model
gbf_model.fit(x_tr, y_tr)

# Generate Predictions
gbf_tr = gbf_model.predict(x_tr)
gbf_te = gbf_model.predict(x_te)

# Evaluate
evaluate_regression(y_tr, gbf_tr, y_te, gbf_te)

Unnamed: 0,metric,train,test
0,Correl,0.9131,0.8943
1,R^2,0.833,0.7998
2,MAE,54963.2484,56625.8174
3,RMSE,99859.4645,107496.0601
4,MAPE,0.2302,0.2343
5,Max Err,1982761.3791,2435410.4275


In [None]:
# Grid Search
# Define model
# random_state is fixed so that repeated runs of the search are reproducible
model = gbf(random_state = 16)

# Define parameter ranges
# 4 x 4 x 3 = 48 combinations, each cross-validated `folds` times below
param_grid = {
              'n_estimators': [100, 150, 200, 250],
              'max_depth': [2,3,4,6],
              'max_features': ['sqrt', 'log2', None]
             }

# Create GridSearchCV object
folds = 5
grid_search = GridSearchCV(model, param_grid, cv=folds)

# Fit the model to the data (training split only)
grid_search.fit(x_tr, y_tr)

# Print the best parameters
best = grid_search.best_params_
print("Best parameters: ", best)

# Generate Predictions with the fitted search object
grid_tr = grid_search.predict(x_tr)
grid_te = grid_search.predict(x_te)

# Evaluate
evaluate_regression(y_tr, grid_tr, y_te, grid_te)

In [None]:
# Test-set actual prices next to the grid-searched model's predictions
results = pd.DataFrame({
    'actual_price': y_te,
    'predicted_price': grid_te
})

# The output folder defaults to the original author's Downloads directory.
# On another computer, set the HOUSING_DATA_DIR environment variable instead of
# editing this cell.
results_dir = os.environ.get("HOUSING_DATA_DIR", r"C:\Users\6260GHEREJ\Downloads")
results.to_csv(os.path.join(results_dir, "best_model_predictions.csv"), index=False)

In [None]:
# NOTE(review): `final` was never assigned anywhere in the visible notebook, so this
# cell raised NameError on Restart & Run All. It is assumed here to be the best
# estimator selected by the grid search. Please confirm, or replace it with a model
# refit on all of X, y if that was the intent.
final = grid_search.best_estimator_

# Predict for every row of the data set (training and test rows alike)
final_preds = final.predict(X)

final_results = pd.DataFrame({
    'actual_price': y,
    'predicted_price': final_preds
})

# The output folder defaults to the original author's Downloads directory.
# On another computer, set the HOUSING_DATA_DIR environment variable instead of
# editing this cell.
final_results_dir = os.environ.get("HOUSING_DATA_DIR", r"C:\Users\6260GHEREJ\Downloads")
final_results.to_csv(os.path.join(final_results_dir, "final_model_predictions.csv"), index=False)