In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Train Validation Split


In [4]:
housing = pd.read_csv('ds_housing.csv')
# housing.head()
# housing = housing.dropna()

In [9]:
housing.head()

Unnamed: 0,neighborhood,type,units,year_built,sq_ft,income,income_per_sq_ft,expense,expense_per_sq_ft,net_income,value,value_per_sq_ft,boro
0,FINANCIAL,R9-CONDOMINIUM,42,1920.0,36500,1332615,36.51,342005,9.37,990610,7300000,200.0,Manhattan
1,FINANCIAL,R4-CONDOMINIUM,78,1985.0,126420,6633257,52.47,1762295,13.94,4870962,30690000,242.76,Manhattan
2,FINANCIAL,RR-CONDOMINIUM,500,,554174,17310000,31.24,3543000,6.39,13767000,90970000,164.15,Manhattan
3,FINANCIAL,R4-CONDOMINIUM,282,1930.0,249076,11776313,47.28,2784670,11.18,8991643,67556006,271.23,Manhattan
4,TRIBECA,R4-CONDOMINIUM,239,1985.0,219495,10004582,45.58,2783197,12.68,7221385,54320996,247.48,Manhattan


In [5]:
x = pd.get_dummies(housing[['units', 'sq_ft', 'boro']], drop_first=True)
#print(x)

In [6]:
y = housing["value_per_sq_ft"]

In [7]:
# Get Training and Testing (Validation) data
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.20)

In [8]:
# Score Linear Regression
lr = LinearRegression().fit(xTrain, yTrain)
#lr.score?
print(lr.score(xTest, yTest))

0.6543294824479968


# K-Fold Cross Validation

In [7]:
housing = pd.read_csv('housing.csv')

x = pd.get_dummies(housing[['units', 'sq_ft', 'boro']], drop_first=True)
y = housing["value_per_sq_ft"]

kf = KFold(n_splits=5)

coefs = []
scores = [] # R^2
RMSE = [] # Square root of Mean Squared Error 
RMAE = [] # Square root of Mean Absolute Error 

for trainIndex, testIndex in kf.split(x):
    #print(trainIndex, testIndex)

    # Use iLoc with Dataframe, no iLoc with list
    xTrain, xTest = x.iloc[trainIndex], x.iloc[testIndex]
    yTrain, yTest = y.iloc[trainIndex], y.iloc[testIndex]
    
    lr = LinearRegression().fit(xTrain, yTrain)
    
    # Coefficients/Parameters/Weights
    coefs.append(pd.DataFrame(lr.coef_))
    
    # Evaluation
    scores.append(lr.score(xTest, yTest)) # 'r^2' score
    
    # More creteria
    yPred = lr.predict(xTest)
    RMSE.append(np.sqrt(mean_squared_error(yPred, yTest)))
    RMAE.append(np.sqrt(mean_absolute_error(yPred, yTest)))      

In [8]:
print(scores)
print(RMSE)
print(RMAE)

print(np.mean(scores))
print(np.mean(RMSE))
print(np.mean(RMAE))

[0.027314162908477768, -0.5538362212170342, -0.15636371687906858, -0.32342020619393663, -1.692965558651288]
[48.50565100436402, 56.86049715104664, 62.814519127147435, 23.86714762978918, 50.58929461326704]
[6.291404732856928, 6.86217976719242, 6.946889246154169, 4.125021074523702, 6.41595480271768]
-0.5398543080065699
48.527421905122864
6.12828992468898


In [9]:
# As an alternative to all that...
model = LinearRegression()

scores = cross_val_score(model, x, y, cv=5) # scoring = 'r^2', by default
#cross_val_score?
print(scores)

[ 0.02731416 -0.55383622 -0.15636372 -0.32342021 -1.69296556]


# K-Fold Cross Validation with Regularization

In [10]:
from sklearn.linear_model import Lasso, Ridge

x = pd.get_dummies(housing[["units", "sq_ft", "boro"]], drop_first=True)
y = housing["value_per_sq_ft"]

kf = KFold(n_splits=5, shuffle=True)

coefs  = {}
scores = {} # R^2
RMSE   = {}
RMAE    = {}

# Constant that multiplies the L1 (Lasso) or L2 (Ridge) term.
alphas = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
for alpha in alphas:
    for trainIndex, testIndex in kf.split(x):

        xTrain, xTest = x.iloc[trainIndex], x.iloc[testIndex]
        yTrain, yTest = y.iloc[trainIndex], y.iloc[testIndex]

        lr = Lasso(alpha=alpha).fit(xTrain, yTrain)
        #lr = Ridge(alpha=alpha).fit(xTrain, yTrain)
        
        coefs[alpha] = pd.DataFrame(lr.coef_)

        yPred = lr.predict(xTest)

        scores[alpha] = lr.score(xTest, yTest)
        RMSE[alpha] = np.sqrt(mean_squared_error(yPred, yTest))
        RMAE[alpha] = np.sqrt(mean_absolute_error(yPred, yTest))

In [11]:
df1 = pd.DataFrame(list(scores.items()), columns = ['Alpha','R^2'])
df2 = pd.DataFrame(list(RMSE.items()), columns = ['Alpha','RMSE'])
df3 = pd.DataFrame(list(RMAE.items()), columns = ['Alpha','RMAE'])

pd.merge(pd.merge(df1, df2, on='Alpha'), df3, on='Alpha')

Unnamed: 0,Alpha,R^2,RMSE,RMAE
0,0.1,0.599488,44.945075,5.812412
1,0.2,0.590361,44.754986,5.841926
2,0.3,0.591975,42.822051,5.665421
3,0.4,0.581052,44.009686,5.835189
4,0.5,0.616653,42.602228,5.675589
5,0.6,0.526571,46.981524,5.953416
6,0.7,0.587262,44.528094,5.790746
7,0.8,0.527888,44.584244,5.778948
8,0.9,0.564607,44.441736,5.848046
9,1.0,0.552299,48.652862,6.069315


In [12]:
# Previously w/o regularization, we got -0.5399, 48.5274, 6.1283 for R^2, RMSE, RMAE