In [129]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt




In [130]:
df = pd.read_csv('processed.csv', index_col=0)

df = df.sample(frac=1, random_state=1)

X = df.drop(labels=['Price'], axis=1, inplace=False)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear Regression

In [131]:
lin_reg = LinearRegression()



model = lin_reg.fit(X_train, y_train)

train_cross_val = cross_val_score(lin_reg, X_train, y_train, cv=10, scoring='r2')


In [132]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.6138758676100822
Test set Mean Squared Error: 0.20633639318812177


In [133]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.6300993680040909 Standard Deviation: 0.04538360053876054


### DT

In [134]:
DT = DecisionTreeRegressor() ## since it seems the first 11 features capture good variance for PCA

model = DT.fit(X_train, y_train)

train_cross_val = cross_val_score(DT, X_train, y_train, cv=10, scoring='r2')


In [135]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.42393571643518857
Test set Mean Squared Error: 0.30783630585210797


In [136]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.3990889404532899 Standard Deviation: 0.11019763267000438


In [137]:
DT.max_depth = 11

model = DT.fit(X_train, y_train)

train_cross_val = cross_val_score(DT, X_train, y_train, cv=10, scoring='r2')

In [138]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.5181405407709556
Test set Mean Squared Error: 0.25749528325387816


In [139]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.4504852887333743 Standard Deviation: 0.0827610490448805


### Random Forest

In [140]:
RF = RandomForestRegressor()

model = RF.fit(X_train, y_train)

train_cross_val = cross_val_score(RF, X_train, y_train, cv=10, scoring='r2')


In [141]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.6783368999844214
Test set Mean Squared Error: 0.17188981032633738


In [142]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.6657367216736787 Standard Deviation: 0.07030130834636664


In [143]:
RF.max_depth = 11

model = RF.fit(X_train, y_train)

train_cross_val = cross_val_score(RF, X_train, y_train, cv=10, scoring='r2')

In [144]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.6823256272051472
Test set Mean Squared Error: 0.16975832068583815


In [145]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.6714660933305339 Standard Deviation: 0.06378921686048301


### SVM

In [146]:
SVR = LinearSVR()

model = SVR.fit(X_train, y_train)

train_cross_val = cross_val_score(SVR, X_train, y_train, cv=10, scoring='r2')



In [147]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared -2.7345099463461113
Test set Mean Squared Error: 1.9956414220598016


In [148]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: -2.249697484341862 Standard Deviation: 3.540826428769984


### MLP

In [149]:
MLP = MLPRegressor()

model = SVR.fit(X_train, y_train)

train_cross_val = cross_val_score(MLP, X_train, y_train, cv=10, scoring='r2')



In [150]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared -10.629955426800402
Test set Mean Squared Error: 6.214796886306391


In [151]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: -6.9621960138002965 Standard Deviation: 4.121610416785178
