In [176]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt




In [153]:
df = pd.read_csv('processed.csv', index_col=0)

df = df.sample(frac=1, random_state=1)

X = df.drop(labels=['Price'], axis=1, inplace=False)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear Regression

In [154]:
lin_reg = LinearRegression()



model = lin_reg.fit(X_train, y_train)

train_cross_val = cross_val_score(lin_reg, X_train, y_train, cv=10, scoring='r2')


In [155]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.63896017311228
Test set Mean Squared Error: 0.23894763008951134


In [156]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.6211271008675674 Standard Deviation: 0.0870258027178451


### DT

In [157]:
DT = DecisionTreeRegressor() ## since it seems the first 11 features capture good variance for PCA

model = DT.fit(X_train, y_train)

train_cross_val = cross_val_score(DT, X_train, y_train, cv=10, scoring='r2')


In [158]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.4873187892286531
Test set Mean Squared Error: 0.33930871660685835


In [159]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.4254033016323029 Standard Deviation: 0.13121943911950476


In [160]:
DT.max_depth = 11

model = DT.fit(X_train, y_train)

train_cross_val = cross_val_score(DT, X_train, y_train, cv=10, scoring='r2')

In [161]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.5424940041026036
Test set Mean Squared Error: 0.302792006116882


In [162]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.5003286385600598 Standard Deviation: 0.13212748767728455


### Random Forest

In [163]:
RF = RandomForestRegressor()

model = RF.fit(X_train, y_train)

train_cross_val = cross_val_score(RF, X_train, y_train, cv=10, scoring='r2')


In [164]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.6722963803471931
Test set Mean Squared Error: 0.21688466882670163


In [165]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.679884465281173 Standard Deviation: 0.08498967257200474


In [166]:
RF.max_depth = 11

model = RF.fit(X_train, y_train)

train_cross_val = cross_val_score(RF, X_train, y_train, cv=10, scoring='r2')

In [167]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.6748929966521261
Test set Mean Squared Error: 0.21516614564419273


In [168]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.6834603405120394 Standard Deviation: 0.08884841597807833


### SVM

In [169]:
SVR = LinearSVR()

model = SVR.fit(X_train, y_train)

train_cross_val = cross_val_score(SVR, X_train, y_train, cv=10, scoring='r2')



In [170]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared -1.0155537408158684
Test set Mean Squared Error: 1.3339575133238075


In [171]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: -3.482247694000472 Standard Deviation: 7.478573581834738


### MLP

In [172]:
MLP = MLPRegressor()

model = SVR.fit(X_train, y_train)

train_cross_val = cross_val_score(MLP, X_train, y_train, cv=10, scoring='r2')



In [173]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared -2.6529522374966588
Test set Mean Squared Error: 2.4176398695522776


In [174]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: -3.8603944251305427 Standard Deviation: 4.021420849804941


### KNN

In [177]:
KNN = KNeighborsRegressor()

model = KNN.fit(X_train, y_train)

train_cross_val = cross_val_score(KNN, X_train, y_train, cv=10, scoring='r2')

In [178]:
print(f'Test set R Squared {model.score(X_test, y_test)}')
print(f'Test set Mean Squared Error: {mean_squared_error(y_test, model.predict(X_test))}')

Test set R Squared 0.5219831978116146
Test set Mean Squared Error: 0.3163667094860509


In [179]:
print(f'Cross validation mean R Squared: {train_cross_val.mean()} Standard Deviation: {train_cross_val.std()}')

Cross validation mean R Squared: 0.48325795254395815 Standard Deviation: 0.09535544513121608
