Model Selection

In [115]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.metrics import r2_score

In [116]:
dataset = pd.read_csv("dataset/Data.csv")
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

Splitting the dataset into training and test set

In [117]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [118]:
print(X_train)   

[[ 752904      10       1 ...       5       4       1]
 [1218860       1       1 ...       3       1       1]
 [ 411453       5       1 ...       3       1       1]
 ...
 [1214092       1       1 ...       1       1       1]
 [1303489       3       1 ...       2       1       1]
 [ 378275      10       9 ...       7       7       1]]


Multiple Linear Regression

In [122]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate(( y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))
r2_score(y_test, y_pred)

[[2.25 2.  ]
 [2.03 2.  ]
 [3.52 4.  ]
 [3.7  4.  ]
 [1.89 2.  ]
 [2.09 2.  ]
 [2.26 2.  ]
 [3.94 4.  ]
 [2.07 2.  ]
 [1.94 2.  ]
 [4.   4.  ]
 [2.17 2.  ]
 [4.23 4.  ]
 [2.04 2.  ]
 [2.22 2.  ]
 [2.7  2.  ]
 [3.58 4.  ]
 [4.11 4.  ]
 [4.21 4.  ]
 [2.18 2.  ]
 [2.03 2.  ]
 [2.11 2.  ]
 [3.12 4.  ]
 [1.99 2.  ]
 [3.83 4.  ]
 [3.9  4.  ]
 [2.2  2.  ]
 [1.92 2.  ]
 [2.1  2.  ]
 [3.74 4.  ]
 [1.88 2.  ]
 [4.31 4.  ]
 [4.59 4.  ]
 [2.07 2.  ]
 [2.05 2.  ]
 [2.07 2.  ]
 [3.83 4.  ]
 [3.63 4.  ]
 [2.05 2.  ]
 [3.9  4.  ]
 [2.26 2.  ]
 [2.43 2.  ]
 [2.01 2.  ]
 [1.96 2.  ]
 [1.88 2.  ]
 [2.13 2.  ]
 [1.97 2.  ]
 [4.23 4.  ]
 [2.13 2.  ]
 [2.11 2.  ]
 [3.77 4.  ]
 [2.15 2.  ]
 [3.49 4.  ]
 [2.13 2.  ]
 [2.18 2.  ]
 [2.31 2.  ]
 [3.8  4.  ]
 [3.48 2.  ]
 [2.4  2.  ]
 [3.88 4.  ]
 [1.96 2.  ]
 [1.98 2.  ]
 [2.02 2.  ]
 [1.99 2.  ]
 [2.07 2.  ]
 [1.96 2.  ]
 [1.97 2.  ]
 [2.09 2.  ]
 [3.64 4.  ]
 [2.84 4.  ]
 [1.94 2.  ]
 [2.05 2.  ]
 [1.92 2.  ]
 [2.07 2.  ]
 [2.32 2.  ]
 [2.13 2.  ]
 [3.8  4.  ]

0.8354489501242135

Polynomial Linear Regression

In [110]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)
y_pred = regressor.predict(poly_reg.transform(X_test))
r2_score(y_test, y_pred)

0.6085483223694554

Decision Tree Model

In [111]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
r2_score(y_test, regressor.predict(X_test))

0.6535632183908047

Random Forest

In [112]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(X_train, y_train)
r2_score(y_test, regressor.predict(X_test))

0.8447333333333333

Support Vector Regression (SVR)

In [113]:
from sklearn.preprocessing import StandardScaler
#for the SVN we need to reshape the y axis
y_reshaped = y.reshape(len(y), 1)

X_train_svr, X_test_svr, y_train_svr, y_test_svr = train_test_split(X, y_reshaped, test_size=0.2, random_state=0)
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train_svr = scaler_X.fit_transform(X_train_svr)
y_train_svr = scaler_y.fit_transform(y_train_svr)

from sklearn.svm import SVR
regressor = SVR(kernel="rbf")
regressor.fit(X_train_svr, y_train_svr)

y_pred = scaler_y.inverse_transform(regressor.predict(scaler_X.transform(X_test_svr)))
    # r2_score(X_test_svr, y_)

  y = column_or_1d(y, warn=True)
