### IMPORTING LIBRARIES

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Importing dataset and splitting into features and dependent variables

In [60]:
# Read the raw data. Every column except the last one is a feature;
# the last column is the dependent variable.
dataset = pd.read_csv("Data.csv")
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

### Splitting into train and test data

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

### THESE FIRST 3 CODE BLOCKS ABOVE ARE USED BY ALL MODELS EXCEPT THE SVR MODEL

 ###   MULTIPLE LINEAR REGRESSION

In [63]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Fit the multiple linear regression model on the training split.
regressor = LinearRegression()
regressor.fit(x_train, y_train)

# Predict the held-out test split.
y_pred = regressor.predict(x_test)

# Print predictions (left column) next to the true values (right column)
# for a quick visual comparison.
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

# R^2 on the test split; read later by the results table.
mulitiple_linear_regression_score = r2_score(y_test, y_pred)

[[431.43 431.23]
 [458.56 460.01]
 [462.75 461.14]
 ...
 [469.52 473.26]
 [442.42 438.  ]
 [461.88 463.28]]


### POLYNOMIAL REGRESSION

In [64]:
# Polynomial regression: expand the features into polynomial terms, then fit
# a plain linear regression on the expanded features.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Create the power/interaction features up to degree 4.
# The expansion is fitted on the training split and reused as-is for the test split.
poly_reg = PolynomialFeatures(degree=4)
x_poly = poly_reg.fit_transform(x_train)

# Train the linear model on the polynomial features.
# (The extra, never-fitted `regressor` object that used to be created here was
# unused and has been removed.)
regressor_2 = LinearRegression()
regressor_2.fit(x_poly, y_train)

# Predict the test set; x_test must go through the same polynomial expansion.
y_pred = regressor_2.predict(poly_reg.transform(x_test))

np.set_printoptions(precision=2)

# Putting the predicted and test values side by side to visually compare
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

# Evaluating performance: R^2 on the test split, read later by the results table.
polynomial_regression_score = r2_score(y_test, y_pred)


[[433.94 431.23]
 [457.9  460.01]
 [460.52 461.14]
 ...
 [469.53 473.26]
 [438.27 438.  ]
 [461.66 463.28]]


### DECISION TREE REGRESSION

In [65]:
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree regressor on the training split (seeded for repeatability).
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(x_train, y_train)

# Predict the held-out test split.
y_pred = regressor.predict(x_test)

# Print predictions (left column) next to the true values (right column).
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

# R^2 on the test split; read later by the results table.
DecisionTreeRegressor_score = r2_score(y_test, y_pred)



[[431.28 431.23]
 [459.59 460.01]
 [460.06 461.14]
 ...
 [471.46 473.26]
 [437.76 438.  ]
 [462.74 463.28]]


### RANDOM FOREST REGRESSION

In [66]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Fit a random forest on the training split.
# n_estimators=10 means the forest is built from 10 trees.
regressor = RandomForestRegressor(random_state=0, n_estimators=10)
regressor.fit(x_train, y_train)

# Predict the held-out test split.
y_pred = regressor.predict(x_test)

# Print predictions (left column) next to the true values (right column).
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

# R^2 on the test split; read later by the results table.
RandomForestRegressor_score = r2_score(y_test, y_pred)


[[434.05 431.23]
 [458.79 460.01]
 [463.02 461.14]
 ...
 [469.48 473.26]
 [439.57 438.  ]
 [460.38 463.28]]


### SUPPORT VECTOR REGRESSION

In [67]:
# The SVR model is kept last and is self-contained: it needs feature scaling,
# so this cell reloads the data and builds its own split instead of reusing
# the arrays created by the earlier cells.
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

dataset = pd.read_csv("Data.csv")
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# StandardScaler expects a 2-D array as input, so y is reshaped into a
# single column before it is split and scaled.
y = y.reshape(len(y), 1)

# Splitting into training and testing data (same seed and ratio as the other models).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.2)

# Feature scaling. Note: scaling is done after splitting, and both scalers
# are fitted on the training split only.
sc_x = StandardScaler()
sc_y = StandardScaler()

x_train = sc_x.fit_transform(x_train)
y_train = sc_y.fit_transform(y_train)

# Train the SVR model on the scaled training split.
# SVR.fit expects a 1-D target; passing the (n, 1) column directly triggers a
# DataConversionWarning, so the column is flattened with ravel().
regressor = SVR(kernel='rbf')
regressor.fit(x_train, y_train.ravel())

# Predict in scaled space, then map back to the original units of y so the
# result is comparable with y_test (which was never scaled).
# The 1-D predictions are reshaped to (n_samples, 1) because that is the layout
# sc_y was fitted on; y_pred therefore already has the same shape as y_test and
# no transpose is needed.
y_pred_scaled = regressor.predict(sc_x.transform(x_test))
y_pred = sc_y.inverse_transform(y_pred_scaled.reshape(-1, 1))

np.set_printoptions(precision=2)

# Putting the predicted and test values side by side to visually compare
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

# Evaluating performance: R^2 on the test split, read later by the results table.
support_vector_regression_score = r2_score(y_test, y_pred)


  y = column_or_1d(y, warn=True)


[[434.05 431.23]
 [457.94 460.01]
 [461.03 461.14]
 ...
 [470.6  473.26]
 [439.42 438.  ]
 [460.92 463.28]]


### VISUALIZING THE RESULTS 

In [72]:
# Gather every model's test-set R^2 score into one table for comparison.
model_names = [
    'Multiple Regression',
    'Polynomial regression',
    'Decision tree',
    'Random forest',
    'Support vector regression',
]
model_scores = [
    mulitiple_linear_regression_score,
    polynomial_regression_score,
    DecisionTreeRegressor_score,
    RandomForestRegressor_score,
    support_vector_regression_score,
]

data = {'MODEL': model_names, 'SCORE': model_scores}
result = pd.DataFrame(data=data)

print(result)
print()
# Highest R^2 among the compared models.
print('Maximum_Score = ', result['SCORE'].max())

                       MODEL     SCORE
0        Multiple Regression  0.932532
1      Polynomial regression  0.945819
2              Decision tree  0.922906
3              Random forest  0.961591
4  Support vector regression  0.948078

Maximum_Score =  0.9615908334363876


### COMMENTS

## Regression models involve two kinds of parameters:
1. The parameters that are learnt from the data (the coefficients).
2. The hyperparameters. For the models above we have only used the default values of these parameters; we haven't searched for their
optimal values, so the models could reach even higher performance with tuning.
3. From the results above, we can see that the Random Forest regression model performs best on this dataset.
