## Building the Regression Model

In [None]:
# Mount Google Drive into the Colab runtime so files under it can be read.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Import Pandas Library, used for data manipulation
# Import matplotlib, used to plot our data
# Import numpy for mathematical operations
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Location of Fish.csv on the mounted Google Drive.
fish_csv_path = "/content/drive/MyDrive/Colab Notebooks/Maze/Regression Model Evaluation/Fish.csv"

# Read the dataset into a dataframe and preview its first rows.
fish_data = pd.read_csv(fish_csv_path)
fish_data.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [None]:
# Give the columns descriptive names. The list is applied positionally, so its
# order must match the order of the columns already in the dataframe.
renamed_columns = [
    'Species',
    'Weight',
    'Vertical_length',
    'Diagonal_length',
    'Cross_length',
    'Height',
    'Width',
]
fish_data.columns = renamed_columns

# Show the column index to confirm the new names.
fish_data.columns

Index(['Species', 'Weight', 'Vertical_length', 'Diagonal_length',
       'Cross_length', 'Height', 'Width'],
      dtype='object')

In [None]:
# One-hot encode the Species feature: each species becomes its own
# Species_<name> indicator column, while the numeric columns are kept as they are.
fish_data = pd.get_dummies(data=fish_data)

# Preview the last rows to inspect the newly added indicator columns.
fish_data.tail()

Unnamed: 0,Weight,Vertical_length,Diagonal_length,Cross_length,Height,Width,Species_Bream,Species_Parkki,Species_Perch,Species_Pike,Species_Roach,Species_Smelt,Species_Whitefish
154,12.2,11.5,12.2,13.4,2.0904,1.3936,0,0,0,0,0,1,0
155,13.4,11.7,12.4,13.5,2.43,1.269,0,0,0,0,0,1,0
156,12.2,12.1,13.0,13.8,2.277,1.2558,0,0,0,0,0,1,0
157,19.7,13.2,14.3,15.2,2.8728,2.0672,0,0,0,0,0,1,0
158,19.9,13.8,15.0,16.2,2.9322,1.8792,0,0,0,0,0,1,0


In [None]:
# View the shape of the dataframe as (rows, columns) after one-hot encoding.
fish_data.shape

(159, 13)

In [None]:
# Input data: every column except the target.
# `columns=` is used rather than passing the axis positionally
# (`drop(['Weight'], 1)`): the positional form is deprecated in pandas 1.x
# and raises a TypeError in pandas 2.x.
X = fish_data.drop(columns=['Weight'])
# Target variable: the fish weight we want to predict.
y = fish_data['Weight']

# Split data: hold out 15% of the rows for testing. The fixed random_state
# makes the split reproducible across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

  X = fish_data.drop(['Weight'], 1)


In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: a plain linear regression trained on the original features.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

## Evaluating the regression model

In [None]:
# Predict weights for the held-out test rows and print the raw predictions.
y_pred = model.predict(X=X_test)
print(y_pred)

[  22.83330062   23.41273287  187.29483799  310.45293863  208.87406165
  781.04848248  -52.19610215  253.55251651  261.83735593 1151.75626825
  599.51515345  830.9367517   548.24025022  140.03721841  691.31264199
  836.13483002 1021.39702132  286.88161176  231.66639764  587.73721669
   -2.27001926  592.30651037  509.16439696  490.44424842]


In [None]:
# Mean squared error between the true test weights and the predictions.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true=y_test, y_pred=y_pred)

5828.016723991149

In [None]:
# Root mean squared error, computed explicitly as the square root of the MSE.
# The `squared=False` shortcut of mean_squared_error is deprecated in
# scikit-learn 1.4 and scheduled for removal, so taking the root with numpy
# keeps this cell working on both old and new scikit-learn versions.
np.sqrt(mean_squared_error(y_test, y_pred))

76.34144827019689

In [None]:
from sklearn.metrics import r2_score

# R2 (coefficient of determination) of the baseline model on the test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.9565882607957029

In [None]:
# Ingredients of the adjusted R2 score:
#   r2 - plain R2 on the test set
#   n  - number of test examples
#   k  - number of features (columns of the test dataframe)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
n = y_test.shape[0]
k = X_test.shape[1]

# Adjusted R2 penalises R2 for the number of features used:
#   1 - (1 - R2) * (n - 1) / (n - k - 1)
adj_r2_score = 1 - (1 - r2) * (n - 1) / (n - k - 1)

adj_r2_score

0.9092299998455606

### Adding features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the inputs with all polynomial terms up to the third degree.
poly = PolynomialFeatures(degree=3)
X_degree3 = poly.fit(X).transform(X)

# The number of columns of the transformed data is the number of features.
X_degree3.shape

(159, 455)

In [None]:
# Split the degree-3 feature matrix with the same test fraction and seed as
# the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X_degree3,
    y,
    test_size=0.15,
    random_state=42,
)

# Fit a linear regression on the expanded training features.
degree3_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test rows.
y_pred = degree3_model.predict(X=X_test)

In [None]:
# R2 of the degree-3 model on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

-10153.97150998124

In [None]:
# Score the degree-3 model on the data it was trained on, for comparison
# with its test-set score.
y_train_pred = degree3_model.predict(X=X_train)

r2_score(y_true=y_train, y_pred=y_train_pred)

0.9999260622078828

### Improving our model

In [None]:
# Transform the data to add interaction terms only: products of distinct
# features, with no pure powers of a single feature. The degree is the
# default of 2, so the added terms are pairwise products.
# Interaction terms capture the joint effect of two or more variables on the
# dependent variable.
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_interaction = poly.fit(X).transform(X)

In [None]:
# Check the number of features: this is the number of columns of the
# transformed data, i.e. the second entry of the (rows, columns) shape.
X_interaction.shape

(159, 79)

In [None]:
# Split the interaction feature matrix with the same test fraction and seed
# as the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_interaction,
    y,
    test_size=0.15,
    random_state=42,
)

# Fit a linear regression on the interaction features.
interaction_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test rows.
y_pred = interaction_model.predict(X=X_test)

In [None]:
# R2 of the interaction model on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.9643044521027049

In [None]:
# Score the interaction model on the data it was trained on, for comparison
# with its test-set score.
y_train_pred = interaction_model.predict(X=X_train)

r2_score(y_true=y_train, y_pred=y_train_pred)

0.9827158071698887

In [None]:
# Ingredients of the adjusted R2 score:
#   r2 - plain R2 on the test set
#   n  - number of test examples
#   k  - number of columns of the transformed test matrix
r2 = r2_score(y_true=y_test, y_pred=y_pred)
n = y_test.shape[0]
k = X_test.shape[1]

# Adjusted R2: 1 - (1 - R2) * (n - 1) / (n - k - 1).
# When k >= n - 1 the denominator is zero or negative, so the formula no
# longer yields a meaningful score.
adj_r2_score = 1 - (1 - r2) * (n - 1) / (n - k - 1)

adj_r2_score

1.0146606714578177

In [None]:
# Show the number of test examples (n) next to the number of features (k).
print(n, ',', k, sep=' ')

24 , 79


Since the number of test examples (n) is smaller than the number of features in our model (k), the denominator n - k - 1 of the adjusted R² formula is negative. The adjusted R² therefore comes out greater than 1, so it is not a useful evaluation metric in this case.