In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

In [2]:
# Load the fish measurements from a relative path (resolved against the
# current working directory).
fish_csv_path = "../fish_participant.csv"
dataset = pd.read_csv(fish_csv_path)

In [3]:
# Print the loaded frame to inspect its columns and overall size.
print(dataset)

    Species  Weight  Length1  Length2  Length3   Height   Width
0     Bream   430.0     26.5     29.0     34.0  12.4440  5.1340
1     Perch   110.0     20.0     22.0     23.5   5.5225  3.9950
2     Roach   160.0     20.5     22.5     25.3   7.0334  3.8203
3    Parkki    60.0     14.3     15.5     17.4   6.5772  2.3142
4     Bream   700.0     30.4     33.0     38.3  14.8604  5.2854
..      ...     ...      ...      ...      ...      ...     ...
106   Perch   197.0     23.5     25.6     27.0   6.5610  4.2390
107  Parkki   140.0     19.0     20.7     23.2   8.5376  3.2944
108   Roach   110.0     19.1     20.8     23.1   6.1677  3.3957
109   Perch   685.0     34.0     36.5     39.0  10.8810  6.8640
110   Perch   300.0     26.9     28.7     30.1   7.5852  4.6354

[111 rows x 7 columns]


In [4]:
# Show every row that has at least one missing value; an empty frame means
# the data contains no nulls.
has_missing_value = dataset.isnull().any(axis=1)
print(dataset[has_missing_value])

Empty DataFrame
Columns: [Species, Weight, Length1, Length2, Length3, Height, Width]
Index: []


In [5]:
# List the distinct species labels present in the data
dataset["Species"].unique()

array(['Bream', 'Perch', 'Roach', 'Parkki', 'Smelt', 'Whitefish', 'Pike'],
      dtype=object)

In [6]:
# Derive a "Girth" column as the product Length1 * Width, then keep only the
# columns used for modelling, ordered so that Weight comes last.
# NOTE(review): "Girth" here is a computed product, not a measured value —
# confirm the name is intended.
kept_columns = ["Species", "Length2", "Height", "Width", "Girth", "Weight"]
dataset = dataset.assign(Girth=lambda frame: frame["Length1"] * frame["Width"])[kept_columns]
dataset

Unnamed: 0,Species,Length2,Height,Width,Girth,Weight
0,Bream,29.0,12.4440,5.1340,136.05100,430.0
1,Perch,22.0,5.5225,3.9950,79.90000,110.0
2,Roach,22.5,7.0334,3.8203,78.31615,160.0
3,Parkki,15.5,6.5772,2.3142,33.09306,60.0
4,Bream,33.0,14.8604,5.2854,160.67616,700.0
...,...,...,...,...,...,...
106,Perch,25.6,6.5610,4.2390,99.61650,197.0
107,Parkki,20.7,8.5376,3.2944,62.59360,140.0
108,Roach,20.8,6.1677,3.3957,64.85787,110.0
109,Perch,36.5,10.8810,6.8640,233.37600,685.0


In [7]:
# Split the frame positionally: every column except the last forms the feature
# matrix X, and the last column forms the target vector y.
feature_frame = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X, y = feature_frame.values, target_column.values

## Encoding categorical data

In [8]:
# Dummy variables: one-hot encode the categorical column at index 0 and pass
# the remaining columns through unchanged.
# Note: X is overwritten, so re-running this cell on its own would encode the
# already-encoded array; re-create X first.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# sparse_threshold=0 makes the transformer always return a dense array. Without
# it, a sufficiently sparse result comes back as a SciPy sparse matrix, which
# np.array() would wrap in a 0-d object array instead of converting.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

print(X)


[[1.0 0.0 0.0 ... 12.444 5.134 136.05100000000002]
 [0.0 0.0 1.0 ... 5.5225 3.995 79.9]
 [0.0 0.0 0.0 ... 7.0334 3.8203 78.31615000000001]
 ...
 [0.0 0.0 0.0 ... 6.1677 3.3957 64.85787]
 [0.0 0.0 1.0 ... 10.880999999999998 6.864 233.376]
 [0.0 0.0 1.0 ... 7.5852 4.6354 124.69225999999999]]


## Multiple Linear Regression

## Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


## Training the Multiple Linear Regression model on the Training set

In [10]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. The fit() call is left as the
# cell's last expression so the fitted estimator is displayed.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting the Test set results

In [11]:
# Predict on the held-out rows and print the predictions beside the true
# values (column 0 = predicted, column 1 = actual), shown with 2 decimals.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
predicted_vs_actual = np.column_stack((y_pred, y_test))
print(predicted_vs_actual)

[[  19.03   13.4 ]
 [ 995.91  925.  ]
 [ 321.85  270.  ]
 [ 152.43  160.  ]
 [ 942.65 1100.  ]
 [ 133.65  120.  ]
 [ 446.19  450.  ]
 [ 259.21  250.  ]
 [ 685.59  690.  ]
 [ 733.27  720.  ]
 [ 220.23  200.  ]
 [  58.88   55.  ]
 [ 936.17  950.  ]
 [ 274.13  273.  ]
 [ 154.27  170.  ]
 [ 867.13  850.  ]
 [1137.31 1550.  ]
 [ 247.24  260.  ]
 [  63.04   69.  ]
 [ 890.2   920.  ]
 [  11.02    9.8 ]
 [  18.28    6.7 ]
 [ 538.81  514.  ]]


## Evaluating the Model Performance - R2 Score

In [12]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the test split
r2_score(y_true=y_test, y_pred=y_pred)

0.948127647953537

## Evaluating the Model Performance - Mean Squared Error

In [13]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions on the test split
mean_squared_error(y_true=y_test, y_pred=y_pred)

8971.889907062252

## Evaluating the Model Performance - k-Fold Cross Validation

In [14]:
# Applying k-Fold Cross Validation (10 folds on the training split)

from sklearn.model_selection import cross_val_score

# For a regressor, cross_val_score's default metric is the estimator's own
# score(), which is R^2 — not a classification accuracy. Request it explicitly
# and label the printed values accordingly.
cv_r2_scores = cross_val_score(estimator=regressor, X=X_train, y=y_train, cv=10, scoring="r2")
accuracies = cv_r2_scores  # original variable name, kept so any later reference still works
print("Mean R2: {:.2f} %".format(cv_r2_scores.mean()*100))
print("R2 Standard Deviation: {:.2f} %".format(cv_r2_scores.std()*100))

Accuracy: 96.83 %
Standard Deviation: 1.84 %
