In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

In [2]:
# Load the fish measurements; the path is resolved relative to the current working directory.
csv_path = "../fish_participant.csv"
dataset = pd.read_csv(csv_path)

In [3]:
# Print the whole frame as plain text; pandas abbreviates the middle rows with "..".
print(dataset)

    Species  Weight  Length1  Length2  Length3   Height   Width
0     Bream   430.0     26.5     29.0     34.0  12.4440  5.1340
1     Perch   110.0     20.0     22.0     23.5   5.5225  3.9950
2     Roach   160.0     20.5     22.5     25.3   7.0334  3.8203
3    Parkki    60.0     14.3     15.5     17.4   6.5772  2.3142
4     Bream   700.0     30.4     33.0     38.3  14.8604  5.2854
..      ...     ...      ...      ...      ...      ...     ...
106   Perch   197.0     23.5     25.6     27.0   6.5610  4.2390
107  Parkki   140.0     19.0     20.7     23.2   8.5376  3.2944
108   Roach   110.0     19.1     20.8     23.1   6.1677  3.3957
109   Perch   685.0     34.0     36.5     39.0  10.8810  6.8640
110   Perch   300.0     26.9     28.7     30.1   7.5852  4.6354

[111 rows x 7 columns]


In [4]:
# Null check: select only the rows holding at least one missing value.
# An empty frame in the output means there is nothing to impute or drop.
has_missing = dataset.isnull().any(axis=1)
print(dataset[has_missing])


Empty DataFrame
Columns: [Species, Weight, Length1, Length2, Length3, Height, Width]
Index: []


In [5]:
# List the distinct species labels present in the data.
dataset["Species"].unique()

array(['Bream', 'Perch', 'Roach', 'Parkki', 'Smelt', 'Whitefish', 'Pike'],
      dtype=object)

In [6]:
# Derive the "Girth" feature (Length1 * Width) and keep only the modelling columns.
# Length1 and Length3 are dropped here, and Weight is placed last so that positional
# slicing can treat the final column as the target.
# NOTE: the cell rebinds `dataset`, so re-running it on the reduced frame raises a
# KeyError because Length1 is no longer present.
dataset = (
    dataset
    .assign(Girth=lambda frame: frame["Length1"] * frame["Width"])
    .loc[:, ["Species", "Length2", "Height", "Width", "Girth", "Weight"]]
)

dataset

Unnamed: 0,Species,Length2,Height,Width,Girth,Weight
0,Bream,29.0,12.4440,5.1340,136.05100,430.0
1,Perch,22.0,5.5225,3.9950,79.90000,110.0
2,Roach,22.5,7.0334,3.8203,78.31615,160.0
3,Parkki,15.5,6.5772,2.3142,33.09306,60.0
4,Bream,33.0,14.8604,5.2854,160.67616,700.0
...,...,...,...,...,...,...
106,Perch,25.6,6.5610,4.2390,99.61650,197.0
107,Parkki,20.7,8.5376,3.2944,62.59360,140.0
108,Roach,20.8,6.1677,3.3957,64.85787,110.0
109,Perch,36.5,10.8810,6.8640,233.37600,685.0


In [7]:
# Every column except the last is a feature; the last column is the target.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]

X = feature_frame.values
y = target_series.values

## Encoding categorical data



In [8]:

# Dummy variables: one-hot encode column 0 of X and pass the remaining
# columns through unchanged. The encoded columns come first in the result.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

species_encoder = OneHotEncoder()
ct = ColumnTransformer(
    transformers=[("encoder", species_encoder, [0])],
    remainder="passthrough",
)
encoded_features = ct.fit_transform(X)
X = np.array(encoded_features)

print(X)

[[1.0 0.0 0.0 ... 12.444 5.134 136.05100000000002]
 [0.0 0.0 1.0 ... 5.5225 3.995 79.9]
 [0.0 0.0 0.0 ... 7.0334 3.8203 78.31615000000001]
 ...
 [0.0 0.0 0.0 ... 6.1677 3.3957 64.85787]
 [0.0 0.0 1.0 ... 10.880999999999998 6.864 233.376]
 [0.0 0.0 1.0 ... 7.5852 4.6354 124.69225999999999]]


In [9]:
# Turn the target into a single-column 2-D array (one row per sample).
y = y.reshape(-1, 1)


## Splitting the dataset into the Training set and Test set



In [10]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling



In [11]:
from sklearn.preprocessing import StandardScaler

# Separate scalers for features and target, each fitted on the training split only.
# X_test / y_test are left untouched in this cell.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)

sc_y = StandardScaler()
sc_y.fit(y_train)
y_train = sc_y.transform(y_train)

## Training the SVR model on the Training set



In [12]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor trained on the standardized training data.
regressor = SVR(kernel = 'rbf')
# y_train is an (n, 1) column because StandardScaler needs 2-D input, but SVR.fit
# expects a 1-D target. Flattening it here avoids the DataConversionWarning
# ("y = column_or_1d(y, warn=True)") without changing the fitted model.
regressor.fit(X_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

## Predicting the Test set results



In [13]:
# X_test was never standardized, so apply the scaler fitted on the training
# features before predicting.
scaled_pred = regressor.predict(sc_X.transform(X_test))

# SVR.predict returns a 1-D array, whereas StandardScaler.inverse_transform accepts
# only 2-D input (current scikit-learn raises ValueError on 1-D). Reshape to a column
# for the scaler, then flatten so y_pred remains 1-D on the original Weight scale.
y_pred = sc_y.inverse_transform(scaled_pred.reshape(-1, 1)).ravel()

# Show predictions (left column) next to the true weights (right column).
# Note: set_printoptions changes NumPy's print precision for the rest of the session.
np.set_printoptions(precision=2)
predicted_column = y_pred.reshape(len(y_pred), 1)
actual_column = y_test.reshape(len(y_test), 1)
print(np.concatenate((predicted_column, actual_column), 1))

[[  37.79   13.4 ]
 [ 888.49  925.  ]
 [ 299.52  270.  ]
 [ 168.65  160.  ]
 [ 882.74 1100.  ]
 [ 112.29  120.  ]
 [ 462.82  450.  ]
 [ 255.3   250.  ]
 [ 695.23  690.  ]
 [ 747.82  720.  ]
 [ 205.09  200.  ]
 [  93.2    55.  ]
 [ 809.91  950.  ]
 [ 252.48  273.  ]
 [ 142.11  170.  ]
 [ 859.34  850.  ]
 [ 843.63 1550.  ]
 [ 235.3   260.  ]
 [  93.85   69.  ]
 [ 844.61  920.  ]
 [  37.72    9.8 ]
 [  40.71    6.7 ]
 [ 546.26  514.  ]]


## Evaluating the Model Performance - R2 Score




In [14]:

from sklearn.metrics import r2_score

# Coefficient of determination of the test-set predictions (original Weight scale).
r2_score(y_true=y_test, y_pred=y_pred)


0.8536066274191271

## Evaluating the Model Performance - Mean-squared Error

In [15]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the test-set predictions, in squared Weight units.
mean_squared_error(y_true=y_test, y_pred=y_pred)

25320.33289608073

In [16]:
## Evaluating the Model - Mean absolute error
from sklearn.metrics import mean_absolute_error

# Average absolute deviation between predicted and true weights on the test set.
mean_absolute_error(y_true=y_test, y_pred=y_pred)



67.03693306761143

In [17]:
## Applying k-Fold Cross Validation

from sklearn.model_selection import cross_val_score

# y_train is an (n, 1) column; SVR expects a 1-D target, and passing the column
# emits one DataConversionWarning per fold. Flatten it for the call.
# NOTE(review): X_train / y_train were standardized on the full training split
# before this call, so each validation fold has influenced the scaling statistics.
# Wrapping the scaler and the SVR in a Pipeline would avoid that leakage.
accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train.ravel(), cv = 10)

# No `scoring` argument is given, so the estimator's own score() is used, which for
# a regressor is R^2. The "Accuracy" printed below is therefore the mean R^2 over
# the 10 folds expressed as a percentage, not a classification accuracy.
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))


Accuracy: 89.98 %
Standard Deviation: 10.25 %


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
