In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder

In [4]:
df = pd.read_csv("./datasets/fish.csv")
df.head(10)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134
5,Bream,450.0,26.8,29.7,34.7,13.6024,4.9274
6,Bream,500.0,26.8,29.7,34.5,14.1795,5.2785
7,Bream,390.0,27.6,30.0,35.0,12.67,4.69
8,Bream,450.0,27.6,30.0,35.1,14.0049,4.8438
9,Bream,500.0,28.5,30.7,36.2,14.2266,4.9594


<font size="5">Converting categorical data to numerical</font>

In [5]:
le = LabelEncoder()
# Creates an object

labels = le.fit_transform(df["Species"])
# Fits categorical data and turns it into numerical data

le.classes_
# Displays classes

array(['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'],
      dtype=object)

In [6]:
print(labels)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 6 6 6 6 1 1 1 1 1 1 1 1 1 1 1 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 5 5 5
 5 5 5 5 5 5 5 5 5 5 5]


In [7]:
df = df.drop("Species", axis = "columns")
df.head(10)
# Removes the species column from the dataframe

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
0,242.0,23.2,25.4,30.0,11.52,4.02
1,290.0,24.0,26.3,31.2,12.48,4.3056
2,340.0,23.9,26.5,31.1,12.3778,4.6961
3,363.0,26.3,29.0,33.5,12.73,4.4555
4,430.0,26.5,29.0,34.0,12.444,5.134
5,450.0,26.8,29.7,34.7,13.6024,4.9274
6,500.0,26.8,29.7,34.5,14.1795,5.2785
7,390.0,27.6,30.0,35.0,12.67,4.69
8,450.0,27.6,30.0,35.1,14.0049,4.8438
9,500.0,28.5,30.7,36.2,14.2266,4.9594


In [8]:
df["specs"] = labels
df.head(100)
# Adds labels to the df

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,specs
0,242.0,23.2,25.4,30.0,11.5200,4.0200,0
1,290.0,24.0,26.3,31.2,12.4800,4.3056,0
2,340.0,23.9,26.5,31.1,12.3778,4.6961,0
3,363.0,26.3,29.0,33.5,12.7300,4.4555,0
4,430.0,26.5,29.0,34.0,12.4440,5.1340,0
...,...,...,...,...,...,...,...
95,170.0,21.5,23.5,25.0,6.2750,3.7250,2
96,225.0,22.0,24.0,25.5,7.2930,3.7230,2
97,145.0,22.0,24.0,25.5,6.3750,3.8250,2
98,188.0,22.6,24.6,26.2,6.7334,4.1658,2


In [15]:
msk = np.random.rand(len(df)) < 0.8
trainset = df[msk]
testset = df[~msk]

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,specs
3,363.0,26.3,29.0,33.5,12.73,4.4555,0
10,475.0,28.4,31.0,36.2,14.2628,5.1042,0
12,500.0,29.1,31.5,36.4,13.7592,4.368,0
14,600.0,29.4,32.0,37.2,14.9544,5.1708,0
33,975.0,37.4,41.0,45.9,18.6354,6.7473,0


In [10]:
model = linear_model.LinearRegression()

train_x = np.asanyarray(trainset[["Length1","Length2","Length3","Height","Width","specs"]])
train_y = np.asanyarray(trainset[["Weight"]])

model.fit(train_x, train_y)

coeff = model.coef_
inter = model.intercept_
print("The coefficients are : ",coeff)
print("The intercept is : ",inter)

The coefficients are :  [[ 51.80878016  20.81614198 -47.27429137  49.84398779   0.58027062
   31.19817892]]
The intercept is :  [-600.49621267]


In [11]:
predicted_weight = model.predict(testset[["Length1","Length2","Length3","Height","Width","specs"]])

actual_y = np.asanyarray(testset[["Weight"]])
actual_x = np.asanyarray(testset[["Length1","Length2","Length3","Height","Width","specs"]])

print("The mean square error is : %.2f" %np.mean(predicted_weight - actual_y)**2)
print("The score is : %.2f" %model.score(actual_x,actual_y))

The mean square error is : 318.75
The score is : 0.90


In [13]:
length_1 = float(input("Vertical length : "))
length_2 = float(input("Diagonal length : "))
length_3 = float(input("Cross length : "))
height_ = float(input("Height : "))
width_ = float(input("Width : "))
species_ = str(input("Species : "))
        
actual_specs = le.transform([species_])
# Transforms the species class into its label

output = model.predict([[length_1, length_2, length_3, height_, width_, actual_specs]])
# predicts result. Parameters are entered in the same order as the model was trained.

print ("The predicted weight : ", output)

Vertical length :  23
Diagonal length :  12
Cross length :  23
Height :  12
Width :  23
Species :  Bream


The predicted weight :  [[365.06481114]]
