In [36]:
import pandas as pd

# Load the fish measurements from the CSV file in the working directory.
data = pd.read_csv("Fish.csv")

# Preview the first five rows to check that the file parsed as expected.
data.head(n=5)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


Let's rename the length columns.

In [37]:
# Give the three length measurements more descriptive names. Rebinding `data`
# to the renamed frame keeps the cell free of in-place mutation.
data = data.rename(
    columns={
        "Length1": "Vlength",
        "Length2": "Dlength",
        "Length3": "Clength",
    }
)
data.head()

Unnamed: 0,Species,Weight,Vlength,Dlength,Clength,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [38]:
# Dimensions of the frame as a (rows, columns) tuple.
data.shape

(159, 7)

In [39]:
# Report how many missing entries each column contains. The counts are
# computed once for the whole frame and then printed column by column.
for name, n_missing in data.isnull().sum().items():
    print(f'Column {name} has {n_missing} missing values')

Column Species has 0 missing values
Column Weight has 0 missing values
Column Vlength has 0 missing values
Column Dlength has 0 missing values
Column Clength has 0 missing values
Column Height has 0 missing values
Column Width has 0 missing values


There are no missing values in any of the dataset's columns. Let's also check the type of each column.

In [40]:
# Print the dtype pandas inferred for every column.
for name, dtype in data.dtypes.items():
    print(f'Column {name} type is {dtype}')

Column Species type is object
Column Weight type is float64
Column Vlength type is float64
Column Dlength type is float64
Column Clength type is float64
Column Height type is float64
Column Width type is float64


Number of available species and number of individuals in each one

In [41]:
# Count how many rows (individual fish) belong to each species.
data['Species'].value_counts()

Perch        56
Bream        35
Roach        20
Pike         17
Smelt        14
Parkki       11
Whitefish     6
Name: Species, dtype: int64

Descriptive statistics for each of the available numerical features:

In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Weight,Vlength,Dlength,Clength,Height,Width
count,159.0,159.0,159.0,159.0,159.0,159.0
mean,398.326415,26.24717,28.415723,31.227044,8.970994,4.417486
std,357.978317,9.996441,10.716328,11.610246,4.286208,1.685804
min,0.0,7.5,8.4,8.8,1.7284,1.0476
25%,120.0,19.05,21.0,23.15,5.9448,3.38565
50%,273.0,25.2,27.3,29.4,7.786,4.2485
75%,650.0,32.7,35.5,39.65,12.3659,5.5845
max,1650.0,59.0,63.4,68.0,18.957,8.142


As the table above shows, the minimum weight in the dataset is 0, which is clearly implausible for a real fish. Let's plot weight against height and check what we have.

In [43]:
# Drop the categorical Species column so that only numeric columns remain.
# errors="ignore" makes the cell safe to re-run: once Species is gone, a plain
# drop would raise KeyError on the second execution.
data = data.drop(columns="Species", errors="ignore")

# NOTE(review): the describe() output shows a minimum Weight of 0.0 and no cell
# removes that row, so it is still part of the modelling data -- confirm
# whether it should be filtered out before fitting.

# Target: the fish weight. Features: every remaining column except Weight.
y = data["Weight"]
X = data.drop(columns="Weight")

In [44]:

# Preview the feature matrix; Weight (the target) is no longer among its columns.
X.head()

Unnamed: 0,Vlength,Dlength,Clength,Height,Width
0,23.2,25.4,30.0,11.52,4.02
1,24.0,26.3,31.2,12.48,4.3056
2,23.9,26.5,31.1,12.3778,4.6961
3,26.3,29.0,33.5,12.73,4.4555
4,26.5,29.0,34.0,12.444,5.134


Now split the dataset into training and test sets: X_train, X_test, y_train and y_test.

In [45]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The rows are shuffled before the
# split, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=77,
    shuffle=True,
)

In [46]:
# List the feature columns present in the held-out test set.
X_test.columns

Index(['Vlength', 'Dlength', 'Clength', 'Height', 'Width'], dtype='object')

### Modelling

Now we fit a linear regression model to the data.

In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression on the raw training features. Normalizing the
# dataset was tried as well, but it did not improve the results much.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out weights and report the R^2 score on the test set.
y_pred = model.predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

0.8512070790294621


In [48]:
# Sanity check: predict the weight for just the first row of the test set.
# The one-row slice keeps the 2-D frame shape that predict() is given above.
model.predict(X_test.iloc[:1])

array([1019.85392663])

In [49]:
import pickle

# Persist the fitted model to disk so it can be loaded later without retraining.
filename = 'fishdataset_model.pkl'

# The context manager guarantees the file handle is flushed and closed even if
# pickling fails; the bare open() used before left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)