In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [2]:
# Load the cereal data set; the path is resolved relative to the current
# working directory.
df = pd.read_csv('./cereal.csv')

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [4]:
# Expand the abbreviated column names into more readable ones.
# NOTE(review): 'carbo' presumably abbreviates "carbohydrates", so 'carbon'
# may be a misnomer -- confirm. The name is kept because later cells access
# the column as `df.carbon`.
df = df.rename(columns={'carbo':'carbon','sugars':'sugar','potass':'potassium'})

In [5]:
# Confirm the renamed columns by showing the first three rows.
df.head(3)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbon,sugar,potassium,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505


In [6]:
# Summary statistics of the numeric columns; useful for spotting anomalous
# values such as negative minima.
df.describe()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbon,sugar,potassium,vitamins,shelf,weight,cups,rating
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,106.883117,2.545455,1.012987,159.675325,2.151948,14.597403,6.922078,96.077922,28.246753,2.207792,1.02961,0.821039,42.665705
std,19.484119,1.09479,1.006473,83.832295,2.383364,4.278956,4.444885,71.286813,22.342523,0.832524,0.150477,0.232716,14.047289
min,50.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.0,1.0,0.5,0.25,18.042851
25%,100.0,2.0,0.0,130.0,1.0,12.0,3.0,40.0,25.0,1.0,1.0,0.67,33.174094
50%,110.0,3.0,1.0,180.0,2.0,14.0,7.0,90.0,25.0,2.0,1.0,0.75,40.400208
75%,110.0,3.0,2.0,210.0,3.0,17.0,11.0,120.0,25.0,3.0,1.0,1.0,50.828392
max,160.0,6.0,5.0,320.0,14.0,23.0,15.0,330.0,100.0,3.0,1.5,1.5,93.704912


In [7]:
# Count the -1 entries in the carbon column (presumably a missing-value
# marker -- confirm against the data source).
# A vectorised comparison replaces the manual Python loop; int() keeps the
# displayed result a plain Python integer.
count = int((df.carbon == -1).sum())
count

1

In [8]:
# Count the -1 entries in the sugar column (presumably a missing-value
# marker -- confirm against the data source).
# A vectorised comparison replaces the manual Python loop; int() keeps the
# displayed result a plain Python integer.
count = int((df.sugar == -1).sum())
count

1

In [9]:
# Count the -1 entries in the potassium column (presumably a missing-value
# marker -- confirm against the data source).
# A vectorised comparison replaces the manual Python loop; int() keeps the
# displayed result a plain Python integer.
count = int((df.potassium == -1).sum())
count

2

In [10]:
# Only a handful of rows contain -1, so those rows are dropped rather than
# imputed. Locate them via the row holding each column's minimum.
# idxmin() returns the index *label* of the minimum, which is what the
# label-based DataFrame.drop() calls that follow expect. argmin() returns a
# position in current pandas and only coincides with the label while the
# index is still the default 0..n-1 range.
print(df.sugar.idxmin())
print(df.potassium.idxmin())
print(df.carbon.idxmin())

57
4
57


In [11]:
# Remove the two rows (index labels 57 and 4) reported above as holding
# the column minima.
df = df.drop(index=[57, 4])

In [12]:
# Find the remaining row with the smallest potassium value.
# Rows were already dropped, so positions no longer match index labels:
# idxmin() yields the label that the following label-based drop needs,
# whereas argmin() would yield a position in current pandas.
print(df.potassium.idxmin())

20


In [13]:
# Remove the row with index label 20, reported above as the remaining
# potassium minimum.
df = df.drop(index=[20])

In [14]:
# Re-check the summary statistics after dropping rows, e.g. to confirm the
# column minima are no longer -1.
df.describe()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbon,sugar,potassium,vitamins,shelf,weight,cups,rating
count,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0
mean,107.027027,2.513514,1.0,162.364865,2.175676,14.72973,7.108108,98.513514,29.054054,2.216216,1.030811,0.821622,42.371787
std,19.843893,1.075802,1.006826,82.769787,2.423391,3.891675,4.359111,70.878681,22.294352,0.832067,0.153416,0.235715,14.033712
min,50.0,1.0,0.0,0.0,0.0,5.0,0.0,15.0,0.0,1.0,0.5,0.25,18.042851
25%,100.0,2.0,0.0,135.0,0.25,12.0,3.0,41.25,25.0,1.25,1.0,0.67,32.44921
50%,110.0,2.5,1.0,180.0,2.0,14.5,7.0,90.0,25.0,2.0,1.0,0.75,40.253087
75%,110.0,3.0,1.0,217.5,3.0,17.0,11.0,120.0,25.0,3.0,1.0,1.0,50.52061
max,160.0,6.0,5.0,320.0,14.0,23.0,15.0,330.0,100.0,3.0,1.5,1.5,93.704912


In [15]:
# Show the first three rows of the cleaned frame.
df.head(3)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbon,sugar,potassium,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505


In [16]:
# Number of distinct cereal names.
df['name'].unique().size

74

In [17]:
# Number of rows, for comparison with the number of distinct names.
df.shape[0]

74

In [18]:
# Discard the 'name' column.
df = df.drop(columns=['name'])

In [19]:
# Number of distinct values in the 'mfr' column.
df['mfr'].unique().size

7

In [20]:
# Show the first three rows after removing the 'name' column.
df.head(3)

Unnamed: 0,mfr,type,calories,protein,fat,sodium,fiber,carbon,sugar,potassium,vitamins,shelf,weight,cups,rating
0,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505


In [21]:
# Replace the string categories in 'mfr' and 'type' with integer codes
# from 0 to n-1.
# The same LabelEncoder object is refit for each column, so after this cell
# `encode` only remembers the classes of the last column ('type').
encode = LabelEncoder()
for column in ('mfr', 'type'):
    df[column] = encode.fit_transform(df[column])

In [22]:
# Show the first two rows to confirm the categorical columns are now numeric.
df.head(2)

Unnamed: 0,mfr,type,calories,protein,fat,sodium,fiber,carbon,sugar,potassium,vitamins,shelf,weight,cups,rating
0,3,0,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,5,0,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679


In [23]:
# Print the correlation of every feature column with the target column
# 'rating'. `important_data` collects the names of all feature columns
# (every column except 'rating'); no filtering by correlation is applied.
important_data = [col for col in df if col != 'rating']
for col in important_data:
    print(col, ' ~ ', df[col].corr(df['rating']))

mfr  ~  0.14994675771689805
type  ~  0.1047863556468138
calories  ~  -0.6937846629537247
protein  ~  0.4671621803138431
fat  ~  -0.4050501988243349
sodium  ~  -0.38301235807087747
fiber  ~  0.6034108974552956
carbon  ~  0.05594128529048922
sugar  ~  -0.7559550888889875
potassium  ~  0.41578244343009124
vitamins  ~  -0.2144809450895495
shelf  ~  0.05103974978110971
weight  ~  -0.3004610402217782
cups  ~  -0.22250439541244443


In [24]:
# Feature matrix: every column except the target 'rating'.
input_data = df.drop(columns=['rating'])

In [25]:
# Target vector: the 'rating' column.
output_data = df.rating

In [26]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    input_data,
    output_data,
    test_size=0.20,
    random_state=123,
)

In [27]:
# Number of rows in the training split.
X_train.shape[0]

59

In [28]:
# Number of rows in the test split.
y_test.shape[0]

15

In [29]:
# Fit a plain linear regression that predicts 'rating' on the training
# split, then evaluate it on the held-out split.
# For regressors `score` returns the coefficient of determination (R^2),
# not a classification accuracy; the variable name is kept as-is.
# NOTE(review): the recorded score of ~1.0 suggests 'rating' is (nearly) an
# exact linear function of the features -- confirm this is expected.
model = linear_model.LinearRegression().fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
print(accuracy)

0.999999999999999


