Imports

In [1]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from scipy import stats
from sklearn import linear_model

import copy

Data Reading

In [2]:
# Load the cereal table and keep only the rows whose four nutrient columns
# are all non-negative. The original row labels are kept (no index reset).
df = pd.read_csv("cereal.csv").loc[
    lambda frame: (
        (frame["protein"] >= 0)
        & (frame["fat"] >= 0)
        & (frame["sugars"] >= 0)
        & (frame["carbo"] >= 0)
    )
]

Step One

In [3]:
# Nutrient columns used as regression inputs by the later cells.
features = [
    "protein",
    "fat",
    "sugars",
    "carbo",
]

# Show just those columns as the cell output.
df.loc[:, features]

Unnamed: 0,protein,fat,sugars,carbo
0,4,1,6,5.0
1,3,5,8,8.0
2,4,1,5,7.0
3,4,0,0,8.0
4,2,2,8,14.0
...,...,...,...,...
72,2,1,3,21.0
73,1,1,12,13.0
74,3,1,3,17.0
75,3,1,3,17.0


Visualization

In [4]:
# One scatter panel per nutrient: raw value plotted against row position.
# Each entry is (column, legend label, subplot row, subplot column).
panels = [
    ("protein", "Protein", 1, 1),
    ("fat", "Fat", 2, 1),
    ("sugars", "Sugars", 1, 2),
    ("carbo", "Carbohydrates", 2, 2),
]

fig = make_subplots(rows=2, cols=2)
for column, label, grid_row, grid_col in panels:
    values = df[column]
    positions = list(range(len(values)))
    fig.add_trace(
        go.Scatter(x=positions, y=values, mode="markers", name=label),
        row=grid_row,
        col=grid_col,
    )
fig.show()

Normalization

In [5]:
def normalize(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    The minimum and maximum are taken over the whole input (no axis), so when
    several columns are passed together they all share one common range.

    Parameters
    ----------
    x : array-like
        Values to rescale, e.g. a pandas Series, a NumPy array, or a list of
        them.

    Returns
    -------
    Same kind of object that ``x - np.min(x)`` produces (a Series stays a
    Series, a list becomes an ndarray), holding ``(x - min) / (max - min)``.
    If every value is identical the range is zero; zeros are returned instead
    of dividing by zero.
    """
    lowest = np.min(x)
    highest = np.max(x)
    span = highest - lowest
    shifted = x - lowest
    if span == 0:
        # Constant input: every shifted value is already 0, so return it as
        # floats rather than producing NaN from 0/0.
        return shifted.astype(float)
    return shifted / span

Visualization

In [6]:
# Same four panels as the raw scatter, but each nutrient is passed through
# normalize() on its own before plotting.
# Each entry is (column, legend label, subplot row, subplot column).
panels = [
    ("protein", "Protein", 1, 1),
    ("fat", "Fat", 2, 1),
    ("sugars", "Sugars", 1, 2),
    ("carbo", "Carbohydrates", 2, 2),
]

fig = make_subplots(rows=2, cols=2)
for column, label, grid_row, grid_col in panels:
    scaled = normalize(df[column])
    positions = list(range(len(df[column])))
    fig.add_trace(
        go.Scatter(x=positions, y=scaled, mode="markers", name=label),
        row=grid_row,
        col=grid_col,
    )
fig.show()

Step Two

In [15]:
# Batch gradient descent for linear regression on the raw (unscaled) features.

# Design matrix: a leading column of ones (intercept term) followed by the
# feature columns.
realx = df[features]
x = np.c_[np.ones(len(realx)), realx]

y = df["rating"]

# Starting weights, one per design-matrix column (intercept first).
w = np.array([0, 4, -5, -2, -1], dtype=float)

lr = 0.0065

# Number of weight updates. The previous stopping rule ("break once the
# counter exceeds 10000", checked after the update) performed 10001 updates;
# that count is kept so the results are unchanged.
n_steps = 10001

# Iteration numbers and the squared-error cost recorded at each of them.
costx = []
costy = []

for c in range(1, n_steps + 1):
    pred = np.dot(x, w)
    diff = pred - y
    # Gradient averaged over the rows; the recorded cost is the plain sum of
    # squared residuals, measured before this step's update.
    grad = np.dot(diff, x) / len(x)
    cost = np.sum(diff**2)

    w = w - lr * grad
    costx.append(c)
    costy.append(cost)

In [16]:
# Cost curve: every 1000th recorded cost against its iteration number.
cost_trace = go.Scatter(
    x=costx[::1000],
    y=costy[::1000],
    mode="markers+lines",
    name="Real",
)
fig = make_subplots(rows=1, cols=1)
fig.add_trace(cost_trace, row=1, col=1)
fig.show()

In [17]:
# Display the weight vector left by the gradient-descent loop in the previous
# cell (its first entry pairs with the column of ones in the design matrix).
w

array([54.32188274,  5.76725335, -4.9532161 , -1.81727836, -0.60269572])

Step Three

Bad (zero initial weights, learning rate 0.05, stopped after about 100 iterations)

In [66]:
# Batch gradient descent on min-max scaled data, started from all-zero weights.

# The four feature columns are handed to normalize() together, so they are
# scaled with one shared minimum and maximum. The result has one row per
# feature.
realx = normalize([df["protein"], df["fat"], df["sugars"], df["carbo"]])
# Put a row of ones (intercept term) on top, then transpose so that each row
# of x is one cereal.
x = np.transpose(np.concatenate(([np.ones(len(realx[0]))], realx)))

y = normalize(df["rating"])
w = np.zeros(5)

lr = 0.05

# Number of weight updates. The previous stopping rule ("break once the
# counter exceeds 100", checked after the update) performed 101 updates;
# that count is kept so the results are unchanged.
n_steps = 101

# Iteration numbers and the squared-error cost recorded at each of them.
costx = []
costy = []

for c in range(1, n_steps + 1):
    pred = np.dot(x, w)
    diff = pred - y
    # Gradient averaged over the rows; the recorded cost is the plain sum of
    # squared residuals, measured before this step's update.
    grad = np.dot(diff, x) / len(x)
    cost = np.sum(diff**2)

    w = w - lr * grad
    costx.append(c)
    costy.append(cost)

In [67]:
# Cost curve: every recorded cost against its iteration number.
cost_trace = go.Scatter(
    x=list(costx),
    y=list(costy),
    mode="lines",
    name="Real",
)
fig = make_subplots(rows=1, cols=1)
fig.add_trace(cost_trace, row=1, col=1)
fig.show()

In [68]:
# Undo the min-max scaling of the target so predictions are on the original
# rating scale.
rating = df["rating"]
rating_low, rating_high = min(rating), max(rating)
pred = np.dot(x, w)
repred = pred * (rating_high - rating_low) + rating_low

# Predicted and real ratings, both plotted against row position.
positions = list(range(len(repred)))
fig = make_subplots(rows=1, cols=1)
fig.add_trace(
    go.Scatter(x=positions, y=repred, mode="markers", name="Predicted"),
    row=1,
    col=1,
)
fig.add_trace(
    go.Scatter(x=positions, y=rating, mode="markers", name="Real"),
    row=1,
    col=1,
)
fig.show()

# Mean absolute error on the original rating scale (shown as the cell output).
sum(abs(repred - rating)) / len(repred)

10.18773333375823

In [69]:
# Parity plot: predicted rating against real rating, one marker per cereal.
fig = make_subplots(rows=1, cols=1)
fig.add_trace(
    go.Scatter(x=df["rating"], y=repred, mode="markers", name="Cereal"),
    row=1,
    col=1,
)

# Reference line where predicted == real. The endpoints get their own name
# instead of being assigned to `x` and `y`, which hold the design matrix and
# target used by the surrounding cells.
equality_ends = [20, 80]
fig.add_trace(
    go.Scatter(x=equality_ends, y=equality_ends, mode="lines", name="Equality"),
    row=1,
    col=1,
)
fig.update_xaxes(title="Real")
fig.update_yaxes(title="Predicted")
fig.show()

Good (non-zero initial weights, learning rate 0.9, about 50,000 iterations)

In [46]:
# Batch gradient descent on min-max scaled data, started from hand-picked
# non-zero weights and run with a larger learning rate.

# The four feature columns are handed to normalize() together, so they are
# scaled with one shared minimum and maximum. The result has one row per
# feature.
realx = normalize([df["protein"], df["fat"], df["sugars"], df["carbo"]])
# Put a row of ones (intercept term) on top, then transpose so that each row
# of x is one cereal.
x = np.transpose(np.concatenate(([np.ones(len(realx[0]))], realx)))

y = normalize(df["rating"])
w = np.array([0.5, 1.25, -1.5, -0.5, -0.25])

lr = 0.9

# Number of weight updates. The previous stopping rule ("break once the
# counter exceeds 50000", checked after the update) performed 50001 updates;
# that count is kept so the results are unchanged.
n_steps = 50001

# Iteration numbers and the squared-error cost recorded at each of them.
costx = []
costy = []

for c in range(1, n_steps + 1):
    pred = np.dot(x, w)
    diff = pred - y
    # Gradient averaged over the rows; the recorded cost is the plain sum of
    # squared residuals, measured before this step's update.
    grad = np.dot(diff, x) / len(x)
    cost = np.sum(diff**2)

    w = w - lr * grad
    costx.append(c)
    costy.append(cost)

In [50]:
# Cost curve: every 1000th recorded cost against its iteration number.
cost_trace = go.Scatter(
    x=costx[::1000],
    y=costy[::1000],
    mode="lines",
    name="Real",
)
fig = make_subplots(rows=1, cols=1)
fig.add_trace(cost_trace, row=1, col=1)
fig.show()

In [48]:
# Print the fitted weights, then the per-row residuals (prediction minus the
# normalized rating).
residuals = np.dot(x, w) - normalize(df["rating"])
print(w)
print(residuals)

[ 0.72209956  1.27770722 -1.56060255 -0.71358168 -0.39132437]
0    -0.060358
1    -0.045504
2     0.055291
3    -0.191803
4    -0.004886
        ...   
72    0.036593
73   -0.012027
74    0.019031
75   -0.004821
76    0.005112
Name: rating, Length: 76, dtype: float64


In [52]:
# Undo the min-max scaling of the target so predictions are on the original
# rating scale.
rating = df["rating"]
rating_low, rating_high = min(rating), max(rating)
pred = np.dot(x, w)
repred = pred * (rating_high - rating_low) + rating_low

# Predicted and real ratings, both plotted against row position.
positions = list(range(len(repred)))
fig = make_subplots(rows=1, cols=1)
fig.add_trace(
    go.Scatter(x=positions, y=repred, mode="markers+lines", name="Predicted"),
    row=1,
    col=1,
)
fig.add_trace(
    go.Scatter(x=positions, y=rating, mode="markers+lines", name="Real"),
    row=1,
    col=1,
)
fig.show()

# Mean absolute error on the original rating scale (shown as the cell output).
sum(abs(repred - rating)) / len(repred)

4.54783572649332

In [62]:
# Parity plot: predicted rating against real rating, one marker per cereal.
fig = make_subplots(rows=1, cols=1)
fig.add_trace(
    go.Scatter(x=df["rating"], y=repred, mode="markers", name="Cereal"),
    row=1,
    col=1,
)

# Reference line where predicted == real. The endpoints get their own name
# instead of being assigned to `x` and `y`, which hold the design matrix and
# target used by the surrounding cells.
equality_ends = [20, 80]
fig.add_trace(
    go.Scatter(x=equality_ends, y=equality_ends, mode="lines", name="Equality"),
    row=1,
    col=1,
)
fig.update_xaxes(title="Real")
fig.update_yaxes(title="Predicted")
fig.show()

In [55]:
# Cross-check against scikit-learn: fit ordinary least squares on the same
# normalized features and normalized rating.
scaled_features = np.transpose(
    np.asarray(normalize([df["protein"], df["fat"], df["sugars"], df["carbo"]]))
)
# One single-element row per cereal, i.e. a column of targets.
scaled_target = [[value] for value in normalize(df["rating"])]

clf = linear_model.LinearRegression()
clf.fit(scaled_features, scaled_target)

# Fitted slopes (intercept not included), shown as the cell output.
coefficients = list(clf.coef_)
coefficients

[array([ 1.27770722, -1.56060255, -0.71358168, -0.39132437])]

In [56]:
# Same scikit-learn fit, this time on the raw (unscaled) features and rating.
raw_features = np.transpose(
    np.asarray([df["protein"], df["fat"], df["sugars"], df["carbo"]])
)
# One single-element row per cereal, i.e. a column of targets.
raw_target = [[value] for value in df["rating"]]

clf = linear_model.LinearRegression()
clf.fit(raw_features, raw_target)

# Fitted slopes (intercept not included), shown as the cell output.
coefficients = list(clf.coef_)
coefficients

[array([ 4.20321574, -5.13384372, -2.34743742, -1.28732212])]