# Lab-02: Linear Regression

#### Regression
What is regression?
<span style="color:blue">Regression searches for relationships among variables. Generally, in regression analysis, we usually consider some phenomenon of interest and have a number of observations. Following the assumption that (at least) one of the features depends on the others, we try to establish a relation among them.</span>
</br>
When Do We Need Regression?
<span style="color:blue">Typically, we need regression to answer whether and how some phenomenon influences the other or how several variables are related.</span>

In [1]:
from IPython.display import clear_output
!pip3 install ../Libs/plotting_funcs-0.0.1-py3-none-any.whl  --force-reinstall
clear_output()

In [2]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotting_funcs.graphs import show, list_plot, line_plot

In [3]:
import torch
from tqdm import tqdm
from torch import nn

In [4]:
data = pd.read_csv("data/student_scores.csv")
data.head()

Unnamed: 0,Hours,Scores
0,2.5,21
1,5.1,47
2,3.2,27
3,8.5,75
4,3.5,30


In [5]:
def student_plot(x_data, y_data):
    return go.Scatter(y=y_data, x=x_data, mode="markers", marker=dict(size=7))

In [None]:
# show(
#     [student_plot(y_data=data["Scores"].to_numpy(), x_data=data["Hours"].to_numpy())]
#     , x_axis=dict(title="hours"), y_axis=dict(title="scores")
# )

In [7]:
train = data.sample(frac=0.8)
test = data.drop(train.index)


def series_to_tensor(data_: pd.Series, with_grad=True):
    return torch.Tensor(data_.to_numpy()).unsqueeze(1)


x_train, y_train = series_to_tensor(train["Hours"]), series_to_tensor(train["Scores"])
x_test, y_test = series_to_tensor(test["Hours"]), series_to_tensor(test["Scores"])

### Linear Regression

In [29]:
class TorchLinearRegression(nn.Module):
    def __init__(self):
        super(TorchLinearRegression, self).__init__()
        self.node = nn.Linear(1, 1)  # Could have just created a simple parameter instead

    def forward(self, input_):
        return self.node(input_)

    def fit(self, input_, output_, optimizer=None, loss_func=nn.MSELoss(), n=1_000):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001) if optimizer is None else optimizer
        losses_ = []

        with tqdm(total=n, postfix={"loss": torch.inf}) as tqdm_control:
            for i in range(n):
                loss_ = loss_func(self(input_), output_)
                loss_.backward()
                optimizer.step()
                optimizer.zero_grad()
                losses_.append(loss_)

                # Just printing learning progress
                if i % 10 == 0:
                    tqdm_control.update(i)
                    tqdm_control.set_postfix({"loss": loss_})
        return losses_

In [30]:
regressor = TorchLinearRegression()
losses = regressor.fit(x_train, y_train, n=20_000)

19990000it [00:03, 5729568.74it/s, loss=tensor(28.3042, grad_fn=<MseLossBackward0>)]                         


In [21]:
with torch.no_grad():
    for name, param in regressor.named_parameters():
        print(f"{name} = {param.data.numpy().round(3)}")

node.weight = [[10.055]]
node.bias = [2.446]


In [22]:
with torch.no_grad():
    show([
        student_plot(x_data=x_train.numpy().flatten(), y_data=y_train.numpy().flatten()),
        line_plot(x_data=x_train.numpy().flatten(), y_data=regressor.forward(x_train).numpy().flatten())
    ], x_axis=dict(title="hours"), y_axis=dict(title="scores"))

### Multiple Linear Regression

In [35]:
petrol_data = pd.read_csv("data/petrol_consumption.csv")
petrol_data.head()

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410
