# Linear Regression pt. 1

In [26]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import linear_model
from sklearn.utils import shuffle
import os

In [27]:
# Prepare directories and paths
dir_here = os.path.abspath("")
dir_base = os.path.dirname(dir_here)
dir_data = os.path.join(dir_base, "data")
path_data_csv = os.path.join(dir_data, "student-mat.csv")

In [28]:
# Read data
data = pd.read_csv(path_data_csv, sep=";")
print(data.head())

  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0      4        3      4     1     1      3        6   5   6   6  
1      5        3      3     1     1      3        4   5   5   6  
2      4        3      2     2     3      3       10   7   8  10  
3      3        2      2     1     1      5        2  15  14  15  
4      4        3      2     1     2      5        4   6  10  10  

[5 rows x 33 columns]


In [29]:
# Filter the data that interests me
data = data[['G1', 'G2', 'G3', 'studytime', 'failures', 'absences']]
print(data.head())

   G1  G2  G3  studytime  failures  absences
0   5   6   6          2         0         6
1   5   5   6          2         0         4
2   7   8  10          2         3        10
3  15  14  15          3         0         2
4   6  10  10          2         0         4


## Some Nomenclature

***Atributes*** are the different variables, i.e. the fields or names of the columns in the dataframe.

Strictly speaking the variable to predict is not an attribute, and will have to be dropped from the dataframe.

We are going to predict the variable 'G3'. 

## Data preparation

### Select the variable to predict

In [30]:
predict = "G3"
x = np.array(data.drop([predict], 1))
y = np.array(data[predict])
print("x:")
print(x)
print("---")
print("y:")
print(y)

x:
[[ 5  6  2  0  6]
 [ 5  5  2  0  4]
 [ 7  8  2  3 10]
 ...
 [10  8  1  3  3]
 [11 12  1  0  0]
 [ 8  9  1  0  5]]
---
y:
[ 6  6 10 15 10 15 11  6 19 15  9 12 14 11 16 14 14 10  5 10 15 15 16 12
  8  8 11 15 11 11 12 17 16 12 15  6 18 15 11 13 11 12 18 11  9  6 11 20
 14  7 13 13 10 11 13 10 15 15  9 16 11 11  9  9 10 15 12  6  8 16 15 10
  5 14 11 10 10 11 10  5 12 11  6 15 10  8  6 14 10  7  8 18  6 10 14 10
 15 10 14  8  5 17 14  6 18 11  8 18 13 16 19 10 13 19  9 16 14 13  8 13
 15 15 13 13  8 12 11  9  0 18  0  0 12 11  0  0  0  0 12 15  0  9 11 13
  0 11  0 11  0 10  0 14 10  0 12  8 13 10 15 12  0  7  0 10  7 12 10 16
  0 14  0 16 10  0  9  9 11  6  9 11  8 12 17  8 12 11 11 15  9 10 13  9
  8 10 14 15 16 10 18 10 16 10 10  6 11  9  7 13 10  7  8 13 14  8 10 15
  4  8  8 10  6  0 17 13 14  7 15 12  9 12 14 11  9 13  6 10 13 12 11  0
 12 12  0 12  0 18 13  8  5 15  8 10  8  8 12  8 13 11 14  0 18  8 12  9
  0 17 10 11 10  0  9 14 11 14 10 12  9  9  8 10  8 10 12 10 11 11 19 12


In [31]:
print(x.shape)
print("---")
print(y.shape)

(395, 5)
---
(395,)


### Split the data between train and select batches

In [32]:
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.1)

In [33]:
print(x_train)
print("---")
print(y_train)

[[13 14  1  0  0]
 [ 7  8  2  3 10]
 [18 18  1  1 24]
 ...
 [ 7  0  2  0  0]
 [12 11  1  1 16]
 [ 5  6  2  0  6]]
---
[14 10 18 11 15  6 14 10 13 10 18  9 14  0  9 12  9 11 15 11  6 19 13  9
 10 13  8 12  9 10 12 18 12 15 15  8 13 10 15 10 16  0 12 10 10  6 11 10
 14 18 16 10  0 18 10  8  9 12 11 12 16 13  0  7 13  8 10 12 10 14 12 12
 10 15 10  0 11 17 17 15 15 10 12  9 16  0 10 11 11 12 11 15 15 14 15 14
 16 10 19 10  8  7  0 16  0 13 13 18 15 11 15 16 16  8 15  8 15  7 10 10
 15 12 15  0  0 10 11 19 13 16 10 15 10 10  6 15 12 14  6  8  6 14  0 18
 11  7 11 15  5 13 11  9 15 11  9 14 11 15  0  0 11  6  8 10 13 15  8  8
 10 10 13  0 17  0 12 10 12 13 17 11 10 13  9 14 11 12  9  8  8  7 13 14
  8 15  8 11  0 12  0  0 10  8 11 10 12 10 12  6 11 14  9  9 11 14 14 12
 10 13 11 10 15 11 11 15 12  0 11  5 13 10 13 13 11 14 14  7  0  0  9 10
  0  6  6 11 10 13  6 14 10  8 18 13 12 16  8  5 13 10 15 12 10  9 13 10
 11  0  0  5 10 11  0 15  9  9 14  0  8  7  8 11 10  0 18  8 11  0  9 13
 14 10

In [34]:
print(x_train.shape)
print("---")
print(y_train.shape)

(355, 5)
---
(355,)


### Create a linear regression model

In [35]:
# Create linear regression object
linear = sklearn.linear_model.LinearRegression()

In [36]:
# Fit linear regression to the train data
linear.fit(x_train, y_train) 
# The fit is stored in the linear object already created

LinearRegression()

In [37]:
# Check accuracy with the test data
acc = linear.score(x_test, y_test)
print(acc)

0.7958668509988149


We get a 79% accuracy.

For the case that's decent enough.

### Check the linear regression

In [40]:
# Check the coefficients
print(f"Coefficients: {linear.coef_}")
print(f"Intercept: {linear.intercept_}")

Coefficients: [ 0.1507054   0.97811518 -0.14668034 -0.27418046  0.0422737 ]
Intercept: -1.5683400317954241


In [50]:
# Check some predictions made by the model
predictions = linear.predict(x_test)
for k in range(len(predictions)):
    print(f"{y_test[k]:02d} ; {predictions[k]:06.3f} ; {x_test[k]}")

08 ; 08.386 ; [10  9  3  0  2]
11 ; 09.757 ; [ 7 10  2  1 25]
09 ; 08.471 ; [10  9  3  0  4]
00 ; 09.129 ; [ 9 10  3  0  0]
10 ; 08.632 ; [8 9 1 0 8]
08 ; 08.036 ; [ 6  9  1  2 14]
13 ; 14.664 ; [15 14  2  1 20]
09 ; 09.211 ; [10  9  3  1 28]
08 ; 10.665 ; [ 9  9  2  0 56]
15 ; 15.075 ; [16 15  3  0  0]
13 ; 12.747 ; [12 13  2  0  2]
10 ; 10.600 ; [11 11  4  0  8]
14 ; 12.666 ; [13 13  3  0  0]
18 ; 17.035 ; [17 17  4  0  0]
12 ; 11.645 ; [13 12  2  1  2]
06 ; 05.231 ; [7 6 2 0 4]
07 ; 07.400 ; [5 9 2 2 6]
10 ; 07.107 ; [8 8 3 0 2]
16 ; 15.751 ; [14 16  3  0  0]
08 ; 07.887 ; [6 9 1 1 4]
17 ; 16.284 ; [16 16  2  0  2]
04 ; 05.293 ; [ 6  6  2  2 22]
11 ; 11.063 ; [11 11  2  0 12]
08 ; 07.953 ; [7 9 1 1 2]
19 ; 18.156 ; [16 18  2  0  0]
10 ; 09.511 ; [10 10  2  0  2]
14 ; 12.788 ; [12 13  1  1  6]
08 ; 06.249 ; [ 6  7  2  1 15]
11 ; 09.511 ; [10 10  2  0  2]
08 ; 07.893 ; [10  8  2  0 10]
10 ; 10.145 ; [10 10  2  0 17]
09 ; 09.360 ; [ 9 10  2  0  2]
17 ; 17.178 ; [16 17  2  0  0]
12 ; 11