# Multiple linear regression

- the multiple linear regression looks like: y = b0 + b1*x1 + b2*x2 ....
- In multiple linear regression (ordinary least squares) there is no need for feature scaling: the fitted coefficients compensate for the scale of each feature.
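- Backward elimination is not applied in this notebook. Note that scikit-learn's `LinearRegression` does not select features automatically: it fits one coefficient for every feature it is given. If a smaller model is wanted, feature selection has to be done explicitly (for example with the tools in `sklearn.feature_selection`).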
- Assumptions of linear regression : 
    1) linearity
    2) Homoscedasticity
    3) Multivariate normality (normality of the error distribution)
    4) Independence
    5) Lack of multicollinearity (predictors are not correlated with each other)
    6) The outlier check: depending on the data include or exclude the outliers. 


In [1]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the two student-performance tables.
# Both files are semicolon-separated and carry their column names in the first row.
math, por = (
    pd.read_csv(csv_path, sep=';', header=0)
    for csv_path in ("./student-mat.csv", "./student-por.csv")
)

In [3]:
# Inspect both tables, then change the data types: object -> category.


def objects_to_category(frame):
    """Return a copy of `frame` with every object-dtype column cast to 'category'.

    Columns of any other dtype are left untouched, and the frame passed in is
    not modified.
    """
    converted = frame.copy()
    for name in converted.columns:
        if converted[name].dtype == 'object':
            converted[name] = converted[name].astype('category')
    return converted


# DataFrame.info() prints its report itself and returns None, so it is called
# bare here: wrapping it in print() adds a stray "None" line after each report.
math.info()
por.info()

# Math table: show the dtypes before and after the conversion.
print("Before conversion:\n", math.dtypes)
math = objects_to_category(math)
print("After conversion:\n", math.dtypes)

# Portuguese table: same conversion, same before/after check.
print("Before conversion:\n", por.dtypes)
por = objects_to_category(por)
print("After conversion:\n", por.dtypes)

# TODO: the original cell ended with a note to convert additional columns to
# 'category' but never listed them -- decide which columns (if any) are meant.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [4]:
# Feature matrix: every row, every column except the last one.
X = math.iloc[:, :-1].to_numpy()
# Target vector: every row, last column only.
y = math.iloc[:, -1].to_numpy()

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical from run to run.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [6]:
# Encode the categorical (string) feature columns as integer codes.

from sklearn.preprocessing import OrdinalEncoder

# Assumes X_train and X_test are 2-D NumPy arrays that can hold mixed values
# (strings next to numbers), as produced by the cells above.
# A column counts as categorical when any of its training values is a string;
# scanning the whole column avoids relying on the first row alone.
categorical_columns = [
    i for i in range(X_train.shape[1])
    if any(isinstance(value, str) for value in X_train[:, i])
]

# OrdinalEncoder is the scikit-learn encoder meant for feature columns
# (LabelEncoder is documented for target values only). It numbers the sorted
# categories of each column 0..n-1 and encodes all columns in one call.
# With the default settings, a category that appears only in X_test raises a
# ValueError instead of being mapped silently.
feature_encoder = OrdinalEncoder()

# Guard: fitting on zero columns is an error, and there is nothing to encode.
if categorical_columns:
    # Fit on the training split only, then reuse the same mapping for the test
    # split so no information from the test rows leaks into the encoding.
    X_train[:, categorical_columns] = feature_encoder.fit_transform(X_train[:, categorical_columns])
    X_test[:, categorical_columns] = feature_encoder.transform(X_test[:, categorical_columns])

# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which a linear model treats as a numeric scale. One-hot encoding
# (sklearn.preprocessing.OneHotEncoder) is usually the better fit for nominal
# columns with more than two levels -- consider switching; it changes the
# number of features and therefore the coefficients reported below.


In [7]:
# Train the multiple linear regression model on the training split.

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so creation and training can be chained.
# The model estimates one coefficient per column of X_train plus an intercept;
# it does not pick or drop features on its own.
regressor = LinearRegression().fit(X_train, y_train)
regressor  # last expression of the cell: displays the fitted estimator

# With this many independent variables (X) there is no simple 2-D plot against
# the dependent variable (y), so the predictions are displayed as a table instead.

In [8]:
# Predict the test set and print the predictions next to the true values.

y_pred = regressor.predict(X_test)

# Show numeric values with 2 digits after the decimal point. This is a global
# NumPy print setting, so it also affects arrays printed by later cells.
np.set_printoptions(precision=2)

# Place the two 1-D vectors side by side as columns of one table:
# column 0 = predicted value, column 1 = actual value, one row per test sample.
print(np.column_stack((y_pred, y_test)))

[[ 5.42  0.  ]
 [11.1  10.  ]
 [ 4.92  8.  ]
 [ 8.55 10.  ]
 [ 9.88 10.  ]
 [10.48 12.  ]
 [ 6.57  8.  ]
 [12.7  11.  ]
 [10.59 11.  ]
 [ 7.18  8.  ]
 [12.19 11.  ]
 [13.75 13.  ]
 [ 5.96  8.  ]
 [ 8.7   7.  ]
 [ 7.93 10.  ]
 [11.28 10.  ]
 [16.31 15.  ]
 [16.59 16.  ]
 [ 6.07  8.  ]
 [ 8.9  10.  ]
 [10.53 11.  ]
 [18.15 19.  ]
 [ 9.74 10.  ]
 [11.78 11.  ]
 [15.41 16.  ]
 [ 5.56  5.  ]
 [12.13 11.  ]
 [ 7.05  9.  ]
 [ 9.3  10.  ]
 [12.31 13.  ]
 [11.35 12.  ]
 [12.19 11.  ]
 [ 5.35 10.  ]
 [10.38 11.  ]
 [12.55 14.  ]
 [ 8.79  8.  ]
 [10.62 10.  ]
 [ 7.9  10.  ]
 [ 7.52  7.  ]
 [10.65 10.  ]
 [ 9.47  9.  ]
 [11.09  9.  ]
 [16.19 15.  ]
 [ 4.87  9.  ]
 [10.1  11.  ]
 [11.87 13.  ]
 [19.16 18.  ]
 [ 8.67  9.  ]
 [13.06 14.  ]
 [ 8.13 10.  ]
 [ 4.93  5.  ]
 [12.48 10.  ]
 [ 9.47 11.  ]
 [ 5.3   6.  ]
 [ 7.22  6.  ]
 [10.91 11.  ]
 [ 6.53  6.  ]
 [15.48 15.  ]
 [ 9.54 10.  ]
 [ 3.47  0.  ]
 [ 5.57  0.  ]
 [ 8.    0.  ]
 [10.37 13.  ]
 [15.07 17.  ]
 [14.09 13.  ]
 [ 9.32  8.  ]
 [10.35 10

In [10]:
# Show the fitted parameters: first the coefficients (one per feature column),
# then the intercept, each on its own line.

print(regressor.coef_, regressor.intercept_, sep="\n")

[ 7.13e-01  1.32e-01 -3.19e-01 -1.66e-01  1.61e-01 -8.86e-02  1.70e-01
 -7.12e-02 -4.75e-02 -4.85e-02  7.23e-02  6.42e-02  6.85e-02 -7.83e-02
 -1.09e-01  3.97e-02  1.31e-02 -4.45e-04 -4.62e-01 -2.89e-01  1.01e+00
 -1.95e-01 -2.46e-01  3.76e-01  8.31e-02  1.63e-01 -1.61e-01  7.38e-02
  1.24e-01  4.46e-02  1.68e-01  9.60e-01]
0.3682319712749962


In [12]:
# Evaluate the model on the held-out test set with the coefficient of
# determination (R^2): 1.0 is a perfect fit, 0.0 is no better than always
# predicting the mean of y_test.

from sklearn.metrics import r2_score

r2_score(y_true=y_test, y_pred=y_pred)




0.736627349028625

- The model performed okay-ish on the math class data, with an R² of about 0.74 on the test set.
