<a href="https://colab.research.google.com/github/chrisogonas/coo_dataviz/blob/main/dataviz_linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiple Linear Regression - Band Size Selection

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

## Importing the dataset

In [2]:
# Read the merged participant selection table from the Colab sample-data folder.
dataset = pd.read_csv('/content/sample_data/ParticipantFinalSelectionMerged.csv')

# Features: every column except the first (subject number) and the last two.
features = dataset.iloc[:, 1:-2]
# Target: the second-to-last column, the Selected Band Size used as ground truth.
target = dataset.iloc[:, -2]

X, y = features.values, target.values
print(X)


[['Male' 'White' '30-39' 165]
 ['Female' 'East Asian' '30-39' 152]
 ['Female' 'White' '20-29' 169]
 ...
 ['Female' 'White' '<20' 168]
 ['Female' 'East Asian' '<20' 170]
 ['Female' 'White' '30-39' 157]]


In [3]:
# Preview the target vector; NumPy abbreviates long arrays with "...".
print(y)

[5 4 5 ... 5 5 4]


In [4]:
# Encode the categorical feature columns (positions 0, 1 and 2 of X) as one-hot
# indicator columns. The remaining column is passed through unchanged and ends
# up to the right of the indicator columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# sparse_threshold=0 forces a dense result. With the default threshold the
# transformer may return a SciPy sparse matrix when the stacked output is mostly
# zeros, and np.array() would then wrap that matrix in a 0-d object array
# instead of producing the 2-D feature matrix the later cells expect.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 1, 2])],
    remainder='passthrough',
    sparse_threshold=0,
)
# NOTE: this cell overwrites X with its encoded form, so re-run the dataset
# loading cell before executing this one a second time.
X = np.array(ct.fit_transform(X))
print(X)

[[0.0 1.0 0.0 ... 0.0 0.0 165]
 [1.0 0.0 0.0 ... 0.0 0.0 152]
 [1.0 0.0 0.0 ... 0.0 0.0 169]
 ...
 [1.0 0.0 0.0 ... 1.0 0.0 168]
 [1.0 0.0 0.0 ... 1.0 0.0 170]
 [1.0 0.0 0.0 ... 0.0 0.0 157]]


In [10]:
y # display the target vector (the feature encoding above only rewrites X)

array([5, 4, 5, ..., 5, 5, 4])

In [13]:
# Re-check the target vector before the data is split.
print(y)

[5 4 5 ... 5 5 4]


## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split

# Keep 20% of the rows aside for testing. The fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [8]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained.
regressor = LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so its repr is shown.
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting the Test set results



In [15]:
# Predict band sizes for the held-out test rows.
y_pred = regressor.predict(X_test)
# Show predictions (left column) beside the true values (right column).
# np.printoptions applies precision=0 only inside the `with` block, so the
# rounding used for this display does not leak into later cells the way a
# global np.set_printoptions call would.
with np.printoptions(precision=0):
    print(np.column_stack((y_pred, y_test)))

[[5. 5.]
 [5. 5.]
 [5. 5.]
 [5. 5.]
 [3. 4.]
 [6. 6.]
 [7. 7.]
 [4. 4.]
 [5. 5.]
 [4. 4.]
 [7. 7.]
 [7. 7.]
 [8. 8.]
 [6. 5.]
 [4. 4.]
 [3. 3.]
 [7. 7.]
 [3. 3.]
 [4. 5.]
 [3. 3.]
 [6. 6.]
 [6. 6.]
 [4. 4.]
 [4. 4.]
 [4. 4.]
 [7. 7.]
 [8. 8.]
 [4. 4.]
 [5. 5.]
 [4. 5.]
 [6. 6.]
 [4. 4.]
 [4. 5.]
 [5. 5.]
 [7. 7.]
 [3. 3.]
 [5. 5.]
 [5. 5.]
 [4. 4.]
 [4. 4.]
 [4. 4.]
 [4. 4.]
 [8. 7.]
 [3. 3.]
 [6. 6.]
 [7. 7.]
 [5. 4.]
 [7. 7.]
 [6. 6.]
 [4. 4.]
 [4. 4.]
 [5. 5.]
 [3. 3.]
 [6. 6.]
 [8. 8.]
 [7. 7.]
 [5. 5.]
 [4. 4.]
 [7. 7.]
 [5. 5.]
 [4. 4.]
 [5. 5.]
 [4. 4.]
 [4. 4.]
 [7. 7.]
 [4. 4.]
 [4. 4.]
 [5. 5.]
 [6. 6.]
 [6. 6.]
 [6. 6.]
 [7. 7.]
 [5. 5.]
 [5. 5.]
 [9. 9.]
 [8. 7.]
 [4. 4.]
 [4. 5.]
 [6. 6.]
 [6. 6.]
 [7. 7.]
 [5. 5.]
 [7. 7.]
 [6. 6.]
 [6. 6.]
 [4. 4.]
 [2. 2.]
 [4. 4.]
 [6. 5.]
 [6. 6.]
 [4. 5.]
 [6. 6.]
 [4. 4.]
 [8. 8.]
 [6. 6.]
 [4. 4.]
 [7. 7.]
 [3. 3.]
 [5. 6.]
 [3. 3.]
 [6. 6.]
 [5. 5.]
 [6. 6.]
 [7. 7.]
 [5. 5.]
 [6. 6.]
 [6. 6.]
 [7. 8.]
 [6. 6.]
 [6. 6.]
 [7. 7.]
 