# Simple Linear Regression

## Linear Regression with NumPy

In [8]:
import numpy as np

In [2]:
# Toy dataset: five observations of one predictor (X) and its response (y)
X = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 5, 4, 5])

# Reshape X into a single-column 2-D array so it can be used in matrix products
X = X.reshape(-1, 1)

# Prepend a column of ones so that the first coefficient acts as the intercept
X_bias = np.hstack([np.ones((X.shape[0], 1)), X])

# OLS closed form: beta = (X^T X)^(-1) X^T y.
# Instead of forming the inverse explicitly, solve the equivalent normal
# equations (X^T X) beta = X^T y. np.linalg.solve gives the same estimate with
# less round-off error and less work than np.linalg.inv followed by a product.
# Like inv, it raises np.linalg.LinAlgError if X^T X is singular.
beta = np.linalg.solve(X_bias.T @ X_bias, X_bias.T @ y)

# beta[0] is intercept, beta[1] is slope
print("Intercept:", beta[0])
print("Slope:", beta[1])

# Fitted values for the same inputs the model was estimated on
y_pred = X_bias @ beta
print("Predictions:", y_pred)

Intercept: 2.200000000000004
Slope: 0.6000000000000005
Predictions: [2.8 3.4 4.  4.6 5.2]


#### Key Points:
The above approach illustrates the math behind OLS, but in practice, we rely on libraries like scikit-learn for convenience and additional
functionality.

## Linear Regression with scikit-learn

In [3]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [6]:
# Toy data: inputs 1..10 as a single-column feature matrix, plus a roughly
# linear response
X = np.arange(1, 11).reshape(-1, 1)
y = np.array([2, 4, 5, 4, 5, 6, 7, 8, 9, 10])

# Hold out 40% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained
model = LinearRegression().fit(X_train, y_train)

# Fitted parameters: coef_ is an array, so take its only entry as the slope
intercept, slope = model.intercept_, model.coef_[0]
print("Intercept:", intercept)
print("Slope:", slope)

# Predict the response for the held-out inputs
y_pred = model.predict(X_test)
print("Predictions on test set:", y_pred)

# score() reports R^2 of the model on the data passed to it (here: the test split)
r2 = model.score(X_test, y_test)
print("R^2 score on test set:", r2)

Intercept: 1.454545454545455
Slope: 0.818181818181818
Predictions on test set: [8.81818182 3.09090909 6.36363636 2.27272727]
R^2 score on test set: 0.9601452073839499


#### Key Points:
LinearRegression automatically handles the intercept term. Use model.coef_ and model.intercept_ to access parameters.
model.score() returns the R² (coefficient of determination), a measure of how well the regression line fits the data.

# Multiple Linear Regression

In [9]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [11]:
# Ten-row toy dataset: two predictors (X1, X2) and one response (y)
data = pd.DataFrame({
    'X1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'X2': [2, 3, 1, 5, 4, 6, 7, 8, 6, 9],
    'y':  [3, 5, 4, 7, 6, 8, 9, 10, 9, 12]
})

# Everything except the response column is a feature (leaves X1, X2 in order)
y = data['y']
X = data.drop(columns='y')

# Hold out 20% of the rows (2 of 10) for evaluation, with a fixed seed.
# NOTE: a score computed from so few test rows is a very rough estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained
model = LinearRegression().fit(X_train, y_train)

# Report the fitted parameters and R^2 on the held-out rows
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
print("R^2 on test set:", model.score(X_test, y_test))

Intercept: 1.6097560975609744
Coefficients: [0.52439024 0.54878049]
R^2 on test set: 0.8912477691850085


#### Key Points:
model.coef_ now holds one coefficient per predictor, in the same order as the columns of X. Each coefficient is the estimated change in the
response variable for a one-unit increase in that predictor while the other predictors are held constant.

In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictions of the most recently fitted model on the held-out rows
y_pred = model.predict(X_test)

# Evaluate each metric on the same (truth, prediction) pair, in order:
# mean squared error, mean absolute error, coefficient of determination
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("MSE:", mse)
print("MAE:", mae)
print("R²:", r2)

MSE: 0.43500892325996615
MAE: 0.6585365853658551
R²: 0.8912477691850085
