In [None]:
import numpy
from sklearn.linear_model import LinearRegression

In [None]:
# Seed data for the single-feature experiments.
# Input data. The feature array must be two-dimensional (one column, as many rows as
# there are observations), so the flat values are turned into a column vector.
# x.shape = (6, 1)
x = numpy.array([5, 15, 25, 35, 45, 55])[:, numpy.newaxis]

# Output data: one response value per row of x.
# y.shape = (6,)
y = numpy.array([5, 20, 14, 32, 22, 38])
x, y

In [None]:
# Create the (unfitted) linear regression estimator with default settings.
# Optional parameters to this model constructor:
# fit_intercept - (default: True). Boolean to determine whether to calculate the intercept b0 (True) or consider it equal to 0 (False).
# normalize - (default: False). Normalize input variables?
#   NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, so this entry only
#   applies to older releases - confirm against the installed version before passing it.
# copy_X - (default: True). Copy (True) or overwrite (False) the input variables.
# n_jobs - (integer or None (default)). Represents the number of jobs used in parallel computation.
linear_regression_model = LinearRegression()

In [None]:
# Calculate the optimal values of the weights b0 and b1, using the existing input and output (x and y).
# fit() updates the model in place and returns it, so this cell also displays the estimator.
linear_regression_model.fit(x, y)

In [None]:
# Obtain the coefficient of determination (R^2) for the same x and y the model was fitted on.
# Terminology note: the walkthrough first calls x and y the "input" and "output", and for this step
# switches to regression vocabulary. As far as I can tell, "predictor" and "regressor" are both names
# for x, while y is the "response" - TODO confirm against the walkthrough.
r_squared = linear_regression_model.score(x, y)
print(f'coefficient of determination: {r_squared}')

In [None]:
# The fitted model exposes .intercept_ (the coefficient b0) and .coef_ (the coefficient b1).
# The intercept is a scalar, and the coefficient is an array.

# The intercept (b0) is the response the model predicts when x is zero.
# The coefficient (b1) is the change in the predicted response when x increases by one.
print(f'intercept: {linear_regression_model.intercept_}')
print(f'coefficient: {linear_regression_model.coef_}')

In [None]:
# The response can also be supplied as a two-dimensional column instead of a flat array.
# Doing so changes the shapes of the fitted attributes: the intercept becomes a
# one-element array and the coefficient becomes a two-dimensional array.
different_model_example = LinearRegression()
different_model_example.fit(x, y[:, numpy.newaxis])
print(f'intercept: {different_model_example.intercept_}')
print(f'coefficient: {different_model_example.coef_}')

In [None]:
# .predict() takes the regressor (x) as its argument and returns the corresponding predicted response.
# Here it is applied to the same x the model was fitted on.
y_predictions = linear_regression_model.predict(x)
print(f'predicted response: {y_predictions}')

In [None]:
# The same predictions computed by hand as b1 * x + b0.
# The values match .predict(x); only the shape of the output differs. Because x is a
# column with shape (6, 1), the result is two-dimensional rather than one-dimensional.
y_pred = linear_regression_model.coef_ * x + linear_regression_model.intercept_
print(f'predicted response: {y_pred}')

# With a one-dimensional x, both prediction approaches yield the same result.

In [None]:
# Regression models are usually fitted in order to forecast: once fitted, the model can
# compute the response for inputs that were not part of the training data.
x_new = numpy.arange(6)[:, numpy.newaxis]  # column vector 0..5, shape (6, 1)
print(x_new)
new_prediction = linear_regression_model.predict(x_new)
print(new_prediction)

In [None]:
# Multiple linear regression: every observation now carries two input features.
# multi_x.shape = (8, 2), multi_y.shape = (8,)
multi_x = numpy.array([
    [0, 1],
    [5, 1],
    [15, 2],
    [25, 5],
    [35, 11],
    [45, 15],
    [55, 34],
    [60, 35],
])
multi_y = numpy.array([4, 5, 20, 14, 32, 22, 38, 43])

In [None]:
# Create and fit the two-feature model in one step; fit() returns the fitted estimator itself.
multi_linear_regression_model = LinearRegression().fit(multi_x, multi_y)

In [None]:
# Get model properties.
# .intercept_ holds the bias b0 ("bias" is yet another name for the intercept; the naming scheme in
# this notebook still needs to be standardized).
# .coef_ is now an array containing the coefficients b1 and b2 (one per input feature, since this
# model has two features).
r_sq = multi_linear_regression_model.score(multi_x, multi_y)
print(f'coefficient of determination: {r_sq}')
print(f'intercept: {multi_linear_regression_model.intercept_}')
print(f'slope: {multi_linear_regression_model.coef_}')

In [None]:
# Generate predictions for the same inputs the two-feature model was fitted on.
multi_y_pred = multi_linear_regression_model.predict(multi_x)
print(f'predicted response: {multi_y_pred}')

In [None]:
# Apply the two-feature model to a new dataset: the integers 0..11 arranged as six rows
# of two features each.
new_dataset_x = numpy.arange(12).reshape(6, 2)
print(new_dataset_x)
new_predictions = multi_linear_regression_model.predict(new_dataset_x)
print(f'predictions: {new_predictions}')

In [None]:
# Polynomial regression with scikit-learn
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Data for the polynomial regression example.
# poly_x needs to be a two-dimensional array (a single column), so the flat values are
# turned into a column vector. poly_x.shape = (10, 1), poly_y.shape = (10,)
# Interestingly, adding values to these arrays has side-effects for the transformations in the following cells.
# TODO: Dig into how the fit and transform computations are done. Start here: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
poly_x = numpy.array([5, 15, 25, 35, 45, 55, 2, 106, 72, 1])[:, numpy.newaxis]
poly_y = numpy.array([15, 11, 2, 8, 25, 32, 7, 123, 98, 100])
poly_x

In [None]:
# We need to include x^2 (and depending on the case, other terms) as additional features when implementing polynomial regression.
# Here, we transform the input variable poly_x to contain an additional column with the values of x^2 (and eventually more features).

# This is an instance of PolynomialFeatures that can be used to transform the input x.
# Optional parameters to this transformer's constructor:
# degree - (default: 2) represents the degree of the polynomial regression function.
# interaction_only - (default: False) Whether to include only interaction features (True) or all features (False).
# include_bias - (default: True) Whether to include the bias (intercept) column of ones (True) or not (False).
#   It is switched off here, so the transformed array will not contain a column of ones.
transformer = PolynomialFeatures(degree=2, include_bias=False)

In [None]:
# Before applying the transformation, the transformer has to be fitted to the input.
# fit() returns the transformer itself, so this cell also displays it.
transformer.fit(poly_x)

In [None]:
# Transform the input: the result holds the original values of poly_x plus a column with their squares.
x_ = transformer.transform(poly_x)
x_

In [None]:
# The three separate steps above (construct, fit, transform) can be replaced with a single
# call to the fit_transform() method.
x_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(poly_x)
x_

In [None]:
# Create a model and fit it on the transformed features (x_), not on the raw poly_x.
# Polynomial regression is ordinary linear regression applied to the expanded feature set.
polynomial_linear_regression = LinearRegression().fit(x_, poly_y)

In [None]:
# Report R^2, the intercept and the coefficients of the polynomial fit.
# .coef_ has one entry per column of x_.
poly_r_sq = polynomial_linear_regression.score(x_, poly_y)
print(f'coefficient for determination: {poly_r_sq}')
print(f'intercept: {polynomial_linear_regression.intercept_}')
print(f'coefficients: {polynomial_linear_regression.coef_}')

In [None]:
# Prediction works the same way as with the simpler linear regression, except that the argument
# must be the transformed input (x_) rather than the raw poly_x.
poly_y_prediction = polynomial_linear_regression.predict(x_)
print(f'predicted response: {poly_y_prediction}')

In [None]:
# Generate predictions for a new set of data.
# The polynomial model was fitted on the transformed features [x, x^2], so new inputs must be
# a single-column array that is passed through the fitted transformer before predicting.
# Passing an arbitrary two-column array straight to predict() runs without error, but the
# second column is then treated as if it were x^2, which gives meaningless predictions.
# Distinct names are used so the multi-regression cell's new_dataset_x / new_predictions
# are not overwritten.
new_poly_x = numpy.arange(6).reshape((-1, 1))
new_poly_x_ = transformer.transform(new_poly_x)
new_poly_predictions = polynomial_linear_regression.predict(new_poly_x_)
new_poly_predictions

In [None]:
# Applying the same methods for polynomial regression with several input variables.
# The data and results use multi_poly_* names so this cell does not overwrite the x, y, x_
# and r_sq variables that earlier cells depend on when they are re-run.
multi_poly_x = numpy.array([[0, 1], [5, 1], [15, 2], [25, 5], [35, 11], [45, 15], [55, 34], [60, 35]])
multi_poly_y = numpy.array([4, 5, 20, 14, 32, 22, 38, 43])

# Transform the input data. With two features and degree=2 (no bias column), each row
# expands to x1, x2, x1^2, x1*x2, x2^2.
multi_poly_x_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(multi_poly_x)

model = LinearRegression().fit(multi_poly_x_, multi_poly_y)

# Get results
multi_poly_r_sq = model.score(multi_poly_x_, multi_poly_y)
intercept, coefficients = model.intercept_, model.coef_

# Predict on the transformed training inputs
y_prediction = model.predict(multi_poly_x_)

# The first value is R^2, so it is labelled as the coefficient of determination
# (consistent with the other cells) rather than the ambiguous "coefficient".
print(f'coefficient of determination: {multi_poly_r_sq}')
print(f'intercept: {intercept}')
print(f'coefficients: {coefficients}')
print(f'predicted response: {y_prediction}')

In [None]:
# Advanced linear regression with statsmodels
import statsmodels.api as sm

In [None]:
# Data for the statsmodels example: eight observations with two input features each.
# stats_x.shape = (8, 2), stats_y.shape = (8,)
stats_x = numpy.array([
    [0, 1],
    [5, 1],
    [15, 2],
    [25, 5],
    [35, 11],
    [45, 15],
    [55, 34],
    [60, 35],
])
stats_y = numpy.array([4, 5, 20, 14, 32, 22, 38, 43])

In [None]:
# statsmodels does not add an intercept term on its own: add_constant() adds a column of
# ones to the inputs so that the model can estimate b0.
# NOTE(review): this rebinds stats_x to the augmented array; with add_constant's default settings
# an already-present constant column should be skipped on re-run - confirm for the installed version.
stats_x = sm.add_constant(stats_x)
stats_x

In [None]:
# This regression model is based on ordinary least squares (OLS).
# Note the argument order: the output (stats_y) comes first and the input (stats_x) second,
# the reverse of the scikit-learn calls used earlier in this notebook.
ols_regression_model = sm.OLS(stats_y, stats_x)

In [None]:
# Fit the OLS model; fit() returns a results object rather than modifying the model in place.
# summary() renders the regression report as this cell's output.
results = ols_regression_model.fit()
results.summary()