### Linear Regression

# SciKit Learn

In [5]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

'/home/jovyan/Class Notes'

# Minnesota Traffic Volume Data

In [30]:
dirty_data = pd.read_csv("Metro_Interstate_Traffic_Volume.csv")

df = dirty_data.dropna()

print(df.shape)

df.head()

(48204, 9)


Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


In [31]:
xvars = ["temp", "rain_1h", "snow_1h", "clouds_all"]  # taking the variables that we want to keep as our predictor variables, and putting them into a NumPy Array

X = df.loc[:, xvars].values   # get X values (i.e. predictors/features) # casting our predictors into a NumPy array
y = df.loc[:, "traffic_volume"].values  # get y values (i.e., outcome/target variable) 

# Train/Test Split

In [32]:
# Split traning/test data at random

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) # the '0' is setting the seed; a way to "fix" the randomization process


# Fitting Regression Model

In [34]:

mod = LinearRegression()      # create model object
mod.fit(X_train, y_train)     # fitting the model to the "train" data

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [36]:
print(mod.coef_)     # show regression coefficients; the numbers are just the "beta" values

[2.03771422e+01 1.41759285e-01 6.18593670e+02 3.89941368e+00]


# Making Predictions with Fitted Model

In [37]:
# Use our fitted model to make predictions using test set

y_pred = mod.predict(X_test)

In [41]:
# print our metrics of model adequacy

from sklearn import metrics
import numpy as np

print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error: ", metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print("R-Squared Value: ", metrics.explained_variance_score(y_test, y_pred))

      

Mean Absolute Error:  1701.0237286830916
Mean Squared Error:  3799938.6620990513
Root Mean Squared Error:  1949.3431360586703
R-Squared Value:  0.02834348685617749
