# Feature Engineering

In [2]:
# The term "feature" is typically used to refer to predictor variables

# Transforming and/or modifying data in a manner that extracts additional information from raw data

In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

import time

In [12]:
dirty_data = pd.read_csv("Metro_Interstate_Traffic_Volume.csv")
df = dirty_data.dropna()
print(df.shape)
df.head()

(48204, 9)


Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


# Pre-Processing and Data Engineering

In [5]:
# stuff that happens before the model-fitting; in order to clean up the data (can also be done iteratively as a cycle)



In [14]:
def get_hour_wkday(v_dt):   # extract hour and day of  week from our date_time variable
    n = len(v_dt)        # start by getting the length of the vector 
    hour = np.zeros(n)     # allocate a vector of zeros of length n where n is the same length as (v_dt)
    wkday = np.zeros(n)    # a vector of zeros length n 
    
    for i in range(n):
        dt_tmp = time.strptime(v_dt[i], "%Y-%m-%d %H:%M:%S")    # this is telling Python the format that the date_time variable is in; we are "stripping " time, and storing it in dt_Tmp
        hour[i] = dt_tmp.tm_hour
        wkday[i] = dt_tmp.tm_wday
        
    return hour, wkday
    

In [15]:
# create two new columns hour and wkday

df["hour"], df["wkday"] = get_hour_wkday(df["date_time"])

In [17]:
# add dummy coded weather; this turns a column into something that we can "do math" on. It turns it into a matrix. ex: if you have repub, dem, independent values, itll make a matrix of 0s and 1s for each value

weather_dummy_codes = pd.get_dummies(df["weather_description"])

df = df.join(weather_dummy_codes)
df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume,hour,...,sleet,smoke,snow,thunderstorm,thunderstorm with drizzle,thunderstorm with heavy rain,thunderstorm with light drizzle,thunderstorm with light rain,thunderstorm with rain,very heavy rain
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545,9.0,...,0,0,0,0,0,0,0,0,0,0
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516,10.0,...,0,0,0,0,0,0,0,0,0,0
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767,11.0,...,0,0,0,0,0,0,0,0,0,0
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026,12.0,...,0,0,0,0,0,0,0,0,0,0
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918,13.0,...,0,0,0,0,0,0,0,0,0,0


# More Data Engineering

In [18]:
# create column of 0/1 indicating holidays

df["is_holiday"] = [0 if x == "None" else 1 for x in df["holiday"]]

In [19]:
# create column of 0/1 indicating weekends

df["is_weekend"] = [1 if x in [5, 6] else 0 for x in df["wkday"]]
df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume,hour,...,snow,thunderstorm,thunderstorm with drizzle,thunderstorm with heavy rain,thunderstorm with light drizzle,thunderstorm with light rain,thunderstorm with rain,very heavy rain,is_holiday,is_weekend
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545,9.0,...,0,0,0,0,0,0,0,0,0,0
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516,10.0,...,0,0,0,0,0,0,0,0,0,0
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767,11.0,...,0,0,0,0,0,0,0,0,0,0
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026,12.0,...,0,0,0,0,0,0,0,0,0,0
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918,13.0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
xvars = ["temp", "rain_1h", "snow_1h", "clouds_all"]

xvars2 = ["temp", "rain_1h", "snow_1h", "clouds_all", "hour", "wkday", "is_weekend", "is_holiday"]

xvars2 = xvars2 + weather_dummy_codes.columns.values.tolist()

x = df.loc[:, xvars2].values    # get x values (i.e. predictors/features)
y = df.loc[:, "traffic_volume"].values # get y vvalues (i.e. outcome/target variable)

# Split Training/Test Data

In [23]:
# split training/test data at random

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [24]:
mod = LinearRegression()  # create model object

mod.fit(x_train, y_train)  # fit model to training data

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
print(mod.coef_)  # show regression coefficients

[ 1.36282865e+01  2.37103764e-01 -1.00316251e+02  4.84476336e+00
  9.27434574e+01  6.47757732e+01 -1.18253645e+03 -1.49543785e+03
 -1.00338654e+03  2.98694636e+02  2.75686496e+02 -3.62998128e+02
  5.14655779e+02 -7.46664112e+01  6.66119597e+02  3.41400982e+02
  9.87140468e+01 -3.66936279e+02 -1.22216844e+02 -6.77524112e+01
  1.82313666e+02 -5.57203807e+01  8.59593709e+02  9.50366376e+02
 -1.11240570e+02 -1.65110266e+02 -2.01285152e+02 -2.83143246e+01
  9.08638953e+02 -3.54866993e+02 -6.82636976e+02 -6.47624688e+02
  4.97411544e+02 -6.62934222e+02 -5.68434189e-13  1.12205424e+02
  1.45678356e+03  1.82560102e+01 -6.22174307e+02 -6.02553589e+02
  1.77766070e+03 -6.48467998e+02 -5.38561604e+02 -6.04015837e+02
  1.25378962e+02 -1.16041691e+03]


In [26]:
# use our fitted model to make predictions using test set

y_pred = mod.predict(x_test)

In [None]:
# print our metrics of model adequacy



In [27]:
# print our metrics of model adequacy



print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error: ", metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print("R-Squared Value: ", metrics.explained_variance_score(y_test, y_pred))

      

Mean Absolute Error:  1555.8016865530478
Mean Squared Error:  3157421.0110158063
Root Mean Squared Error:  1776.9133380713326
R-Squared Value:  0.19265696897419993


In [28]:
# notice that our R-squared value is much larger than in the Linear Regression notes, which contained uncleaned, clunky data.