# Regression

In [1]:
import pandas as pd
import numpy  as np
import datetime

import tensorflow as tf
from tensorflow       import keras
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics  import mean_absolute_error, median_absolute_error

import seaborn as sns

Let's start by loading the dataset and parsing the date.

In [2]:
# Read the raw border-crossing records from a relative path and preview the first rows.
border_crossing = pd.read_csv('../data/Border_Crossing_Entry_Data.csv')    
border_crossing.head()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,Location
0,Calexico East,California,2507,US-Mexico Border,03/01/2019 12:00:00 AM,Trucks,34447,POINT (-115.48433000000001 32.67524)
1,Van Buren,Maine,108,US-Canada Border,03/01/2019 12:00:00 AM,Rail Containers Full,428,POINT (-67.94271 47.16207)
2,Otay Mesa,California,2506,US-Mexico Border,03/01/2019 12:00:00 AM,Trucks,81217,POINT (-117.05333 32.57333)
3,Nogales,Arizona,2604,US-Mexico Border,03/01/2019 12:00:00 AM,Trains,62,POINT (-110.93361 31.340279999999996)
4,Trout River,New York,715,US-Canada Border,03/01/2019 12:00:00 AM,Personal Vehicle Passengers,16377,POINT (-73.44253 44.990010000000005)


In [3]:
def parse_date(s):
    """Turn a timestamp string such as '03/01/2019 12:00:00 AM' into a date.

    Only the first whitespace-separated token is read, as month/day/year;
    whatever follows it (time of day, AM/PM marker) is ignored.
    """
    date_token, *_unused = s.split()
    month, day, year = (int(piece) for piece in date_token.split('/'))
    return datetime.date(year, month, day)
    
# Swap the raw timestamp strings for date objects, then put the rows in chronological order.
border_crossing['Date'] = border_crossing['Date'].map(parse_date)
border_crossing = border_crossing.sort_values('Date')
border_crossing.head()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,Location
346732,Presidio,Texas,2403,US-Mexico Border,1996-01-01,Trucks,347,POINT (-104.39000000000001 29.56)
345903,Whitlash,Montana,3321,US-Canada Border,1996-01-01,Truck Containers Full,0,POINT (-111.26000000000002 49)
345904,Jackman,Maine,104,US-Canada Border,1996-01-01,Truck Containers Full,2103,POINT (-70.4 45.81)
345905,Hidalgo,Texas,2305,US-Mexico Border,1996-01-01,Truck Containers Full,9794,POINT (-98.27 26.1)
345906,Boundary,Washington,3015,US-Canada Border,1996-01-01,Pedestrians,0,POINT (-117.62999999999998 49)


Now, we'll extract the year, month, and also keep a running count of the total number of months that have passed.

In [4]:
# Zero-based time features: years elapsed since 1996 and a month index running 0-11.
border_crossing['Year'] = border_crossing['Date'].map(lambda when: when.year - 1996)
border_crossing['Month'] = border_crossing['Date'].map(lambda when: when.month - 1)

def get_total_month(p):
    """Collapse a (year_offset, month_index) pair into a single running month count."""
    year_offset, month_index = p
    return 12 * year_offset + month_index
# One running-month value per row, derived from that row's Year and Month offsets.
border_crossing['Total Month'] = [
    get_total_month(pair)
    for pair in zip(border_crossing['Year'], border_crossing['Month'])
]

border_crossing.tail()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,Location,Year,Month,Total Month
527,Calexico,California,2503,US-Mexico Border,2019-03-01,Personal Vehicles,413457,POINT (-115.49806000000001 32.67889),23,2,278
526,Richford,Vermont,203,US-Canada Border,2019-03-01,Train Passengers,30,POINT (-72.67832000000001 44.98588),23,2,278
525,Eastport,Idaho,3302,US-Canada Border,2019-03-01,Personal Vehicle Passengers,9719,POINT (-116.18027999999998 48.99944),23,2,278
523,Laurier,Washington,3016,US-Canada Border,2019-03-01,Rail Containers Full,196,POINT (-118.22302 48.99892),23,2,278
0,Calexico East,California,2507,US-Mexico Border,2019-03-01,Trucks,34447,POINT (-115.48433000000001 32.67524),23,2,278


The last thing we need to do to prepare the data is to one-hot encode the categorical data.

In [5]:
# Drop the raw coordinate string. errors='ignore' keeps this cell re-runnable:
# a second execution no longer fails because 'Location' is already gone.
border_crossing = border_crossing.drop(columns=['Location'], errors='ignore')

# Mark the identifier-like columns as categoricals so the encoder below can
# find them by dtype. 'Port Code' is numeric in the file but is a label, not a quantity.
border_crossing = border_crossing.astype({
    col: 'category'
    for col in ['Port Name', 'State', 'Port Code', 'Border', 'Measure']
})

def one_hot_encode_categoricals(df):
    """Return a copy of `df` with each category-typed column replaced by indicator columns.

    Indicator columns are named '<column>_<level>' and are appended after the
    remaining columns, in the order the categorical columns appear in `df`.
    The input frame is left unmodified.
    """
    categorical_names = [name for name in df.columns
                         if df[name].dtype.name == 'category']

    encoded = df.copy()
    for name in categorical_names:
        indicators = pd.get_dummies(df[name], prefix=name)
        encoded[indicators.columns] = indicators
        del encoded[name]
    return encoded
    
# Build the model-ready frame: each category-typed column is replaced by its indicator columns.
one_hot = one_hot_encode_categoricals(border_crossing)
one_hot.head()

Unnamed: 0,Date,Value,Year,Month,Total Month,Port Name_Alcan,Port Name_Alexandria Bay,Port Name_Algonac,Port Name_Ambrose,Port Name_Anacortes,...,Measure_Pedestrians,Measure_Personal Vehicle Passengers,Measure_Personal Vehicles,Measure_Rail Containers Empty,Measure_Rail Containers Full,Measure_Train Passengers,Measure_Trains,Measure_Truck Containers Empty,Measure_Truck Containers Full,Measure_Trucks
346732,1996-01-01,347,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
345903,1996-01-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
345904,1996-01-01,2103,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
345905,1996-01-01,9794,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
345906,1996-01-01,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
def standardize(col):
    """Min-max scale `col` so its smallest value maps to 0 and its largest to 1.

    Despite the name this is range scaling, not z-scoring.

    A constant column has zero range; it is returned as all zeros rather than
    the NaNs a 0/0 division would produce.
    """
    lowest = col.min()
    value_range = col.max() - lowest
    # Dividing by 1 for a zero range keeps the result's float dtype and yields zeros.
    return (col - lowest) / (value_range if value_range != 0 else 1)

def split_data(df, label_col, split=.9):
    """Split `df` into training and test arrays, taking rows in their existing order.

    The frame is not shuffled here; shuffle it beforehand if a random split is wanted.

    Parameters
    ----------
    df : DataFrame holding both the features and the label column.
    label_col : name of the column to use as the prediction target.
    split : fraction of rows (from the top) assigned to the training set.

    Returns
    -------
    (train_input, train_labels, test_input, test_labels) as NumPy arrays.

    Raises
    ------
    ValueError if `split` is outside [0, 1].
    """
    if not 0 <= split <= 1:
        raise ValueError('split must be between 0 and 1, got {!r}'.format(split))

    features = df.drop(columns=label_col)
    labels = df[label_col]

    len_total = len(df)
    # Honour the caller's fraction; this was previously fixed at 0.9 regardless of `split`.
    len_train = int(split * len_total)
    len_test = len_total - len_train

    train_input = features.head(len_train).to_numpy()
    train_labels = labels.head(len_train).to_numpy()

    test_input = features.tail(len_test).to_numpy()
    test_labels = labels.tail(len_test).to_numpy()

    return train_input, train_labels, test_input, test_labels

# Shuffle before splitting so the held-out rows are not simply the most recent dates.
# The shuffle is seeded (same value the random forest uses) so the train/test
# partition is identical after a kernel restart.
shuffled = one_hot.sample(frac=1, random_state=731).drop(columns='Date')
train_input, train_labels, test_input, test_labels = split_data(shuffled, 'Value')

Let's start by applying a random forest algorithm.

In [7]:
# 100 trees, each allowed to grow up to 100 levels deep; random_state pins the forest's own randomness.
forest = RandomForestRegressor(n_estimators=100, max_depth=100, random_state=731)

In [8]:
# Fit on the training portion returned by split_data; the target here is the raw, unscaled Value.
forest.fit(train_input, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=100,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=731, verbose=0,
                      warm_start=False)

In [9]:
# R^2 on the held-out rows, followed by the mean absolute error in raw crossing counts.
print(forest.score(test_input, test_labels))
print(mean_absolute_error(test_labels, forest.predict(test_input)))

0.9916687680605362
1724.268178334475


To put this error into context, let's look at the distribution:

In [10]:
# Summary statistics of the raw target, to judge the error above against typical magnitudes.
border_crossing['Value'].describe()

count    3.467330e+05
mean     2.818767e+04
std      1.518588e+05
min      0.000000e+00
25%      0.000000e+00
50%      9.000000e+01
75%      2.483000e+03
max      4.447374e+06
Name: Value, dtype: float64

As we can see, most data is very small with respect to the maximum. So now I am curious if the model is consistently off, or if it strays infrequently, but severely.

In [11]:
# The median is far less affected than the mean by a small number of very large errors.
median_absolute_error(test_labels, forest.predict(test_input))

14.94

This suggests to me that the error is typically very small, but modest mistakes on the extremely large values greatly contribute to the average error.

Now let's try to apply a neural network.

In [13]:
# Width shared by every hidden layer in this notebook.
# NOTE(review): this counts the columns of the pre-encoding frame, not the
# one-hot feature columns the network actually receives; confirm that is intended.
len_hidden = len(border_crossing.columns)

# Two equally wide ReLU hidden layers feeding one linear output unit.
nn = tf.keras.Sequential(
    [keras.layers.Dense(len_hidden, activation='relu') for _ in range(2)]
    + [keras.layers.Dense(1)]
)

nn.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
    metrics=['mae', 'mse'],
)

In [14]:
# Ten passes over the training rows; no validation data is passed, so only training metrics are logged.
nn.fit(train_input, train_labels, epochs=10, verbose=1)

Train on 312059 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f39ced29908>

In [15]:
# evaluate() returns the loss followed by the compiled metrics, here [mse loss, mae, mse],
# all in raw crossing counts; the median absolute error is computed separately.
print(nn.evaluate(test_input, test_labels, verbose=0))
print(median_absolute_error(test_labels, nn.predict(test_input)))

[5109485168.475284, 14217.426, 5109490000.0]
2685.967041015625


These results aren't very encouraging. Let's try again after standardizing our data.

In [16]:
# Re-create the shuffled frame, this time min-max scaling the target before splitting.
# The shuffle is seeded (same value the random forest uses) so the partition
# is reproducible after a kernel restart. Only 'Value' is scaled; the inputs are left as they are.
shuffled = one_hot.sample(frac=1, random_state=731).drop(columns='Date')
shuffled['Value'] = standardize(shuffled['Value'])

train_input, train_labels, test_input, test_labels = split_data(shuffled, 'Value')

In [17]:
# Same two-hidden-layer architecture as before, rebuilt from scratch so training starts from fresh weights.
nn = tf.keras.Sequential(
    [keras.layers.Dense(len_hidden, activation='relu') for _ in range(2)]
    + [keras.layers.Dense(1)]
)

nn.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
    metrics=['mae', 'mse'],
)

In [18]:
# Ten passes over the training rows, now with the min-max-scaled target as labels.
nn.fit(train_input, train_labels, epochs=10, verbose=1)

Train on 312059 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f39cca06710>

In [19]:
# [mse loss, mae, mse] followed by the median absolute error, all in units of the
# scaled target, so not directly comparable with the raw-count figures above.
print(nn.evaluate(test_input, test_labels, verbose=0))
print(median_absolute_error(test_labels, nn.predict(test_input)))

[8.709803234830607e-05, 0.0024008516, 8.709803e-05]
0.0002564825117588043


How does this compare? What are the values after scaling back to the original range?

In [20]:
# Convert errors measured on the scaled target back to crossing counts. The raw
# minimum is 0, so multiplying by the raw maximum undoes the min-max scaling.
# The maximum gets its own name so the built-in max() is not shadowed for the
# rest of the session.
# NOTE(review): the two factors are hand-typed and differ from the figures
# printed by the preceding evaluation cell (0.0024008516 and 0.00025648...);
# confirm which run they were copied from.
value_max = border_crossing['Value'].max()
print(.003067 * value_max)
print(.00119 * value_max)

13640.096058
5292.37506


The values appear to be slightly better. Now, let's just tweak some parameters of the neural network itself, such as the depth, loss function, and optimizer.

In [21]:
# Deeper variant: six equally wide ReLU hidden layers ahead of the single linear output unit.
nn = tf.keras.Sequential(
    [keras.layers.Dense(len_hidden, activation='relu') for _ in range(6)]
    + [keras.layers.Dense(1)]
)

nn.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
    metrics=['mae', 'mse'],
)

In [22]:
# Ten passes over the training rows for the six-hidden-layer network.
nn.fit(train_input, train_labels, epochs=10, verbose=1)

Train on 312059 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f39ca8471d0>

In [23]:
# [mse loss, mae, mse] followed by the median absolute error, in units of the scaled target.
print(nn.evaluate(test_input, test_labels, verbose=0))
print(median_absolute_error(test_labels, nn.predict(test_input)))

[6.920392145971672e-05, 0.0018306827, 6.92039e-05]
0.0004271995276212692


In [24]:
# Convert scaled-target errors back to crossing counts (raw minimum is 0, so
# multiplying by the raw maximum undoes the scaling). Kept under its own name
# so the built-in max() is not shadowed.
# NOTE(review): the two factors are hand-typed and differ from the figures
# printed by the preceding evaluation cell (0.0018306827 and 0.00042719...);
# confirm which run they were copied from.
value_max = border_crossing['Value'].max()
print(.002222 * value_max)
print(.000694 * value_max)

9882.065028
3086.477556


It seems depth was very beneficial. Now let's try changing the loss function. I think mean absolute error might be better here than squared error, which will over-punish predictions on large numbers that are relatively accurate.

In [25]:
# Four ReLU hidden layers, now trained against mean absolute error instead of squared error.
nn = tf.keras.Sequential(
    [keras.layers.Dense(len_hidden, activation='relu') for _ in range(4)]
    + [keras.layers.Dense(1)]
)

nn.compile(
    loss='mae',
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
    metrics=['mae', 'mse'],
)

In [26]:
# Ten passes over the training rows, optimising mean absolute error.
nn.fit(train_input, train_labels, epochs=10, verbose=1)

Train on 312059 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f39c81a50f0>

In [27]:
# The loss is MAE here, so the list reads [mae loss, mae, mse]; the first two entries
# measure the same quantity. Units are those of the scaled target.
print(nn.evaluate(test_input, test_labels, verbose=0))
print(median_absolute_error(test_labels, nn.predict(test_input)))

[0.0026719394062012978, 0.0026719382, 0.00015431074]
0.00019661427750170343


Finally, let's change the optimizer. I found "adam" to be the most successful.

In [28]:
# Same four-hidden-layer network, switching the optimizer to Adam with its default settings.
nn = tf.keras.Sequential(
    [keras.layers.Dense(len_hidden, activation='relu') for _ in range(4)]
    + [keras.layers.Dense(1)]
)

nn.compile(
    loss='mae',
    optimizer='adam',
    metrics=['mse'],
)

In [29]:
# Ten passes over the training rows with the Adam optimizer.
nn.fit(train_input, train_labels, epochs=10, verbose=1)

Train on 312059 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f39c75e97f0>

In [30]:
# Only 'mse' is compiled as a metric here, so the list reads [mae loss, mse];
# the median absolute error follows. Units are those of the scaled target.
print(nn.evaluate(test_input, test_labels, verbose=0))
print(median_absolute_error(test_labels, nn.predict(test_input)))

[0.0014915586093510297, 3.8463913e-05]
0.0002637451980262995


In [31]:
# Convert scaled-target errors back to crossing counts (raw minimum is 0, so
# multiplying by the raw maximum undoes the scaling). Kept under its own name
# so the built-in max() is not shadowed.
# NOTE(review): the two factors are hand-typed and do not match the preceding
# evaluation output (loss 0.0014915586, mse 3.8463913e-05, median 0.00026374...).
# If the second factor is an MSE, a single multiplication by the maximum does
# not undo the scaling of a squared error; confirm what it represents.
value_max = border_crossing['Value'].max()
print(.00124 * value_max)
print(.00004888 * value_max)

5514.74376
217.38764112


In [32]:
# Final attempt: six ReLU hidden layers, MAE loss, Adam optimizer.
nn = tf.keras.Sequential(
    [keras.layers.Dense(len_hidden, activation='relu') for _ in range(6)]
    + [keras.layers.Dense(1)]
)

nn.compile(
    loss='mae',
    optimizer='adam',
    metrics=['mse'],
)

In [33]:
# Longer run than the earlier experiments: 25 passes over the training rows instead of 10.
nn.fit(train_input, train_labels, epochs=25, verbose=1)

Train on 312059 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f39b5b3d080>

In [34]:
# [mae loss, mse] followed by the median absolute error, in units of the scaled target.
print(nn.evaluate(test_input, test_labels, verbose=0))
print(median_absolute_error(test_labels, nn.predict(test_input)))

[0.001153752259198494, 3.183326e-05]
9.128908277489245e-05


In [35]:
# Convert scaled-target errors back to crossing counts (raw minimum is 0, so
# multiplying by the raw maximum undoes the scaling). Kept under its own name
# so the built-in max() is not shadowed.
# NOTE(review): the two factors are hand-typed and do not match the preceding
# evaluation output (loss 0.0011537523, mse 3.183326e-05, median 9.1289e-05);
# confirm which run and which metrics they were copied from.
value_max = border_crossing['Value'].max()
print(.00093133 * value_max)
print(.00001812 * value_max)

4141.97282742
80.58641688


At this point, I am unable to improve the neural network's accuracy any further. It is very surprising that the random forest outperforms the neural network. Perhaps this is a result of the unusual value distribution.