# Neural Network
- Since our regression wasn't very useful in predicting rider age, let's try using a neural network to predict whether a rider falls into an age bucket (20-29) based on their ride details
- First, import dependencies

In [1]:
import warnings
warnings.simplefilter('ignore')

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Data Prep
- Load in CSV from DataCleanup notebook

In [2]:
# Read the cleaned ride data and discard any row that still has a missing value.
data = pd.read_csv('cleaned_bike_data.csv').dropna()
data.head()

Unnamed: 0.1,Unnamed: 0,age,duration,end_lat,end_long,gender,hour,start_lat,start_long,weekend,twenties
0,0,34.0,303,40.733812,-73.980544,0,0,40.740964,-73.986022,0,0
1,1,22.0,700,40.763094,-73.97835,0,0,40.739126,-73.979738,0,1
2,2,49.0,443,40.744449,-73.983035,0,0,40.729515,-73.990753,0,0
3,3,33.0,297,40.71924,-73.95242,0,0,40.710451,-73.960876,0,0
4,4,32.0,421,40.786995,-73.941648,0,0,40.799139,-73.938915,0,0


- Pull start and end coords to build trip clusters with

In [3]:
# Start and end coordinates of every trip: the four features KMeans clusters on.
coord_columns = ['start_lat', 'start_long', 'end_lat', 'end_long']
km_test = data[coord_columns]
km_test.head()

Unnamed: 0,start_lat,start_long,end_lat,end_long
0,40.740964,-73.986022,40.733812,-73.980544
1,40.739126,-73.979738,40.763094,-73.97835
2,40.729515,-73.990753,40.744449,-73.983035
3,40.710451,-73.960876,40.71924,-73.95242
4,40.799139,-73.938915,40.786995,-73.941648


- Run KMeans on the coordinate data. From our earlier regression, we know that 11 clusters give the best accuracy.

In [4]:
from sklearn.cluster import KMeans

# Number of trip clusters to build (value carried over from the earlier regression work).
N_TRIP_CLUSTERS = 11
# Fixed seed so the cluster labels, and therefore every downstream result,
# are the same on each Restart & Run All.
CLUSTER_SEED = 1

# Keep only the non-coordinate ride features. The age-based columns are the
# prediction targets, and 'Unnamed: 0' is presumably a saved index column.
k_data = data.drop(['start_lat','start_long','end_lat','end_long','age','twenties','Unnamed: 0'], axis=1)

# Cluster trips on their start/end coordinates, then label every ride with its cluster.
kmeans = KMeans(n_clusters=N_TRIP_CLUSTERS, random_state=CLUSTER_SEED)
kmeans.fit(km_test)
predicted_clusters = kmeans.predict(km_test)

k_data['trip_cluster'] = predicted_clusters
# NOTE(review): k_data_encoded is not referenced by the later cells visible here;
# kept so that anything else relying on it keeps working.
k_data_encoded = pd.get_dummies(k_data, columns=['trip_cluster','hour'])

k_data.head()

Unnamed: 0,duration,gender,hour,weekend,trip_cluster
0,303,0,0,0,1
1,700,0,0,0,2
2,443,0,0,0,8
3,297,0,0,0,7
4,421,0,0,0,0


- Reshape the `twenties` dummy column into a single-column array to use as the target for training and testing

In [5]:
# Target: the 'twenties' flag as an (n_rows, 1) column vector.
y = np.reshape(data['twenties'].values, (-1, 1))

In [7]:
# Sanity check: the feature frame and the target must have the same number of rows.
print(f"{k_data.shape} {y.shape}")

(1550171, 5) (1550171, 1)


- Get dummies for hour of the day and which cluster the trip falls in

In [8]:
# One-hot encode the two categorical features; every other column passes through unchanged.
categorical_columns = ["hour", "trip_cluster"]
X_encoded = pd.get_dummies(k_data, columns=categorical_columns)
X_encoded.head()

Unnamed: 0,duration,gender,weekend,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,...,trip_cluster_1,trip_cluster_2,trip_cluster_3,trip_cluster_4,trip_cluster_5,trip_cluster_6,trip_cluster_7,trip_cluster_8,trip_cluster_9,trip_cluster_10
0,303,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,700,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,443,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,297,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,421,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Prepare and run the model
- Divide data into train and test samples
- Fit the scaler on the training samples only, then apply it to both the train and test samples

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical

# Stratified split so both partitions keep the same share of 'twenties' riders.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, stratify=y, random_state=1)

# The scaler is fit on the training partition only and then applied to both
# partitions, so the test data never influences the scaling.
X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test))

# One-hot encode the labels for the softmax output layer.
y_train_categorical, y_test_categorical = (
    to_categorical(labels) for labels in (y_train, y_test))

Using TensorFlow backend.


- Build sequential model. We tweaked the number of units and layers without much effect on our results.

In [16]:
from keras.models import Sequential
from keras.layers import Dense

# Take the input width from the prepared feature matrix instead of hard-coding it.
# The column count depends on how many hour and cluster dummies were generated,
# so a literal would silently go stale if the cluster count or the data changed.
n_features = X_train_scaled.shape[1]

# Fully connected network: three ReLU hidden layers followed by a two-way
# softmax output that matches the one-hot encoded labels.
model = Sequential()
model.add(Dense(units=128, activation='relu', input_dim=n_features))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

- Compile model and fit to training data. We don't need many epochs for the accuracy to top out.

In [17]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy as the metric.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train on the scaled training features. The fit call stays as the cell's last
# expression so the returned History object is still displayed.
fit_options = dict(epochs=10, shuffle=True, verbose=2)
model.fit(X_train_scaled, y_train_categorical, **fit_options)

Epoch 1/10
 - 42s - loss: 0.5524 - acc: 0.7483
Epoch 2/10
 - 46s - loss: 0.5507 - acc: 0.7484
Epoch 3/10
 - 47s - loss: 0.5502 - acc: 0.7483
Epoch 4/10
 - 46s - loss: 0.5499 - acc: 0.7484
Epoch 5/10
 - 46s - loss: 0.5498 - acc: 0.7483
Epoch 6/10
 - 46s - loss: 0.5497 - acc: 0.7484
Epoch 7/10
 - 46s - loss: 0.5497 - acc: 0.7483
Epoch 8/10
 - 46s - loss: 0.5496 - acc: 0.7484
Epoch 9/10
 - 47s - loss: 0.5495 - acc: 0.7483
Epoch 10/10
 - 46s - loss: 0.5495 - acc: 0.7483


<keras.callbacks.History at 0x2d3a7dbe7f0>

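- Test the model on the test samples. At first glance this looks pretty good: the test accuracy is almost identical to the training accuracy, so the model does not appear to be overfitting the training data. However, the accuracy barely moves across epochs, so compare it against the share of the majority class before concluding that the model has learned anything useful. Also re-run this cell after fitting, so the reported numbers belong to the model trained above.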

In [15]:
# Score the model on the held-out, scaled test set. The result is unpacked as
# (loss, accuracy), accuracy being the metric requested at compile time.
test_scores = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = test_scores
summary_line = f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}"
print(summary_line)

Normal Neural Network - Loss: 0.5494588133609096, Accuracy: 0.7484227556673266
