## Neural Network
- Train a neural network to predict whether a rider falls into the 20-29 age bucket, based on their ride details.

In [1]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket 'ignore' filter may also hide deprecation notices
# from the libraries imported in this notebook — consider narrowing it.
import  warnings
warnings.simplefilter('ignore')

# Plotting, numerical and tabular libraries; the inline-plot magic below is
# left commented out.
# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the cleaned Divvy trip data and discard any row with a missing value.
# The path uses a forward slash: the previous backslash separator only resolved
# on Windows and relied on '\d' being an unrecognised (deprecated) string escape.
data = pd.read_csv('chicago-divvy-bicycle-sharing-data/divvy_cleaned_neuralnetwork.csv')
data = data.dropna()
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,StartTime_Hour,Trip_Duration,Age,Start_Lat,Start_Long,End_Lat,End_Long,gender,twenties
0,0,0,0,4.25,21,41.874675,-87.650019,41.869482,-87.655486,1.0,1
1,1,1,8,4.666667,30,41.874675,-87.650019,41.869482,-87.655486,1.0,0
2,2,2,13,5.083333,25,41.874675,-87.650019,41.869482,-87.655486,1.0,1
3,3,3,19,5.75,25,41.874675,-87.650019,41.869482,-87.655486,1.0,1
4,4,4,19,6.0,24,41.874675,-87.650019,41.869482,-87.655486,1.0,1


In [3]:
# Trip start/end coordinates only; this frame is the input to KMeans below.
coordinate_columns = ['Start_Lat', 'Start_Long', 'End_Lat', 'End_Long']
km_test = data[coordinate_columns]
km_test.head()

Unnamed: 0,Start_Lat,Start_Long,End_Lat,End_Long
0,41.874675,-87.650019,41.869482,-87.655486
1,41.874675,-87.650019,41.869482,-87.655486
2,41.874675,-87.650019,41.869482,-87.655486
3,41.874675,-87.650019,41.869482,-87.655486
4,41.874675,-87.650019,41.869482,-87.655486


- Run KMeans on the start/end coordinate data to assign each trip to a cluster

In [4]:
from sklearn.cluster import KMeans

# Feature frame: remove the raw coordinates (represented by a cluster label
# below), the age and target columns, and one leftover index column.
# NOTE(review): 'Unnamed: 0.1' is not dropped and so remains a model feature;
# it looks like another stale row-index column — confirm whether it should be
# removed too (doing so changes the number of input features downstream).
k_data = data.drop(['Start_Lat', 'Start_Long', 'End_Lat', 'End_Long','Age','twenties','Unnamed: 0'], axis=1)

# Fix the seed so the cluster assignments (used as features and exported
# later) are reproducible across kernel restarts instead of changing per run.
kmeans = KMeans(n_clusters=10, random_state=1)
kmeans.fit(km_test)
predicted_clusters = kmeans.predict(km_test)

# One-hot encode the cluster label and the start hour so both are treated as
# categories rather than ordered numbers.
k_data['trip_cluster'] = predicted_clusters
k_data_encoded = pd.get_dummies(k_data, columns=['trip_cluster','StartTime_Hour'])

k_data.head()

Unnamed: 0,Unnamed: 0.1,StartTime_Hour,Trip_Duration,gender,trip_cluster
0,0,0,4.25,1.0,9
1,1,8,4.666667,1.0,9
2,2,13,5.083333,1.0,9
3,3,19,5.75,1.0,9
4,4,19,6.0,1.0,9


- Reshape the `twenties` dummy column into a single-column array to use as the target for training and testing

In [5]:
# Target vector as an (n_rows, 1) column array.
y = data['twenties'].values[:, np.newaxis]

In [6]:
# Sanity check: the feature matrix and the target must have the same row count.
features_shape, target_shape = k_data_encoded.shape, y.shape
print(features_shape, target_shape)

(263626, 37) (263626, 1)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical

# Stratified split with a fixed seed, so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    k_data_encoded, y, stratify=y, random_state=1)

# Fit the scaler on the training split only, then apply it to both splits.
X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test))

# One-hot encode the labels to match the network's softmax output.
y_train_categorical, y_test_categorical = map(to_categorical, (y_train, y_test))

Using TensorFlow backend.


In [8]:
from keras.models import Sequential
from keras.layers import Dense

# Input width is taken from the scaled training matrix rather than hard-coded,
# so the model stays valid if the one-hot encoding produces a different number
# of columns (e.g. an hour or cluster value missing from the data).
n_features = X_train_scaled.shape[1]

# Create model and add layers: three 64-unit ReLU hidden layers followed by a
# 2-unit softmax output (one unit per class of the one-hot encoded target).
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=n_features))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [11]:
# Configure training (Adam optimizer, categorical cross-entropy loss, accuracy
# metric), then train on the scaled training split for 10 shuffled epochs.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_scaled, y_train_categorical, epochs=10, verbose=2, shuffle=True)

Epoch 1/10
 - 8s - loss: 0.5719 - acc: 0.7128
Epoch 2/10
 - 8s - loss: 0.5712 - acc: 0.7132
Epoch 3/10
 - 8s - loss: 0.5706 - acc: 0.7133
Epoch 4/10
 - 8s - loss: 0.5705 - acc: 0.7134
Epoch 5/10
 - 8s - loss: 0.5701 - acc: 0.7141
Epoch 6/10
 - 8s - loss: 0.5696 - acc: 0.7145
Epoch 7/10
 - 8s - loss: 0.5692 - acc: 0.7148
Epoch 8/10
 - 8s - loss: 0.5687 - acc: 0.7152
Epoch 9/10
 - 8s - loss: 0.5687 - acc: 0.7151
Epoch 10/10
 - 8s - loss: 0.5681 - acc: 0.7147


<keras.callbacks.History at 0x12bfc630>

In [12]:
# Score the trained network on the scaled test split and report the loss and
# accuracy it returns.
test_scores = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = test_scores
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 0.5743339104233643, Accuracy: 0.710789445737758


In [15]:
# Output locations for the cluster-labelled trips and the trained network.
clusters_csv_path = 'divvy_clusters.csv'
model_h5_path = 'divvy_20s_nn.h5'

# Attach each trip's KMeans label to the full data set, then persist both
# artifacts to the working directory.
data['predicted_cluster'] = predicted_clusters
data.to_csv(clusters_csv_path)
model.save(model_h5_path)