## Bike Index Seattle

### Model Training

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load data

crashes = pd.read_csv('data/crashes.csv')

In [4]:
crashes.head()

Unnamed: 0,SPEEDLIMIT,SURFACEWIDTH,AWDT_ROUNDED,one_way,is_steep,is_paved,is_hwy,is_bus,is_truck,is_light,is_clear,is_child,severity
0,20.0,30.0,6700,0,0,0,0,0,0,1,0,0,3
1,25.0,52.0,11300,0,0,0,0,1,0,1,0,0,3
2,25.0,42.0,18100,0,1,1,0,1,0,0,0,0,3
3,25.0,42.0,13800,0,0,0,0,1,0,1,0,0,3
4,25.0,42.0,13800,0,0,0,0,1,0,0,0,0,2


In [4]:
features = crashes.iloc[:,:-1].to_numpy()
severity = crashes.iloc[:,-1].to_numpy()

print('features: ', features.shape)
print('severity: ', severity.shape)

features:  (1169, 12)
severity:  (1169,)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(features, severity, train_size=0.8)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

print('X_train.mean(): ', X_train.mean())
print('X_test.mean(): ', X_test.mean())

print('X_train.std(): ', X_train.std())
print('X_test.std(): ', X_test.std())

X_train.mean():  2.153159805333637e-17
X_test.mean():  1.2652114240742525e-17
X_train.std():  0.9999999999999982
X_test.std():  0.9999999999999997


In [22]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [24]:
pred = knn.predict(X_test)

In [26]:
np.mean(pred == y_test)

0.4658119658119658

In [28]:
def get_knn_pred(n):
    
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    
    return knn.predict(X_test)

def get_knn_acc(n):
    
    pred = get_knn_pred(n)
    
    return np.mean(pred == y_test)

In [39]:
get_knn_acc(3)

0.4658119658119658

In [52]:
ks = np.arange(1,11)
accs = np.zeros(10)

for i, k_ in enumerate(ks):
    
    acc = get_knn_acc(k_)
    accs[i] = acc

In [55]:
pd.DataFrame(data = {'K':ks, 'accuracy':accs})

Unnamed: 0,K,accuracy
0,1,0.461538
1,2,0.41453
2,3,0.465812
3,4,0.504274
4,5,0.461538
5,6,0.435897
6,7,0.482906
7,8,0.461538
8,9,0.440171
9,10,0.461538


In [2]:
# Load data

crashes = pd.read_csv('data/crash_streets_vol.csv', parse_dates=['DATETIME'])

keep_cols = ['REPORT NUMBER',
            'DATETIME',
            'SPEEDLIMIT',
            'SURFACEWIDTH',
            'AWDT_ROUNDED',
            'one_way',
            'is_steep',
            'is_paved',
            'is_hwy',
            'is_bus',
            'is_truck',
            'is_light',
            'is_clear',
            'is_hit_run',
            'is_workzone',
            'is_child',
            'impaired',
            'speeding',
            'driver_16_25',
            'driver_65_plus',
            'severity',
            'geometry']

df = crashes[keep_cols]

In [10]:
df

Unnamed: 0,SPEEDLIMIT,SURFACEWIDTH,AWDT_ROUNDED,one_way,is_steep,is_paved,is_hwy,is_bus,is_truck,is_light,is_clear,is_hit_run,is_workzone,is_child,impaired,speeding,driver_16_25,driver_65_plus,severity
count,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0,1169.0
mean,24.482464,43.625321,13743.627032,0.180496,0.174508,0.668948,0.003422,0.728828,0.05047,0.773311,0.074423,0.157399,0.004277,0.023952,0.014542,0.001711,0.111206,0.083832,2.508982
std,2.561218,17.486342,8498.192547,0.384765,0.379708,0.470793,0.05842,0.444755,0.219007,0.418869,0.26257,0.364333,0.065288,0.152965,0.119763,0.041345,0.314522,0.277255,0.595471
min,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,25.0,36.0,8700.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,25.0,43.0,12100.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
75%,25.0,54.0,17000.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,40.0,120.0,58700.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0


In [5]:
feats = df.iloc[:,2:-2].to_numpy()
severity = df['severity']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(feats, severity, train_size=0.8, random_state=12345)

In [7]:
scale = StandardScaler().fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

In [8]:
clf = LogisticRegression().fit(X_train, y_train)

In [9]:
clf.predict(X_test)

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3], dtype=int64)

In [10]:
y_test

1146    3
268     3
1145    1
973     3
162     3
       ..
1001    3
732     2
437     2
208     3
422     2
Name: severity, Length: 234, dtype: int64

From the study:  

`