In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score

Dataset loading

In [2]:
ds = np.array([[]]).reshape(0,11)
for day in range(9,14,1):
    impact_fname = '../data/PeMS/Incidents/logit/impact_2017_10_{:02d}.csv'
    ds = np.append(ds, np.loadtxt(impact_fname.format(day), delimiter=','), axis=0)

ds = ds[~np.isnan(ds[:,-3])]

onehot_enc = OneHotEncoder(categorical_features=[0,1])
onehot_enc.fit(ds[:,2:4])
encoded = onehot_enc.transform(ds[:,2:4]).toarray()

scaler = StandardScaler()
scaler.fit(ds[:,[4,8,9]])
scaled = scaler.transform(ds[:,[4,8,9]])

ds = np.concatenate((ds[:,:2], encoded, scaled[:,:1], ds[:,5:8], scaled[:,1:], ds[:,-1:]), axis=1)

del scaled, encoded

Model training

In [3]:
logit = LogisticRegressionCV(class_weight={0:.05, 1:.9}, penalty='l2', n_jobs=3)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(ds[:,:-1], ds[:,-1], test_size=.2, random_state=41)

In [5]:
sum(ds[:,-1]==1)/ds.shape[0]

0.049648309559178851

In [6]:
logit = LogisticRegression(class_weight={0:1, 1:20}, penalty='l2', n_jobs=3, tol=1e-10)

In [7]:
logit.fit(X_train[:,2:-1], y_train)

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight={0: 1, 1: 20}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=3, penalty='l2', random_state=None,
          solver='liblinear', tol=1e-10, verbose=0, warm_start=False)

In [8]:
logit.score(X_test[:,2:-1], y_test)

0.75328859060402686

In [9]:
predicted = logit.predict(X_test[:,2:-1])

In [10]:
sum((y_test==predicted) & (y_test==1))

393

In [11]:
sum(predicted==1)

3013

In [12]:
sum((y_test!=predicted) & (y_test==1))

137

In [13]:
f1_score(y_test, predicted)

0.22184589331075358

In [14]:
recall_score(y_test, predicted)

0.7415094339622641

In [15]:
precision_score(y_test, predicted)

0.13043478260869565

In [16]:
proba = logit.predict_proba(X_test[:,2:-1])

In [17]:
roc_auc_score(y_test, proba[:,1])

0.81672607389420127

In [18]:
X_test.shape

(11175, 92)

In [19]:
fn = X_test[:,:2][(y_test!=predicted) & (y_test==1)]

In [20]:
np.savetxt(fname='../data/PeMS/Incidents/logit/result/fn.csv', X=fn, delimiter=',')

In [21]:
tp = X_test[:,:2][(y_test==predicted) & (y_test==1)]

In [22]:
np.savetxt(fname='../data/PeMS/Incidents/logit/result/tp.csv', X=tp, delimiter=',')

In [23]:
fp = X_test[:,:2][(y_test!=predicted) & (y_test==0)]

In [30]:
np.savetxt(fname='../data/PeMS/Incidents/logit/result/fp.csv', X=fp, delimiter=',')

In [24]:
tn = X_test[:,:2][(y_test==predicted) & (y_test==0)]

In [31]:
np.savetxt(fname='../data/PeMS/Incidents/logit/result/tn.csv', X=tn, delimiter=',')

In [32]:
fp.shape

(2620, 2)

In [33]:
fn[fn[:,0]==17516051]

array([], shape=(0, 2), dtype=float64)

In [34]:
tp[tp[:,0]==17516051]

array([], shape=(0, 2), dtype=float64)

In [35]:
fp[fp[:,0]==17516051]

array([], shape=(0, 2), dtype=float64)

In [36]:
tn[tn[:,0]==17516051]

array([[ 17516051.,    763674.],
       [ 17516051.,    773012.],
       [ 17516051.,    760650.],
       [ 17516051.,    772513.],
       [ 17516051.,    775610.],
       [ 17516051.,    771673.],
       [ 17516051.,    718370.],
       [ 17516051.,    767750.],
       [ 17516051.,    716955.]])