In [11]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
from sklearn.base import clone

In [12]:
# Dataset location
DATASET = 'Datasets/cycle_flag.csv'
assert os.path.exists(DATASET)

# # Load and shuffle
dataset = pd.read_csv(DATASET, sep=',').sample(frac = 1).reset_index(drop=True)
dataset.drop(['Unnamed: 0', 'CRN', 'FATAL_OR_MAJ_INJ','CRASH_YEAR','COUNTY','MUNICIPALITY','COUNTY_YEAR','MOTORCYCLE_COUNT',
              'FATAL_COUNT','MCYCLE_DEATH_COUNT','DEC_LAT','DEC_LONG','PSP_REPORTED','MC_DVR_HLMT_TYPE','MC_PAS_HLMT_TYPE','MC_PAS_HLMTON_IND'], axis=1, inplace=True)




In [4]:
g = dataset.columns.to_series().groupby(dataset.dtypes).groups
g

{dtype('int64'): Index(['INTERSTATE', 'STATE_ROAD', 'LOCAL_ROAD', 'LOCAL_ROAD_ONLY', 'TURNPIKE',
        'WET_ROAD', 'SNOW_SLUSH_ROAD', 'ICY_ROAD', 'SUDDEN_DEER',
        'SHLDR_RELATED', 'REAR_END', 'HO_OPPDIR_SDSWP', 'HIT_FIXED_OBJECT',
        'SV_RUN_OFF_RD', 'WORK_ZONE', 'PROPERTY_DAMAGE_ONLY', 'INJURY_OR_FATAL',
        'INJURY', 'FATAL', 'NON_INTERSECTION', 'INTERSECTION', 'SIGNALIZED_INT',
        'STOP_CONTROLLED_INT', 'UNSIGNALIZED_INT', 'SCHOOL_BUS', 'SCHOOL_ZONE',
        'HIT_DEER', 'HIT_TREE_SHRUB', 'HIT_EMBANKMENT', 'HIT_POLE',
        'HIT_GDRAIL', 'HIT_GDRAIL_END', 'HIT_BARRIER', 'HIT_BRIDGE',
        'OVERTURNED', 'MOTORCYCLE', 'BICYCLE', 'HVY_TRUCK_RELATED',
        'VEHICLE_FAILURE', 'TRAIN_TROLLEY', 'PHANTOM_VEHICLE',
        'ALCOHOL_RELATED', 'DRINKING_DRIVER', 'UNDERAGE_DRNK_DRV', 'UNLICENSED',
        'DISTRACTED', 'CELL_PHONE', 'NO_CLEARANCE', 'RUNNING_RED_LT',
        'TAILGATING', 'CROSS_MEDIAN', 'CURVED_ROAD', 'CURVE_DVR_ERROR',
        'LIMIT_65MPH', 'SPEE

In [13]:
dataset = pd.get_dummies(dataset, columns=["MC_DVR_HLMTON_IND"])

In [14]:
dataset.drop(['MC_PASSNGR_IND', 'MC_DVR_HLMTDOT_IND', 'MC_PAS_HLMTDOT_IND','MINOR_INJURY','MODERATE_INJURY','MAJOR_INJURY'], axis=1, inplace=True)


In [54]:
#dataset.iloc[:,np.r_[:,:]]

#dataset.loc[:, dataset.columns != 'FATAL']

In [15]:
# # View some metadata of the dataset and see if that makes sense
print('dataset.shape', dataset.shape)

X = np.array(dataset.loc[:, dataset.columns != 'FATAL'])
y = np.array(dataset.FATAL)

print('X', X.shape, 'y', y.shape)
print('Label distribution:', {i: np.sum(y==i) for i in np.unique(dataset.FATAL)})

dataset.shape (101773, 89)
X (101773, 88) y (101773,)
Label distribution: {0: 97853, 1: 3920}


In [8]:
#dataset.drop(['Unnamed: 0', 'CRN', 'FATAL_OR_MAJ_INJ'], axis=1, inplace=True)

dataset.describe()

Unnamed: 0,INTERSTATE,STATE_ROAD,LOCAL_ROAD,LOCAL_ROAD_ONLY,TURNPIKE,WET_ROAD,SNOW_SLUSH_ROAD,ICY_ROAD,SUDDEN_DEER,SHLDR_RELATED,...,MC_DRINKING_DRIVER,DRUG_RELATED,ILLEGAL_DRUG_RELATED,SCHOOL_BUS_UNIT,DRUGGED_DRIVER,IMPAIRED_DRIVER,MC_DVR_HLMTON_IND_,MC_DVR_HLMTON_IND_N,MC_DVR_HLMTON_IND_U,MC_DVR_HLMTON_IND_Y
count,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,...,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0
mean,0.028926,0.666479,0.536209,0.32876,0.004361,0.053565,0.000727,0.001483,0.037752,0.002937,...,0.073115,0.014463,0.005904,0.001187,0.012994,0.090604,0.322322,0.17618,0.156466,0.345032
std,0.167599,0.471474,0.498691,0.469766,0.065895,0.225158,0.026951,0.038486,0.190597,0.054116,...,0.260328,0.11939,0.07661,0.034428,0.11325,0.287048,0.467369,0.380976,0.363299,0.475382
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
selector = SelectKBest(chi2, k=5)
selector.fit(X, y)
print('χ² statistic', selector.scores_)
print('Selected indices', selector.get_support(True))

χ² statistic [  2.10288341e+01   1.89747964e+02   2.93082545e+02   3.42505896e+02
   1.29112805e+00   2.15173981e+01   9.83788182e-02   4.48456713e-01
   5.87527139e+00   3.84681829e+00   2.45889653e+01   2.39226507e+02
   6.63186421e+02   1.15401672e+02   9.17742942e-01   1.53870806e+02
   1.81060348e+01   1.78925925e+03   1.89236188e+02   2.25679350e+02
   8.29324793e+01   7.67908054e+01   1.43772327e+02   3.41118885e+00
   6.21018716e+00   2.45432657e+00   8.21483316e+02   2.51875022e+02
   1.07658764e+03   6.75672119e+02   1.46663556e+02   5.11170011e+01
   7.59367732e+01   6.66651954e-01   3.30088203e+02   6.87566959e+02
   2.91888189e+02   2.08619669e+01   3.23136110e+00   8.96633527e+00
   3.41187907e+03   3.65002788e+03   5.80902605e+01   1.73150261e+00
   8.32844890e+00   3.02308137e-02   3.88942387e+00   9.17728327e+00
   8.17365406e-01   2.12978905e+03   7.64374037e+02   4.50554029e+02
   1.65603795e+01   2.46287979e+03   1.82193136e+03   3.73101129e+02
   7.56817717e+00   5

In [17]:
#dataset.iloc[:,[40,41,53,81,85]]
X_selected = selector.transform(X)

[dataset.columns[i] for i in selector.get_support(True)]


['PHANTOM_VEHICLE',
 'ALCOHOL_RELATED',
 'LIMIT_65MPH',
 'HAZARDOUS_TRUCK',
 'DRUGGED_DRIVER']

In [11]:
import statsmodels.api as sm
logit_model=sm.Logit(y,X_selected)
result=logit_model.fit()
print(result.summary())

  return f(*args, **kwds)


Optimization terminated successfully.
         Current function value: 0.662384
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                96508
Model:                          Logit   Df Residuals:                    96503
Method:                           MLE   Df Model:                            4
Date:                Sun, 04 Mar 2018   Pseudo R-squ.:                  -3.034
Time:                        13:42:29   Log-Likelihood:                -63925.
converged:                       True   LL-Null:                       -15846.
                                        LLR p-value:                     1.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -2.9605      0.187    -15.812      0.000      -3.327      -2.594
x2             1.9029      0.

In [31]:
testColumns = ['PHANTOM_VEHICLE','ALCOHOL_RELATED','LIMIT_65MPH','HAZARDOUS_TRUCK','DRUGGED_DRIVER']
for var in testColumns:
    selector.TestIndependence(colX=var,colY="Survived" )  

AttributeError: 'SelectKBest' object has no attribute 'TestIndependence'

In [10]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

kfold = model_selection.KFold(n_splits=10, random_state=7)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_selected, y, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

10-fold cross validation average accuracy: 0.961
