In [1]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
from sklearn.base import clone

In [2]:
# Dataset location
DATASET = 'Datasets/post2003cycle_flag.csv'
assert os.path.exists(DATASET)

# # Load and shuffle
dataset = pd.read_csv(DATASET, sep=',').sample(frac = 1).reset_index(drop=True)
dataset.drop(['Unnamed: 0', 'CRN', 'FATAL_OR_MAJ_INJ','CRASH_YEAR','COUNTY','MUNICIPALITY','COUNTY_YEAR','MOTORCYCLE_COUNT',
              'FATAL_COUNT','MCYCLE_DEATH_COUNT','DEC_LAT','DEC_LONG','PSP_REPORTED','MC_DVR_HLMT_TYPE','MC_PAS_HLMT_TYPE','MC_PAS_HLMTON_IND'], axis=1, inplace=True)




In [8]:
g = dataset.columns.to_series().groupby(dataset.dtypes).groups
g

{dtype('uint8'): Index(['MC_DVR_HLMTON_IND_ ', 'MC_DVR_HLMTON_IND_N', 'MC_DVR_HLMTON_IND_U',
        'MC_DVR_HLMTON_IND_Y'],
       dtype='object'),
 dtype('int64'): Index(['INTERSTATE', 'STATE_ROAD', 'LOCAL_ROAD', 'LOCAL_ROAD_ONLY', 'TURNPIKE',
        'WET_ROAD', 'SNOW_SLUSH_ROAD', 'ICY_ROAD', 'SUDDEN_DEER',
        'SHLDR_RELATED', 'REAR_END', 'HO_OPPDIR_SDSWP', 'HIT_FIXED_OBJECT',
        'SV_RUN_OFF_RD', 'WORK_ZONE', 'PROPERTY_DAMAGE_ONLY', 'INJURY_OR_FATAL',
        'INJURY', 'FATAL', 'NON_INTERSECTION', 'INTERSECTION', 'SIGNALIZED_INT',
        'STOP_CONTROLLED_INT', 'UNSIGNALIZED_INT', 'SCHOOL_BUS', 'SCHOOL_ZONE',
        'HIT_DEER', 'HIT_TREE_SHRUB', 'HIT_EMBANKMENT', 'HIT_POLE',
        'HIT_GDRAIL', 'HIT_GDRAIL_END', 'HIT_BARRIER', 'HIT_BRIDGE',
        'OVERTURNED', 'MOTORCYCLE', 'BICYCLE', 'HVY_TRUCK_RELATED',
        'VEHICLE_FAILURE', 'TRAIN_TROLLEY', 'PHANTOM_VEHICLE',
        'ALCOHOL_RELATED', 'DRINKING_DRIVER', 'UNDERAGE_DRNK_DRV', 'UNLICENSED',
        'DISTRACTED',

In [3]:
dataset = pd.get_dummies(dataset, columns=["MC_DVR_HLMTON_IND"])

In [4]:
dataset.drop(['MC_PASSNGR_IND', 'MC_DVR_HLMTDOT_IND', 'MC_PAS_HLMTDOT_IND','MINOR_INJURY','MODERATE_INJURY','MAJOR_INJURY'], axis=1, inplace=True)


In [54]:
#dataset.iloc[:,np.r_[:,:]]

#dataset.loc[:, dataset.columns != 'FATAL']

In [5]:
# # View some metadata of the dataset and see if that makes sense
print('dataset.shape', dataset.shape)

X = np.array(dataset.loc[:, dataset.columns != 'FATAL'])
y = np.array(dataset.FATAL)

print('X', X.shape, 'y', y.shape)
print('Label distribution:', {i: np.sum(y==i) for i in np.unique(dataset.FATAL)})

dataset.shape (67414, 89)
X (67414, 88) y (67414,)
Label distribution: {0: 64512, 1: 2902}


In [6]:
#dataset.drop(['Unnamed: 0', 'CRN', 'FATAL_OR_MAJ_INJ'], axis=1, inplace=True)

dataset.describe()

Unnamed: 0,INTERSTATE,STATE_ROAD,LOCAL_ROAD,LOCAL_ROAD_ONLY,TURNPIKE,WET_ROAD,SNOW_SLUSH_ROAD,ICY_ROAD,SUDDEN_DEER,SHLDR_RELATED,...,MC_DRINKING_DRIVER,DRUG_RELATED,ILLEGAL_DRUG_RELATED,SCHOOL_BUS_UNIT,DRUGGED_DRIVER,IMPAIRED_DRIVER,MC_DVR_HLMTON_IND_,MC_DVR_HLMTON_IND_N,MC_DVR_HLMTON_IND_U,MC_DVR_HLMTON_IND_Y
count,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,...,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0,67414.0
mean,0.028926,0.666479,0.536209,0.32876,0.004361,0.053565,0.000727,0.001483,0.037752,0.002937,...,0.073115,0.014463,0.005904,0.001187,0.012994,0.090604,0.322322,0.17618,0.156466,0.345032
std,0.167599,0.471474,0.498691,0.469766,0.065895,0.225158,0.026951,0.038486,0.190597,0.054116,...,0.260328,0.11939,0.07661,0.034428,0.11325,0.287048,0.467369,0.380976,0.363299,0.475382
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
selector = SelectKBest(chi2, k=5)
selector.fit(X, y)
print('χ² statistic', selector.scores_)
print('Selected indices', selector.get_support(True))

χ² statistic [  6.61838355e+00   1.04542175e+02   1.95880976e+02   2.15982106e+02
   9.23339246e-01   1.21108194e+01   5.92108072e-03   4.13249919e-01
   5.75148777e+00   2.67317304e-01   2.86246365e+01   1.11440683e+02
   3.95713401e+02   3.98927167e+01   3.47874337e+00   1.17812779e+02
   6.28948906e+00   1.39607616e+03   1.17974719e+02   1.49880182e+02
   5.91312519e+01   4.67529998e+01   9.18415168e+01   1.97110335e+00
   6.24387606e+00   3.17383238e+00   5.21686050e+02   1.29867144e+02
   6.68871780e+02   3.96973516e+02   8.65194225e+01   3.46401291e+01
   4.62908748e+01   1.18918076e+00   1.55589858e+02   4.17640159e+02
   1.98690104e+02   2.24670767e+01   1.01212826e+00   1.24844398e+01
   2.20771668e+03   2.34291669e+03   3.63534653e+01   1.46119402e+00
   1.10776788e+01   3.92160000e-02   5.90759461e+00   6.66376912e+00
   1.64675326e-01   1.67217578e+03   4.62596491e+02   3.93938785e+02
   1.15321873e+01   1.75016633e+03   1.23370606e+03   3.10675683e+02
   1.04282286e+01   2

In [7]:
#dataset.iloc[:,[40,41,53,81,85]]
X_selected = selector.transform(X)

[dataset.columns[i] for i in selector.get_support(True)]

['PHANTOM_VEHICLE',
 'ALCOHOL_RELATED',
 'LIMIT_65MPH',
 'HAZARDOUS_TRUCK',
 'DRUGGED_DRIVER']

In [9]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10, random_state=7)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_selected, y, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

NameError: name 'LogisticRegression' is not defined