In [51]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
from sklearn.base import clone

In [55]:
# Dataset location
DATASET = 'Datasets/FlagData.csv'
assert os.path.exists(DATASET)

# # Load and shuffle
dataset = pd.read_csv(DATASET, sep=',').sample(frac = 1).reset_index(drop=True)
dataset.drop(['Unnamed: 0', 'CRN', 'FATAL_OR_MAJ_INJ'], axis=1, inplace=True)


# # View some metadata of the dataset and see if that makes sense
print('dataset.shape', dataset.shape)

X = np.array(dataset.iloc[:, 18:])
y = np.array(dataset.FATAL)

print('X', X.shape, 'y', y.shape)
print('Label distribution:', {i: np.sum(y==i) for i in np.unique(dataset.FATAL)})

dataset.shape (60797, 89)
X (60797, 71) y (60797,)
Label distribution: {0: 60262, 1: 535}


In [56]:
#dataset.drop(['Unnamed: 0', 'CRN', 'FATAL_OR_MAJ_INJ'], axis=1, inplace=True)

dataset.describe()

Unnamed: 0,INTERSTATE,STATE_ROAD,LOCAL_ROAD,LOCAL_ROAD_ONLY,TURNPIKE,WET_ROAD,SNOW_SLUSH_ROAD,ICY_ROAD,SUDDEN_DEER,SHLDR_RELATED,...,HAZARDOUS_TRUCK,MAJOR_INJURY,MODERATE_INJURY,MINOR_INJURY,MC_DRINKING_DRIVER,DRUG_RELATED,ILLEGAL_DRUG_RELATED,SCHOOL_BUS_UNIT,DRUGGED_DRIVER,IMPAIRED_DRIVER
count,60797.0,60797.0,60797.0,60797.0,60797.0,60797.0,60797.0,60797.0,60797.0,60797.0,...,60797.0,60797.0,60797.0,60797.0,60797.0,60797.0,60797.0,60797.0,60797.0,60797.0
mean,0.073984,0.723029,0.466306,0.272382,0.004211,0.210997,0.044361,0.037272,0.029886,0.001431,...,0.001053,0.022978,0.098344,0.295163,0.005543,0.028883,0.009573,0.002928,0.028636,0.117095
std,0.261747,0.447506,0.498868,0.445189,0.064754,0.40802,0.205897,0.189428,0.170275,0.037802,...,0.032428,0.149835,0.297781,0.45612,0.074246,0.167479,0.097372,0.05403,0.166783,0.321536
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [57]:
selector = SelectKBest(chi2, k=5)
selector.fit(X, y)
print('χ² statistic', selector.scores_)
print('Selected indices', selector.get_support(True))

χ² statistic [  6.02620000e+04   1.60977741e+01   2.61803534e+01   1.93621038e+01
   3.77085165e+00   1.04501792e+01   6.32252560e-01   2.33526205e-01
   4.78014087e+00   1.47287070e+02   1.72093168e+01   2.31068127e+00
   1.67421363e+01   3.20673557e-03   5.92879249e-01   8.55152523e-01
   1.12065427e+02   4.11268321e+02   1.60049742e+00   3.25599916e+01
   2.49867621e-01   4.84224218e+01   1.75485066e+00   4.58188897e+02
   4.11706662e+02   3.60680464e+01   2.89392574e+00   1.80704659e+01
   1.61949877e+00   3.61146324e+00   3.81286396e+00   8.96458785e+00
   5.91240288e+02   6.47781037e+01   6.15239761e+01   2.64320990e-02
   5.26946946e+02   8.02779004e+01   3.49501412e+00   1.18246289e+00
   7.28674632e-01   1.07209579e+01   1.45473657e+00   5.42744924e-01
   4.30129184e-01   3.73365521e-01   1.08008937e+00   6.73218902e+00
   4.12193617e+02   1.96561978e+02   3.68590549e+01   2.22473684e+01
   6.52908361e+01   8.27371696e+00   4.11581662e+01   1.43806874e+00
   4.84224218e+01    

In [58]:
dataset.iloc[:,[0,32,36,65,70]]

Unnamed: 0,INTERSTATE,HIT_BARRIER,BICYCLE,DRIVER_75PLUS,NHTSA_AGG_DRIVING
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,1,0,0,0
4,0,0,0,0,0
5,0,0,0,0,0
6,0,0,0,0,0
7,0,0,0,0,1
8,0,0,0,0,0
9,0,0,0,0,0
