# CLASSIFICATION MODEL ACCIDENTS MADRID

In [1]:
# Cell-to-cell script
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Remove warning errors
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Importing libraries
import pandas as pd
import numpy as np

In [4]:
#Display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth',None)

###  Upload the data

In [5]:
#Importing the data set already cleaned
data= pd.read_csv ('data_model.csv')

In [6]:
data.head()
data.shape

Unnamed: 0,date,day,week_day,month,year,hour,district,address,record,cum_victims,unique_victims,severity,type_accident,vehicle,person,gendre,age,weather,road_status
0,2010-01-01,1,Friday,Jan,2010,From 00:00 to 00:59,CHAMARTIN,CALLE DE CARTAGENA NUM 104 Madrid,2010/135,1,1,slightly injured,double_colision,car,traveler,femenine,From 40 to 44 years,rain,wet
1,2010-01-01,1,Friday,Jan,2010,From 1:00 to 1:59,PUENTE DE VALLECAS,AUTOVIA M-30 CALZADA 2 KM. 10200 Madrid,2010/94,7,1,slightly injured,multiple_colision,others,traveler,femenine,From 25 to 29 years,dry,wet
2,2010-01-01,1,Friday,Jan,2010,From 1:00 to 1:59,PUENTE DE VALLECAS,AUTOVIA M-30 CALZADA 2 KM. 10200 Madrid,2010/94,7,1,slightly injured,multiple_colision,others,traveler,femenine,From 30 to 34 years,dry,wet
3,2010-01-01,1,Friday,Jan,2010,From 1:00 to 1:59,PUENTE DE VALLECAS,AUTOVIA M-30 CALZADA 2 KM. 10200 Madrid,2010/94,7,1,slightly injured,multiple_colision,others,traveler,femenine,From 25 to 29 years,dry,wet
4,2010-01-01,1,Friday,Jan,2010,From 1:00 to 1:59,PUENTE DE VALLECAS,AUTOVIA M-30 CALZADA 2 KM. 10200 Madrid,2010/94,7,1,slightly injured,multiple_colision,others,traveler,femenine,From 25 to 29 years,dry,wet


(111841, 19)

We want to predict/classificy if a  victim of an accident is high injured/ or mortal or if not ( slightly injured) taking into account some characteristics of the accident. Then we will analyze wich are the important features that the model take into account for the prediction. 

In [7]:
data.isnull().sum()

date              0
day               0
week_day          0
month             0
year              0
hour              0
district          0
address           0
record            0
cum_victims       0
unique_victims    0
severity          0
type_accident     0
vehicle           0
person            0
gendre            0
age               0
weather           0
road_status       0
dtype: int64

### Define our target variable

Our target is the column severity. But in this column we find three values according to how the victim is affected.
Thus, we will creat a new column as hour target with the value 0 if the victim is slightlyly injured and 1 if the victim is seriously injured/ or death. 

In [8]:
data['severity'].value_counts()

slightly injured     102716
seriously injured      8850
mortal                  275
Name: severity, dtype: int64

In [9]:
#data['victim_condition']

In [10]:
#creating a columns for weather:
def victim_condition (row):
    if row['severity'] == 'slightly injured' :
        return 0
    if row['severity']== 'seriously injured' :
        return 1
    if row['severity'] == 'mortal' :
        return 1

In [11]:
data['victim_condition'] = data.apply (lambda row:victim_condition (row), axis=1)

In [12]:
data['victim_condition'].value_counts() 
# this is the target of our model. We can see that we will have to handle data imbalance

0    102716
1      9125
Name: victim_condition, dtype: int64

In [13]:
data.shape
data.dtypes

(111841, 20)

date                object
day                  int64
week_day            object
month               object
year                 int64
hour                object
district            object
address             object
record              object
cum_victims          int64
unique_victims       int64
severity            object
type_accident       object
vehicle             object
person              object
gendre              object
age                 object
weather             object
road_status         object
victim_condition     int64
dtype: object

### Split numerical and categorical

In [14]:
num= data.select_dtypes(np.number)
cat = data.select_dtypes(np.object)

- Numerical

In [15]:
num.head()
num.columns

Unnamed: 0,day,year,cum_victims,unique_victims,victim_condition
0,1,2010,1,1,0
1,1,2010,7,1,0
2,1,2010,7,1,0
3,1,2010,7,1,0
4,1,2010,7,1,0


Index(['day', 'year', 'cum_victims', 'unique_victims', 'victim_condition'], dtype='object')

In [16]:
# we will drop from our data set 'day' and 'unique_victims'

In [17]:
num=num.drop(['day','unique_victims','year'],axis=1)

In [18]:
num.head()

Unnamed: 0,cum_victims,victim_condition
0,1,0
1,7,0
2,7,0
3,7,0
4,7,0


- Categorical

In [19]:
cat.head(1)
cat.shape

Unnamed: 0,date,week_day,month,hour,district,address,record,severity,type_accident,vehicle,person,gendre,age,weather,road_status
0,2010-01-01,Friday,Jan,From 00:00 to 00:59,CHAMARTIN,CALLE DE CARTAGENA NUM 104 Madrid,2010/135,slightly injured,double_colision,car,traveler,femenine,From 40 to 44 years,rain,wet


(111841, 15)

In [20]:
# we will drop from our data set 'address' and 'record'
cat.columns

Index(['date', 'week_day', 'month', 'hour', 'district', 'address', 'record',
       'severity', 'type_accident', 'vehicle', 'person', 'gendre', 'age',
       'weather', 'road_status'],
      dtype='object')

In [21]:
cat=cat.drop(['date','address','record','severity',],axis=1)

In [22]:
cat.isnull().sum()

week_day         0
month            0
hour             0
district         0
type_accident    0
vehicle          0
person           0
gendre           0
age              0
weather          0
road_status      0
dtype: int64

In [23]:
# data_final.head(1)

#### One Hot encoder for categoricals

In [24]:
# hot encoding categoricals
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(cat)
encoded_cat = encoder.transform(cat).toarray()
encoded_cat = pd.DataFrame(encoded_cat)
encoded_cat.columns = encoder.get_feature_names()

In [25]:
encoded_cat.head()

Unnamed: 0,x0_Mon,x0_Saturday,x0_Sunday,x0_Thursday,x0_Tuesday,x0_Wednesday,x1_August,x1_Feb,x1_Jan,x1_July,x1_June,x1_March,x1_May,x1_Sept,x2_From 10:00 to 10:59,x2_From 11:00 to 11:59,x2_From 12:00 to 12:59,x2_From 13:00 to 13:59,x2_From 14:00 to 14:59,x2_From 15:00 to 15:59,x2_From 16:00 to 16:59,x2_From 17:00 to 17:59,x2_From 18:00 to 18:59,x2_From 19:00 to 19:59,x2_From 1:00 to 1:59,x2_From 20:00 to 20:59,x2_From 21:00 to 21:59,x2_From 22:00 to 22:59,x2_From 23:00 to 23:59,x2_From 2:00 to 2:59,x2_From 3:00 to 3:59,x2_From 4:00 to 4:59,x2_From 5:00 to 5:59,x2_From 6:00 to 6:59,x2_From 7:00 to 7:59,x2_From 8:00 to 8:59,x2_From 9:00 to 9:59,x3_BARAJAS,x3_CARABANCHEL,x3_CENTRO,x3_CHAMARTIN,x3_CHAMBERI,x3_CIUDAD LINEAL,x3_FUENCARRAL-EL PARDO,x3_HORTALEZA,x3_LATINA,x3_MONCLOA-ARAVACA,x3_MORATALAZ,x3_PUENTE DE VALLECAS,x3_RETIRO,x3_SALAMANCA,x3_SAN BLAS,x3_TETUAN,x3_USERA,x3_VICALVARO,x3_VILLA DE VALLECAS,x3_VILLA DE VALLECAS.1,x3_VILLAVERDE,x4_double_colision,x4_motorcycle_fall,x4_multiple_colision,x4_object,x4_other,x4_run_over,x5_bus,x5_car,x5_moto,x5_others,x5_truck,x5_van,x6_pedestrian,x6_traveler,x7_masculine,x8_From 10 to 14 years,x8_From 15 to 17 years,x8_From 18 to 20 years,x8_From 21 to 24 years,x8_From 25 to 29 years,x8_From 30 to 34 years,x8_From 35 to 39 years,x8_From 40 to 44 years,x8_From 45 to 49 years,x8_From 50 to 54 years,x8_From 55 to 59 years,x8_From 6 to 9 years,x8_From 60 to 64 years,x8_From 65 to 69 years,x8_From 70 to 74 years,x8_From MtoS From 74 years,x8_FromSCONOCIDto,x9_fog,x9_hail,x9_ice,x9_other,x9_rain,x10_ice,x10_loose_gravel,x10_mud,x10_oil,x10_snow,x10_unknown,x10_wet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [26]:
encoded_cat.reset_index(drop=True, inplace=True)
num.reset_index(drop=True, inplace=True)

In [27]:
data_final=pd.concat([num,encoded_cat],axis=1)

In [28]:
# data_final.head(20)
data_final.isnull().sum()

cum_victims         0
victim_condition    0
x0_Mon              0
x0_Saturday         0
x0_Sunday           0
                   ..
x10_mud             0
x10_oil             0
x10_snow            0
x10_unknown         0
x10_wet             0
Length: 104, dtype: int64

### Split X/Y

In [29]:
#label and features
y=data_final['victim_condition']
X=data_final.drop(['victim_condition'],axis=1)

**Handling Data imbalance**

In [30]:
y.value_counts()

0    102716
1      9125
Name: victim_condition, dtype: int64

In [31]:
#Check the imbalance.
imbalance_check = pd.DataFrame(y.value_counts())
print('imbalanced percentage:',round(imbalance_check['victim_condition'][0]/imbalance_check['victim_condition'].sum()*100,2),'%')

imbalanced percentage: 91.84 %


91,84% of people are slightly injured in an accident.( We expected that)

Thus,every machine model that we create will have a huge tendency to say when it is in doubt that the individual will not be seriously injured.This is bad for us.

The cost of saying that a real mortal/seriously injured is not  is worst than predicting a slightly injured victim as highly injured.

More interest in getting better accuracy in 1 than in 0 in this case

In [34]:
#DOWNSAMPLING
from sklearn.utils import resample

category_0 = data_final[data_final['victim_condition'] == 0]
category_1 = data_final[data_final['victim_condition'] == 1]

category_0_downsampled = resample(category_0, replace=True, n_samples = len(category_1))
print(category_0_downsampled.shape)
print(category_1.shape)
data_downsampled = pd.concat([category_0_downsampled,category_1], axis=0)
print(data_downsampled['victim_condition'].value_counts())

(9125, 104)
(9125, 104)
0    9125
1    9125
Name: victim_condition, dtype: int64


In [35]:
data_downsampled.shape

(18250, 104)

In [36]:
#label and features
y=data_downsampled['victim_condition']
X=data_downsampled.drop(['victim_condition'],axis=1)

In [37]:
X

Unnamed: 0,cum_victims,x0_Mon,x0_Saturday,x0_Sunday,x0_Thursday,x0_Tuesday,x0_Wednesday,x1_August,x1_Feb,x1_Jan,x1_July,x1_June,x1_March,x1_May,x1_Sept,x2_From 10:00 to 10:59,x2_From 11:00 to 11:59,x2_From 12:00 to 12:59,x2_From 13:00 to 13:59,x2_From 14:00 to 14:59,x2_From 15:00 to 15:59,x2_From 16:00 to 16:59,x2_From 17:00 to 17:59,x2_From 18:00 to 18:59,x2_From 19:00 to 19:59,x2_From 1:00 to 1:59,x2_From 20:00 to 20:59,x2_From 21:00 to 21:59,x2_From 22:00 to 22:59,x2_From 23:00 to 23:59,x2_From 2:00 to 2:59,x2_From 3:00 to 3:59,x2_From 4:00 to 4:59,x2_From 5:00 to 5:59,x2_From 6:00 to 6:59,x2_From 7:00 to 7:59,x2_From 8:00 to 8:59,x2_From 9:00 to 9:59,x3_BARAJAS,x3_CARABANCHEL,x3_CENTRO,x3_CHAMARTIN,x3_CHAMBERI,x3_CIUDAD LINEAL,x3_FUENCARRAL-EL PARDO,x3_HORTALEZA,x3_LATINA,x3_MONCLOA-ARAVACA,x3_MORATALAZ,x3_PUENTE DE VALLECAS,x3_RETIRO,x3_SALAMANCA,x3_SAN BLAS,x3_TETUAN,x3_USERA,x3_VICALVARO,x3_VILLA DE VALLECAS,x3_VILLA DE VALLECAS.1,x3_VILLAVERDE,x4_double_colision,x4_motorcycle_fall,x4_multiple_colision,x4_object,x4_other,x4_run_over,x5_bus,x5_car,x5_moto,x5_others,x5_truck,x5_van,x6_pedestrian,x6_traveler,x7_masculine,x8_From 10 to 14 years,x8_From 15 to 17 years,x8_From 18 to 20 years,x8_From 21 to 24 years,x8_From 25 to 29 years,x8_From 30 to 34 years,x8_From 35 to 39 years,x8_From 40 to 44 years,x8_From 45 to 49 years,x8_From 50 to 54 years,x8_From 55 to 59 years,x8_From 6 to 9 years,x8_From 60 to 64 years,x8_From 65 to 69 years,x8_From 70 to 74 years,x8_From MtoS From 74 years,x8_FromSCONOCIDto,x9_fog,x9_hail,x9_ice,x9_other,x9_rain,x10_ice,x10_loose_gravel,x10_mud,x10_oil,x10_snow,x10_unknown,x10_wet
45223,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56048,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52293,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17799,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
78758,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111785,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111788,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111790,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111810,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
y.shape
X.shape

(18250,)

(18250, 103)

### Split TEST/TRAIN

In [40]:
#train/test
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size =0.2, random_state=100)
X_train.shape
X_test.shape

(14600, 103)

(3650, 103)

## Classification Model

In [46]:
#logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
classification = LogisticRegression(random_state=0, solver='lbfgs',multi_class='ovr').fit(X_train, y_train)

predictions = classification.predict(X_test)
cross_val_scores = cross_val_score(classification, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predictions))

0.6823972602739726
[[1164  686]
 [ 475 1325]]


In [47]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [48]:
print("precision: ",precision_score(y_test,predictions,average='macro' ))
print("recall: ",recall_score(y_test,predictions,average='macro'))
print("f1: ",f1_score(y_test,predictions,average='macro'))

precision:  0.6845326603619082
recall:  0.6826501501501502
f1:  0.6812977232715274


In [None]:
#Random Forest

In [42]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predictions))

RandomForestClassifier(max_depth=5, min_samples_leaf=20, min_samples_split=20)

0.6673287671232877
[[1110  740]
 [ 488 1312]]


In [43]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [44]:
print("precision: ",precision_score(y_test,predictions,average='macro' ))
print("recall: ",recall_score(y_test,predictions,average='macro'))
print("f1: ",f1_score(y_test,predictions,average='macro'))

precision:  0.666997245582319
recall:  0.6644444444444444
f1:  0.6625280385878429


In [45]:
pd.DataFrame(list(zip(clf.feature_importances_,X_train.columns)),columns=['score','name']).sort_values('score', ascending=False)

Unnamed: 0,score,name
66,0.283797,x5_car
68,0.147160,x5_others
64,0.110465,x4_run_over
71,0.095162,x6_pedestrian
67,0.078410,x5_moto
...,...,...
98,0.000000,x10_mud
56,0.000000,x3_VILLA DE VALLECAS
100,0.000000,x10_snow
101,0.000000,x10_unknown


In [59]:
Features_importances=pd.DataFrame(list(zip(clf.feature_importances_,X_train.columns)),columns=['score','name'])

In [63]:
Features_importances.sort_values('score', ascending=False).head(20)

Unnamed: 0,score,name
66,0.21353,x5_car
71,0.166128,x6_pedestrian
64,0.127454,x4_run_over
68,0.110595,x5_others
67,0.080207,x5_moto
72,0.064146,x6_traveler
59,0.039687,x4_double_colision
73,0.036742,x7_masculine
89,0.03544,x8_From MtoS From 74 years
0,0.029112,cum_victims
