In [11]:
import matplotlib as plt
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.preprocessing import OneHotEncoder
from libs.ds_charts import get_variable_types

# Missing values imputation

### Dataset 1

In [21]:
# Load the raw collisions table; the literal string 'NaN' is read as missing.
dataset_1 = pd.read_csv('dataset_1/NYC_collisions_tabular.csv', na_values='NaN')
# Work on a copy so the raw frame stays available unchanged.
new_dataset_1 = dataset_1.copy()

#FIND VARIABLES WITH MISSING VALUES
# mv maps every column that has at least one missing value to its missing count.
mv = {var: nr for var, nr in new_dataset_1.isna().sum().items() if nr > 0}

#DISCARD COLUMNS WITH MORE THAN 85% MISSING VALUES
threshold = new_dataset_1.shape[0] * 0.85

missings = [c for c, nr_missing in mv.items() if nr_missing > threshold]
new_dataset_1.drop(columns=missings, inplace=True)
print('Dropped variables', missings)

#DISCARD RECORDS WITH MAJORITY OF MISSING VALUES
# A row is kept only if at least half of the remaining columns are non-missing.
threshold = new_dataset_1.shape[1] * 0.50

new_dataset_1.dropna(thresh=threshold, inplace=True)
print(new_dataset_1.shape)

# NOTE: the imputations below assign the filled column back to the frame.
# Calling fillna(..., inplace=True) on a selected column is a chained in-place
# operation: it is deprecated in pandas 2.x and does not update the frame when
# Copy-on-Write is enabled.

#PERSON_AGE
# The mean (truncated to an integer) is computed on the raw table, i.e. before
# any column or row was discarded above.
person_age = dataset_1['PERSON_AGE']
mean_ages = int(person_age.mean())
new_dataset_1['PERSON_AGE'] = new_dataset_1['PERSON_AGE'].fillna(mean_ages)

#SAFETY_EQUIPMENT
new_dataset_1['SAFETY_EQUIPMENT'] = new_dataset_1['SAFETY_EQUIPMENT'].fillna('Unknown')

#EJECTION
new_dataset_1['EJECTION'] = new_dataset_1['EJECTION'].fillna('Not Ejected')

#VEHICLE_ID
# Drop the rows that have no VEHICLE_ID. Dropping on the selected column
# (new_dataset_1['VEHICLE_ID'].dropna(inplace=True)) never removed any row
# from the frame itself.
new_dataset_1.dropna(subset=['VEHICLE_ID'], inplace=True)

#POSITION IN VEHICLE
new_dataset_1['POSITION_IN_VEHICLE'] = new_dataset_1['POSITION_IN_VEHICLE'].fillna('Unknown')

# Finally discard every row that still has a missing value in any column.
new_dataset_1.dropna(inplace=True)


Dropped variables ['PED_LOCATION', 'CONTRIBUTING_FACTOR_2', 'CONTRIBUTING_FACTOR_1', 'PED_ACTION']
(45669, 17)
CRASH_DATE             0
CRASH_TIME             0
PERSON_AGE             0
BODILY_INJURY          0
SAFETY_EQUIPMENT       0
PERSON_SEX             0
PERSON_TYPE            0
EJECTION               0
COMPLAINT              0
EMOTIONAL_STATUS       0
VEHICLE_ID             0
PERSON_ID              0
POSITION_IN_VEHICLE    0
PED_ROLE               0
UNIQUE_ID              0
COLLISION_ID           0
PERSON_INJURY          0
dtype: int64


### Dummification

In [18]:
# Base name used when building output file names for this dataset.
file = 'nyc_collisions'
# Path of the dataset CSV inside the data/ folder.
filename = 'data/nyc_collisions.csv'
# Columns that are one-hot encoded by the dummification step.
symbolic_vars = [
    'BODILY_INJURY',
    'SAFETY_EQUIPMENT',
    'PERSON_SEX',
    'PERSON_TYPE',
    'EJECTION',
    'COMPLAINT',
    'EMOTIONAL_STATUS',
    'POSITION_IN_VEHICLE',
    'PED_ROLE',
    'PERSON_INJURY',
]

def dummify(df, vars_to_dummify):
    """One-hot encode the selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is only read, never modified.
    vars_to_dummify : list of str
        Names of the columns to replace by indicator columns.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` that are not in ``vars_to_dummify`` (in their
        original order) followed by the indicator columns produced by the
        encoder, aligned on the index of ``df``.
    """
    other_vars = [c for c in df.columns if c not in vars_to_dummify]

    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
    # the old keyword. Try the current name first and fall back for older
    # releases, where the unknown keyword raises TypeError.
    try:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=bool)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=bool)

    X = df[vars_to_dummify]
    encoder.fit(X)
    new_vars = encoder.get_feature_names_out(vars_to_dummify)
    trans_X = encoder.transform(X)

    # Reuse the input index so the concat below aligns row by row.
    dummy = pd.DataFrame(trans_X, columns=new_vars, index=X.index)
    dummy = dummy.convert_dtypes(convert_boolean=True)

    final_df = pd.concat([df[other_vars], dummy], axis=1)
    return final_df

# Variable-type summary of the imputed frame (kept in `variables`; not used
# further in this cell).
variables = get_variable_types(new_dataset_1)

# One-hot encode the symbolic columns and persist the result without the index.
df = dummify(new_dataset_1, symbolic_vars)
dummified_path = f'data/{file}_dummified.csv'
df.to_csv(dummified_path, index=False)

# Per-column count of missing values left after dummification.
nr = df.isna().sum()
print(nr)

CRASH_DATE                  0
CRASH_TIME                  0
PERSON_AGE                  0
VEHICLE_ID               6571
PERSON_ID                   0
                         ... 
PED_ROLE_Other              0
PED_ROLE_Passenger          0
PED_ROLE_Pedestrian         0
PERSON_INJURY_Injured       0
PERSON_INJURY_Killed        0
Length: 95, dtype: int64
