In [81]:
import numpy as np
from acquisition import Acquisition
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.utils import class_weight

In [82]:
acq = Acquisition()

  acq = Acquisition()


# Users preprocessing

In [83]:
df_users = acq.get_users()

In [84]:

# Recode the target `grav` so the class labels are the consecutive integers 1..3:
#   2 -> 3  merges the rows coded 2 into class 3 (original note: merge the death
#           and hospitalized classes),
#   4 -> 2  moves the rows coded 4 into the label freed by that merge.
# A dict-based replace maps every value from its ORIGINAL code, so the two rules
# cannot cascade. The result is assigned back to the column because a chained
# `df.col.replace(..., inplace=True)` operates on an intermediate Series and does
# not update the parent frame when pandas Copy-on-Write is active.
df_users["grav"] = df_users["grav"].replace({2: 3, 4: 2})
df_users["grav"].value_counts()

1    764874
2    666873
3    444258
Name: grav, dtype: int64

In [85]:
df_users['grav'].describe()

count    1.876005e+06
mean     1.829096e+00
std      7.844216e-01
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      2.000000e+00
max      3.000000e+00
Name: grav, dtype: float64

In [86]:
# Remove user-level columns that have missing information or are not used as features.
df_users.drop(
    columns=["place", "trip", "locp", "actp", "etatp", "num_veh", "secu"],
    inplace=True,
)
# Collapse to one row per accident: GroupBy.first() keeps, per column, the first
# non-null value found within each `Num_Acc` group.
# Original author's note: keeping only the first entry "gave worse results" —
# TODO confirm which alternative this was compared against.
df_users = df_users.groupby("Num_Acc", as_index=False).first()

In [87]:
# Discard every accident row that still has at least one missing value.
df_users.dropna(axis=0, how="any", inplace=True)

# Places preprocessing

In [88]:
# Load the places table, then remove the columns that have missing information
# or are not used as features.
df_places = acq.get_places()
df_places.drop(
    columns=[
        "voie", "v1", "v2", "pr", "pr1", "vosp",
        "plan", "lartpc", "larrout", "situ", "env1",
    ],
    inplace=True,
)

In [89]:
df_places.isna().sum()

Num_Acc       0
catr          1
circ        798
nbv        1790
prof       1061
surf       1017
infra      1278
dtype: int64

In [90]:
# Discard every places row that has at least one missing value.
df_places.dropna(axis=0, how="any", inplace=True)

In [91]:
# List the distinct `circ` codes present after dropping incomplete rows.
df_places.circ.unique() # a code of 0 is not expected here (open question about the Kaggle dataset); such rows are filtered out in a later cell


array([2., 1., 0., 3., 4.])

In [92]:
# Store `circ` as 64-bit integers (it was loaded as float), then summarise it.
df_places = df_places.astype({"circ": np.int64})
df_places["circ"].describe()


count    837362.000000
mean          1.854929
std           0.720794
min           0.000000
25%           2.000000
50%           2.000000
75%           2.000000
max           4.000000
Name: circ, dtype: float64

In [93]:
# Keep only the rows whose `circ` code is non-zero.
df_places = df_places.loc[df_places["circ"].ne(0)]

In [94]:
df_places.circ.describe()

count    792909.000000
mean          1.958922
std           0.587330
min           1.000000
25%           2.000000
50%           2.000000
75%           2.000000
max           4.000000
Name: circ, dtype: float64

In [95]:
df_places.isna().sum()

Num_Acc    0
catr       0
circ       0
nbv        0
prof       0
surf       0
infra      0
dtype: int64

# Characteristics preprocessing

In [96]:
# Load the accident characteristics table, then remove the columns that have
# missing information or are not used as features.
df_carac = acq.get_accident_caracteristics()
df_carac.drop(
    columns=["an", "col", "com", "adr", "gps", "lat", "long"],
    inplace=True,
)


In [97]:
df_carac.isna().sum()

Num_Acc     0
mois        0
jour        0
hrmn        0
lum         0
agg         0
int         0
atm        21
dep         0
dtype: int64

In [98]:
# Discard every characteristics row that has at least one missing value.
df_carac.dropna(axis=0, how="any", inplace=True)

In [99]:
df_carac.isna().sum()


Num_Acc    0
mois       0
jour       0
hrmn       0
lum        0
agg        0
int        0
atm        0
dep        0
dtype: int64

In [100]:
# Split the combined `hrmn` value into two text columns: the last two characters
# become the minutes (`mn`) and everything before them becomes the hour (`hr`).
# The combined column is dropped afterwards.
df_carac = df_carac.assign(
    mn=df_carac['hrmn'].astype(str).str.slice(start=-2),
    hr=df_carac['hrmn'].astype(str).str.slice(stop=-2),
).drop(columns='hrmn')

In [101]:
df_carac['hr'].unique()

array(['18', '19', '11', '10', '8', '16', '12', '7', '20', '15', '9', '5',
       '13', '17', '21', '3', '6', '14', '22', '2', '1', '4', '23', ''],
      dtype=object)

In [102]:
# Values with fewer than three digits leave an empty hour string after the split
# (the first hour of the day), so map '' to '0'.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `df_carac['hr']` acts on an intermediate Series and does not update
# `df_carac` when pandas Copy-on-Write is active.
df_carac['hr'] = df_carac['hr'].replace(to_replace='', value='0')

In [103]:
df_carac['hr'].unique()


array(['18', '19', '11', '10', '8', '16', '12', '7', '20', '15', '9', '5',
       '13', '17', '21', '3', '6', '14', '22', '2', '1', '4', '23', '0'],
      dtype=object)

# Converting to int64

In [104]:
df_users.dtypes

Num_Acc      int64
catu         int64
grav         int64
sex          int64
an_nais    float64
dtype: object

In [105]:
# Store `an_nais` as 64-bit integers (it was float64; rows with missing values
# were dropped in an earlier cell).
df_users["an_nais"] = df_users["an_nais"].astype(np.int64)


In [106]:
df_users.dtypes

Num_Acc    int64
catu       int64
grav       int64
sex        int64
an_nais    int64
dtype: object

In [107]:
df_carac.dtypes

Num_Acc      int64
mois         int64
jour         int64
lum          int64
agg          int64
int          int64
atm        float64
dep          int64
mn          object
hr          object
dtype: object

In [108]:
# Convert the float `atm` column and the text `mn` / `hr` columns to 64-bit integers.
df_carac = df_carac.astype({'atm': np.int64, 'mn': np.int64, 'hr': np.int64})


In [109]:
df_carac.dtypes

Num_Acc    int64
mois       int64
jour       int64
lum        int64
agg        int64
int        int64
atm        int64
dep        int64
mn         int64
hr         int64
dtype: object

In [110]:
df_places.dtypes

Num_Acc      int64
catr       float64
circ         int64
nbv        float64
prof       float64
surf       float64
infra      float64
dtype: object

In [111]:
# Convert every column of the places table to 64-bit integers in one step.
# `df_places` is the result of a boolean filter, so assigning columns back into
# it (`df_places[cols] = ...`) can raise SettingWithCopyWarning; building a new
# frame with `astype` yields the same dtypes without writing into a possible view.
df_places = df_places.astype(np.int64)


In [112]:
df_places.dtypes

Num_Acc    int64
catr       int64
circ       int64
nbv        int64
prof       int64
surf       int64
infra      int64
dtype: object

# Merge dataframes

In [113]:
# Give all three tables a fresh 0..n-1 index, discarding the old one.
df_users, df_carac, df_places = (
    frame.reset_index(drop=True) for frame in (df_users, df_carac, df_places)
)
# Inner-join characteristics -> places -> users on the accident identifier, so
# only accidents present in all three tables survive.
df_x_y = (
    df_carac
    .merge(df_places, how="inner", on="Num_Acc")
    .merge(df_users, how="inner", on="Num_Acc")
)


In [114]:
df_x_y.head(15)

Unnamed: 0,Num_Acc,mois,jour,lum,agg,int,atm,dep,mn,hr,catr,circ,nbv,prof,surf,infra,catu,grav,sex,an_nais
0,201600000002,3,16,1,2,6,1,590,0,18,3,1,0,1,1,0,1,3,1,1960
1,201600000003,7,13,1,1,1,1,590,0,19,3,2,2,1,2,0,1,1,1,1997
2,201600000004,8,15,2,2,1,7,590,30,19,4,2,0,1,1,0,1,3,1,1999
3,201600000006,12,23,1,2,1,7,590,15,11,3,2,0,1,1,0,1,1,1,1957
4,201600000007,5,1,1,2,1,7,590,45,11,3,2,0,1,1,0,1,1,1,2001
5,201600000008,5,14,2,1,1,1,590,15,19,3,2,2,1,1,0,1,1,2,1969
6,201600000009,9,23,1,2,1,1,590,0,19,4,2,2,1,1,0,1,1,1,1984
7,201600000010,12,30,1,1,1,9,590,30,10,4,2,0,0,7,0,1,1,1,1973
8,201600000011,1,25,2,2,1,8,590,0,8,3,2,2,1,2,0,1,1,1,1966
9,201600000012,1,28,3,1,1,1,590,15,18,3,2,2,0,1,0,1,3,1,1990


In [115]:
df_x_y.shape[0]

340687

In [116]:
df_x_y.isna().sum()

Num_Acc    0
mois       0
jour       0
lum        0
agg        0
int        0
atm        0
dep        0
mn         0
hr         0
catr       0
circ       0
nbv        0
prof       0
surf       0
infra      0
catu       0
grav       0
sex        0
an_nais    0
dtype: int64

In [117]:
# Safety net: discard any merged row with a missing value (the check in the
# previous cell reports none).
df_x_y.dropna(axis=0, how="any", inplace=True)

In [118]:
# The accident identifier was only needed as the join key; remove it so it is
# not used as a feature.
df_x_y.drop(columns='Num_Acc', inplace=True)

In [119]:
# Check whether the target classes are imbalanced by counting the rows per `grav` label.
df_x_y.grav.value_counts() # an uneven distribution is the reason for the stratified splits / StratifiedKFold

1    134857
3    125996
2     79834
Name: grav, dtype: int64

# Create train, validation and test sets

In [120]:
# Cross-validation splitter with default settings (only referenced by the
# disabled search code further down).
cv = RepeatedStratifiedKFold()
# Target: the recoded severity label.
y = df_x_y['grav']
# Features: everything except the target and the columns that feature selection
# showed to be unimportant.
X = df_x_y.drop(columns=['grav', 'catu', 'an_nais', 'sex', 'infra'])

In [121]:
# Training / validation / test split: 40% of the rows go to training, and the
# remaining 60% is halved into validation and test (30% each). Both steps are
# stratified on the label so each split keeps the class proportions.
# A fixed seed makes the split, and every metric computed from it, reproducible
# across kernel restarts.
SPLIT_SEED = 42
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.4, stratify=y, random_state=SPLIT_SEED)
X_val, X_test, y_val, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, stratify=y_rem, random_state=SPLIT_SEED)


In [122]:
# Compute 'balanced' class weights from the training labels only.
train_labels = np.unique(y_train)
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=train_labels,
                                                  y=y_train)
# Key each weight by the label it was computed for. `compute_class_weight`
# returns the weights in the order of `classes`, so zipping with the labels is
# correct for any label set; numbering the keys 1..n is only right when the
# labels happen to be exactly 1..n. `.tolist()` yields plain Python scalars.
class_weights = dict(zip(train_labels.tolist(), class_weights.tolist()))

In [123]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test, X_val = (
    scaler.transform(split) for split in (X_train, X_test, X_val)
)

# Fit

In [124]:

# L2-regularized logistic regression using the liblinear solver; the per-class
# weights computed above compensate for the class imbalance.
model = LogisticRegression(
    max_iter=1000, class_weight=class_weights, penalty='l2', solver='liblinear')
# --- Disabled hyper-parameter search / cross-validation experiment ---
# NOTE(review): GridSearchCV, cross_val_score and hyperparam_grid are not
# imported or defined in the visible cells, so these lines cannot simply be
# uncommented — confirm before re-enabling.
#grid = GridSearchCV(model, hyperparam_grid, cv=cv, scoring='f1')
#cv_model = cross_val_score(model,X,y,cv=cv, scoring='f1_micro')
#grid.fit(X_train, y_train)
#print(grid.best_score_)
#print(grid.best_params_)
#print(cv_model)
# Fit on the standardized training split.
model.fit(X_train, y_train)

LogisticRegression(class_weight={1: 0.8420863998418083, 2: 1.42249919101452,
                                 3: 0.9013188354035213},
                   max_iter=1000, solver='liblinear')

In [125]:
# test

In [126]:
# Predict on the standardized test split and report the micro-averaged F1 score.
y_pred = model.predict(X_test)
score = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
print('f1_score :', score)

f1_score : 0.5032825540325027


In [127]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.51      0.59      0.55     40457
           2       0.34      0.05      0.09     23951
           3       0.51      0.69      0.59     37799

    accuracy                           0.50    102207
   macro avg       0.45      0.45      0.41    102207
weighted avg       0.47      0.50      0.46    102207



# Feature selection

In [128]:
# from sklearn.feature_selection import RFE
# predictors = X_train
# selector = RFE(model, n_features_to_select=1)
# selector = selector.fit(predictors, y_train)


In [129]:
# order = selector.ranking_
# feature_ranks = []
# for i in order:
#     feature_ranks.append(f"{i-1}. {X.columns[i-1]}")
# feature_ranks


# Validation

In [130]:
# Predict on the standardized validation split, then print the per-class report
# followed by the micro-averaged F1 score.
y_pred = model.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))
score = f1_score(y_true=y_val, y_pred=y_pred, average='micro')
print('f1_score :', score)

              precision    recall  f1-score   support

           1       0.51      0.59      0.55     40457
           2       0.32      0.05      0.09     23950
           3       0.51      0.69      0.59     37799

    accuracy                           0.50    102206
   macro avg       0.45      0.45      0.41    102206
weighted avg       0.47      0.50      0.45    102206

f1_score : 0.5023579828972858


# Serialize

In [131]:
import pickle

# Persist the fitted classifier. The file name and its content (the bare model
# object) are unchanged, so existing loaders keep working.
# NOTE: only unpickle these files from a trusted source; loading a pickle can
# execute arbitrary code.
filename = 'trained_linear.sav'
with open(filename, 'wb') as file:
    pickle.dump(model, file)

# The model was trained on standardized features, so raw inputs must pass
# through the same fitted scaler before `model.predict`. Save it alongside the
# model; without it the serialized model cannot be applied correctly to new data.
scaler_filename = 'trained_linear_scaler.sav'
with open(scaler_filename, 'wb') as file:
    pickle.dump(scaler, file)

In [132]:
X

Unnamed: 0,mois,jour,lum,agg,int,atm,dep,mn,hr,catr,circ,nbv,prof,surf
0,3,16,1,2,6,1,590,0,18,3,1,0,1,1
1,7,13,1,1,1,1,590,0,19,3,2,2,1,2
2,8,15,2,2,1,7,590,30,19,4,2,0,1,1
3,12,23,1,2,1,7,590,15,11,3,2,0,1,1
4,5,1,1,2,1,7,590,45,11,3,2,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340682,12,7,1,1,3,1,971,0,17,2,2,2,3,1
340683,12,6,1,2,1,1,973,0,17,4,2,2,1,1
340684,12,19,1,1,1,1,973,10,14,2,2,2,0,1
340685,12,22,2,2,1,1,974,15,19,3,1,0,0,1


In [133]:
y

0         3
1         1
2         3
3         1
4         1
         ..
340682    1
340683    3
340684    1
340685    1
340686    1
Name: grav, Length: 340687, dtype: int64