In [1]:
import pandas as pd

In [5]:
# Load the processed dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    "./data/processed_with_const_op.csv",
    index_col=[0],
)
# Preview the first rows to sanity-check the columns.
data.head()

Unnamed: 0,age,sex,nature_of_inj,part_of_body,src_of_injury,event_type,evn_factor,hum_factor,occ_code,degree_of_inj,task_assigned,const_op,const_op_cause
31672,30,M,Puncture,FOOT/ANKLE,BODILY MOTION,OTHER,OTHER,"MISJUDGMENT, HAZ. SITUATION",Roofers,Hospitalized injury,Task regularly assigned,Roofing,Roofing
31673,34,M,Fracture,FOOT/ANKLE,OTHER,FALL(FROM ELEVATION),OTHER,EQUIP. INAPPROPR FOR OPERATION,Carpenters,Hospitalized injury,Task regularly assigned,Steel Erection Of Solid Web-Connecting,Steel Erection Of Solid Web-Connecting
31674,40,M,Electric Shock,HAND(S),OTHER,SHOCK,OTHER,"MISJUDGMENT, HAZ. SITUATION",Supervisors; electricians & power transm. inst...,Hospitalized injury,Task regularly assigned,Installing equipment (HVAC and other),Installing equipment (HVAC and other)
31870,20,M,Asphyxia,OTHBODYSYS,BUILDINGS/STRUCTURES,OTHER,OTHER,OTHER,Occupation not reported,Fatality,Task regularly assigned,Other Activities-Post Decking Detail Work,Other Activities-Post Decking Detail Work
31921,31,M,Electric Shock,HAND(S),ELEC APPARAT/WIRING,SHOCK,ILLUMINATION,"MISJUDGMENT, HAZ. SITUATION",Electricians,Fatality,Task not regularly assigned,"Interior plumbing, ducting, electrical work","Interior plumbing, ducting, electrical work"


## Correcting for Unhealthy user bias

## Multi-class classification task

In [31]:
# Model: degree_of_inj ~ age + sex + const_op + task_assigned + occ_code
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [28]:
# Predictors and target for the degree-of-injury classifiers.
feature_columns = ["age", "sex", "const_op", "task_assigned", "occ_code"]
X = data[feature_columns]
y = data["degree_of_inj"]
X

Unnamed: 0,age,sex,const_op,task_assigned,occ_code
31672,30,M,Roofing,Task regularly assigned,Roofers
31673,34,M,Steel Erection Of Solid Web-Connecting,Task regularly assigned,Carpenters
31674,40,M,Installing equipment (HVAC and other),Task regularly assigned,Supervisors; electricians & power transm. inst...
31870,20,M,Other Activities-Post Decking Detail Work,Task regularly assigned,Occupation not reported
31921,31,M,"Interior plumbing, ducting, electrical work",Task not regularly assigned,Electricians
...,...,...,...,...,...
203453,40,M,Backfilling and compacting,Task regularly assigned,"Paving, surfacing and tamping equipment operators"
203454,26,M,Exterior masonry,Task regularly assigned,Concrete and terrazzo finishers
203456,38,M,"Installing interior walls, ceilings, doors",Task regularly assigned,Occupation not reported
203470,39,M,"Installing plumbing, lighting fixtures",Task regularly assigned,Electricians


In [30]:
# One-hot encode the categorical predictors. drop_first=True drops the first
# level of every encoded column, leaving k-1 indicator columns per k levels.
X = pd.get_dummies(
    data=X,
    drop_first=True,
)
X

Unnamed: 0,age,sex_M,const_op_Bituminous concrete placement,"const_op_Construction of playing fields, tennis courts",const_op_Cutting concrete pavement,const_op_Demolition,const_op_Dredging,"const_op_Elevator, escalator installation",const_op_Emplacing reinforcing steel,const_op_Erecting structural steel,...,"occ_code_Tile setters, hard and soft",occ_code_Timber cutting and logging occupations,occ_code_Trade and industrial teachers,"occ_code_Truck drivers, heavy","occ_code_Truck drivers, light",occ_code_Typists,occ_code_Urban planners,occ_code_Vehicle washers and equipment cleaners,"occ_code_Weighers, measurers and checkers",occ_code_Welders and cutters
31672,30,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31673,34,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31674,40,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31870,20,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31921,31,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203453,40,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203454,26,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203456,38,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203470,39,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [33]:
# Baseline: random forest on the original degree_of_inj labels.
# random_state is pinned (same seed as the train/test split) so that the
# forest's internal randomness -- and therefore the report printed below --
# is reproducible after a kernel restart.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
# Per-class precision / recall / F1 on the held-out rows.
print(
    classification_report(
        y_true=y_test,
        y_pred=clf.predict(X_test),
    )
)

                         precision    recall  f1-score   support

               Fatality       0.53      0.51      0.52      2124
    Hospitalized injury       0.56      0.61      0.59      2509
Non Hospitalized injury       0.17      0.12      0.14       463

               accuracy                           0.53      5096
              macro avg       0.42      0.41      0.42      5096
           weighted avg       0.51      0.53      0.52      5096



In [41]:
from imblearn.ensemble import BalancedRandomForestClassifier
# Class-balancing random forest from imbalanced-learn, fitted on the same
# split as the baseline so the two reports are directly comparable.
# random_state is pinned (same seed as the train/test split) so the sampling
# inside the forest, and hence the printed metrics, can be reproduced.
balanced_clf = BalancedRandomForestClassifier(random_state=42)
balanced_clf.fit(X_train, y_train)
# Per-class precision / recall / F1 on the held-out rows.
print(
    classification_report(
        y_true=y_test,
        y_pred=balanced_clf.predict(X_test),
    )
)

                         precision    recall  f1-score   support

               Fatality       0.57      0.50      0.53      2124
    Hospitalized injury       0.59      0.38      0.46      2509
Non Hospitalized injury       0.13      0.45      0.20       463

               accuracy                           0.43      5096
              macro avg       0.43      0.44      0.40      5096
           weighted avg       0.54      0.43      0.47      5096



In [37]:
# Fatal vs Non-fatal: collapse both injury labels into a single "Non-fatal"
# class; every other label is left unchanged.
non_fatal_labels = {
    "Hospitalized injury": "Non-fatal",
    "Non Hospitalized injury": "Non-fatal",
}
y_fatal_train = y_train.replace(non_fatal_labels)
y_fatal_test = y_test.replace(non_fatal_labels)

In [39]:
# Binary task: Fatality vs Non-fatal, plain random forest.
# random_state is pinned (same seed as the train/test split) so the printed
# metrics are reproducible after a kernel restart.
fatal_clf = RandomForestClassifier(random_state=42)
fatal_clf.fit(X_train, y_fatal_train)
# Per-class precision / recall / F1 on the held-out rows.
print(
    classification_report(
        y_true=y_fatal_test,
        y_pred=fatal_clf.predict(X_test),
    )
)

              precision    recall  f1-score   support

    Fatality       0.54      0.48      0.51      2124
   Non-fatal       0.66      0.70      0.68      2972

    accuracy                           0.61      5096
   macro avg       0.60      0.59      0.59      5096
weighted avg       0.61      0.61      0.61      5096



In [42]:
# Binary task: Fatality vs Non-fatal, class-balancing random forest.
# random_state is pinned (same seed as the train/test split) so the sampling
# inside the forest, and hence the printed metrics, can be reproduced.
balanced_fatal_clf = BalancedRandomForestClassifier(random_state=42)
balanced_fatal_clf.fit(X_train, y_fatal_train)
# Per-class precision / recall / F1 on the held-out rows.
print(
    classification_report(
        y_true=y_fatal_test,
        y_pred=balanced_fatal_clf.predict(X_test),
    )
)

              precision    recall  f1-score   support

    Fatality       0.52      0.59      0.55      2124
   Non-fatal       0.68      0.61      0.64      2972

    accuracy                           0.60      5096
   macro avg       0.60      0.60      0.60      5096
weighted avg       0.61      0.60      0.60      5096



In [43]:
# Severe vs Non-severe: collapse hospitalizations and fatalities into a single
# "Severe" class; "Non Hospitalized injury" keeps its original label.
severe_labels = {
    "Hospitalized injury": "Severe",
    "Fatality": "Severe",
}
y_severe_train = y_train.replace(severe_labels)
y_severe_test = y_test.replace(severe_labels)

In [44]:
# Binary task: Severe vs Non Hospitalized injury, plain random forest.
# random_state is pinned (same seed as the train/test split) so the printed
# metrics are reproducible after a kernel restart.
severe_clf = RandomForestClassifier(random_state=42)
severe_clf.fit(X_train, y_severe_train)
# Per-class precision / recall / F1 on the held-out rows.
print(
    classification_report(
        y_true=y_severe_test,
        y_pred=severe_clf.predict(X_test),
    )
)

                         precision    recall  f1-score   support

Non Hospitalized injury       0.20      0.10      0.13       463
                 Severe       0.91      0.96      0.94      4633

               accuracy                           0.88      5096
              macro avg       0.56      0.53      0.53      5096
           weighted avg       0.85      0.88      0.86      5096



In [45]:
# Binary task: Severe vs Non Hospitalized injury, class-balancing random forest.
# random_state is pinned (same seed as the train/test split) so the sampling
# inside the forest, and hence the printed metrics, can be reproduced.
balanced_severe_clf = BalancedRandomForestClassifier(random_state=42)
balanced_severe_clf.fit(X_train, y_severe_train)
# Per-class precision / recall / F1 on the held-out rows.
print(
    classification_report(
        y_true=y_severe_test,
        y_pred=balanced_severe_clf.predict(X_test),
    )
)

                         precision    recall  f1-score   support

Non Hospitalized injury       0.12      0.60      0.20       463
                 Severe       0.93      0.57      0.70      4633

               accuracy                           0.57      5096
              macro avg       0.53      0.58      0.45      5096
           weighted avg       0.86      0.57      0.66      5096

