In [61]:
from sklearn import tree
import pandas as pd
import os
from sqlalchemy import create_engine

In [62]:
# Connect to the DB Server and retrieve table, export data to CSV.
engine = create_engine('postgresql+psycopg2://Grant_Nick_Proj3:Grant_Nick_Proj3@grantnickproj3.ckbobg8otu11.ap-southeast-2.rds.amazonaws.com/Project_3')
df = pd.read_sql_query('select * from "incident_data"',con=engine)

# Export the Full Table to CSV
df.to_csv('final_data.csv')

df.head(1)

Unnamed: 0,mine_id,Operational Process,Date,State,Resource Mined,Underground Method,Accident Injury Illness Classification,Accident Type,Injury Count,Sex,...,Days Lost,Description 1,Description 2,Experience Total,Experience Mine,Experience Job,Injury Classification,Operator Contractor,Commodity WorkStatus,General Incident Type
0,1200945,Dredging Activities,13/10/2020,Indiana,Sand & gravel,0,Handling material,Caught in-under-between a moving and stationar...,1,Female,...,0.0,EE WAS MOVING A 2X3X.75 STEEL PLATE OFF OF A P...,HE TIME OF THE INCIDENT.,1.15,1.15,1.15,NDL (No days lost),Operator,Sand & gravel operator,Handling material


## Model 1 - Nature of Injury (NOI)

In [63]:
NOI_target = df["Nature of Injury"]
data = df[["Sex","Age", "Occupation", "Operational Process"]]
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational Process
0,Female,20 - 29,Labourer Roles,Dredging Activities
1,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
2,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
3,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
4,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...


In [64]:
data_dummies = pd.get_dummies(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_dummies, NOI_target)

In [65]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.29280397022332505

In [66]:
# This is how we predict. 
rf.predict([[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]])

array(['Sprain, strains'], dtype=object)

In [67]:
# Export the Output Headers to Excel
X_train.to_csv("Machine Learning Output - Nature of Injury.csv", index=False)

In [68]:
# Show the importance of the columns
sorted(zip(rf.feature_importances_,data_dummies.columns))

[(0.002482783152535973,
  'Operational Process_Culm Banks Activities (Coal Mining Only)'),
 (0.003750018459550506, 'Operational Process_Non-Mine Workshops and Yards'),
 (0.004323291891028403, 'Occupation_Apprentice or Trainee Roles'),
 (0.005782914121079999, 'Age_80 - 89'),
 (0.00958750782715797, 'Operational Process_Other Surface Facility'),
 (0.010252271670940418, 'Age_70 - 79'),
 (0.0138991454700199, 'Occupation_Maritime Roles'),
 (0.01859600311250736, 'Occupation_Explosives Roles'),
 (0.02145439877462485, 'Sex_Female'),
 (0.023100809435919096, 'Sex_Male'),
 (0.023424349089472187, 'Age_<20'),
 (0.02394762143816128, 'Occupation_Other Role'),
 (0.026672939519933117, 'Occupation_Driller and Support Roles'),
 (0.026960423233546787, 'Operational Process_Office Located on Mine'),
 (0.03214051945444945, 'Operational Process_Dredging Activities'),
 (0.03331139914734232,
  'Occupation_Technical Services (Geology, Survey, Engineer, Lab. Tech, OHS etc.)'),
 (0.03838349553492695, 'Occupation_Mi

## Model 2 - Injured Body Part (IBP)

In [69]:
IBP_target = df["Injured Body Part"]
data = df[["Sex","Age", "Occupation", "Operational Process"]]
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational Process
0,Female,20 - 29,Labourer Roles,Dredging Activities
1,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
2,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
3,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
4,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...


In [70]:
data_dummies = pd.get_dummies(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_dummies, IBP_target)

In [71]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.23490488006617039

In [72]:
# This is how we predict. 
rf.predict([[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]])

array(['Wrist, Hand and Fingers'], dtype=object)

In [73]:
# Export the Output Headers to Excel
X_train.to_csv("Machine Learning Output - Injured Body Part.csv", index=False)

In [74]:
# Show the importance of the columns
sorted(zip(rf.feature_importances_,data_dummies.columns))

[(0.002334839733535528, 'Age_80 - 89'),
 (0.003405483147539442,
  'Operational Process_Culm Banks Activities (Coal Mining Only)'),
 (0.004068260721610495, 'Occupation_Apprentice or Trainee Roles'),
 (0.005286496952161909, 'Operational Process_Non-Mine Workshops and Yards'),
 (0.00927719918360673, 'Age_70 - 79'),
 (0.011698720326779776, 'Operational Process_Other Surface Facility'),
 (0.015756420723918765, 'Occupation_Maritime Roles'),
 (0.020761728451404178, 'Age_<20'),
 (0.02105063149345905, 'Occupation_Other Role'),
 (0.02177676764101375, 'Occupation_Explosives Roles'),
 (0.02631508051594612, 'Operational Process_Office Located on Mine'),
 (0.029581229102170217, 'Occupation_Driller and Support Roles'),
 (0.030351112283334966, 'Operational Process_Dredging Activities'),
 (0.03349027664228626, 'Sex_Male'),
 (0.03379890614716281, 'Occupation_Mine Supervisory and Management Roles'),
 (0.03464990134521028, 'Sex_Female'),
 (0.03641425437919529,
  'Occupation_Technical Services (Geology, Su

## Model 3 - Source of Injury (SOI)

In [75]:
SOI_target = df["Source of Injury"]
data = df[["Sex","Age", "Occupation", "Operational Process"]]
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational Process
0,Female,20 - 29,Labourer Roles,Dredging Activities
1,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
2,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
3,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
4,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...


In [76]:
data_dummies = pd.get_dummies(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_dummies, SOI_target)

In [77]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.14640198511166252

In [78]:
# This is how we predict. 
rf.predict([[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]])

array(['Caving rock, coal, ore, waste'], dtype=object)

In [79]:
# Export the Output Headers to Excel
X_train.to_csv("Machine Learning Output - Source of Injury.csv", index=False)

In [80]:
sorted(zip(rf.feature_importances_,data_dummies.columns))


[(0.0010746827764862314,
  'Operational Process_Culm Banks Activities (Coal Mining Only)'),
 (0.0021691964336285553, 'Age_80 - 89'),
 (0.0033105882471946035, 'Operational Process_Non-Mine Workshops and Yards'),
 (0.0035694577172941185, 'Occupation_Apprentice or Trainee Roles'),
 (0.0073555079179368765, 'Operational Process_Other Surface Facility'),
 (0.010676412288536408, 'Age_70 - 79'),
 (0.012502150825483805, 'Occupation_Maritime Roles'),
 (0.018295113377840892, 'Occupation_Explosives Roles'),
 (0.018745003857535376, 'Occupation_Other Role'),
 (0.01889934697265808, 'Operational Process_Office Located on Mine'),
 (0.0215102869354022, 'Operational Process_Dredging Activities'),
 (0.02350977990249253, 'Occupation_Driller and Support Roles'),
 (0.023565597879666368, 'Age_<20'),
 (0.026666797973611697, 'Sex_Male'),
 (0.02671029752940619, 'Sex_Female'),
 (0.027097979795897696,
  'Occupation_Technical Services (Geology, Survey, Engineer, Lab. Tech, OHS etc.)'),
 (0.0295963941615574,
  'Oper

## Model 4 - Activity (ACT)

In [81]:
ACT_target = df["Activity"]
data = df[["Sex","Age", "Occupation", "Operational Process"]]
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational Process
0,Female,20 - 29,Labourer Roles,Dredging Activities
1,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
2,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
3,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...
4,Female,30 - 39,Labourer Roles,Crushing or Processing Facility (Incl. Associa...


In [82]:
data_dummies = pd.get_dummies(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_dummies, ACT_target)

In [83]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.22249793217535152

In [84]:
# This is how we predict. 
rf.predict([[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]])

array(['Handling supplies or material, load and unload'], dtype=object)

In [85]:
# Export the Output Headers to Excel
X_train.to_csv("Machine Learning Output - Activity.csv", index=False)

In [86]:
sorted(zip(rf.feature_importances_,data_dummies.columns))

[(0.0, 'Operational Process_Culm Banks Activities (Coal Mining Only)'),
 (0.0021682438627740053, 'Operational Process_Non-Mine Workshops and Yards'),
 (0.00322749692939162, 'Occupation_Apprentice or Trainee Roles'),
 (0.005372246671286388, 'Age_80 - 89'),
 (0.009075450523317865, 'Occupation_Maritime Roles'),
 (0.010155993545305202, 'Operational Process_Other Surface Facility'),
 (0.011003770797178769, 'Age_70 - 79'),
 (0.017114387911634898, 'Occupation_Explosives Roles'),
 (0.01715902487042439, 'Occupation_Other Role'),
 (0.020760097244429718, 'Age_<20'),
 (0.021429524928305948, 'Occupation_Driller and Support Roles'),
 (0.02289630078403155, 'Operational Process_Dredging Activities'),
 (0.02317163502900554, 'Operational Process_Office Located on Mine'),
 (0.026192037185472868, 'Sex_Male'),
 (0.02646712199871931, 'Occupation_Mine Supervisory and Management Roles'),
 (0.026799037778310827, 'Sex_Female'),
 (0.028234189262092414,
  'Occupation_Technical Services (Geology, Survey, Engineer,