In [7]:
from sklearn import tree
import pandas as pd
import pickle
import os
from sqlalchemy import create_engine

In [8]:
# Connect to the DB Server and retrieve table, export data to CSV.
engine = create_engine('postgresql+psycopg2://Grant_Nick_Proj3:Grant_Nick_Proj3@grantnickproj3.ckbobg8otu11.ap-southeast-2.rds.amazonaws.com/Project_3')
df = pd.read_sql_query('select * from "incident_data"',con=engine)

# Export the Full Table to CSV
df.to_csv('final_data.csv')

df.head(1)

Unnamed: 0,mine_id,Operational_Process,Date,State,Resource_Mined,Underground_Method,Accident_Injury_Illness_Classification,Accident_Type,Injury_Count,Sex,...,Days_Lost,Description_1,Description_2,Experience_Total,Experience_Mine,Experience_Job,Injury_Classification,Operator_Contractor,Commodity_WorkStatus,General_Incident_Type
0,1200945,Dredging_Activities,13/10/2020,Indiana,Sand & gravel,0,Handling material,Caught in-under-between a moving and stationar...,1,Female,...,0.0,EE WAS MOVING A 2X3X.75 STEEL PLATE OFF OF A P...,HE TIME OF THE INCIDENT.,1.15,1.15,1.15,NDL (No days lost),Operator,Sand & gravel operator,Handling material


## Model 1 - Nature of Injury (NOI)

In [9]:
NOI_target = df["Nature_of_Injury"]
data = df[["Sex","Age", "Occupation", "Operational_Process"]]
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational_Process
0,Female,20_29,Labourer_Roles,Dredging_Activities
1,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
2,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
3,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
4,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...


In [10]:
data_dummies = pd.get_dummies(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_dummies, NOI_target)

In [11]:
# Import Random Forrest, then provide a score for the model accuracy
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.3002481389578164

In [12]:
# This is how we predict and example based on default value
NOI_Prediction = rf.predict([[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]])

# Export the Output Headers to Excel
X_train.to_csv("Machine Learning Output - Nature of Injury.csv", index=False)

# Export the model using pickle
with open('../NOI_rf.pickle', 'wb') as file:
    pickle.dump(rf, file)

# Show the importance of the columns
sorted(zip(rf.feature_importances_,data_dummies.columns))

[(0.0023983098500361486,
  'Operational_Process_Culm_Banks_Activities_Coal_Mining_Only'),
 (0.004104966939723136, 'Operational_Process_Non_Mine_Workshops_and_Yards'),
 (0.00473491381537721, 'Occupation_Apprentice_or_Trainee_Roles'),
 (0.005116893158519181, 'Age_80_89'),
 (0.009209685219105718, 'Age_70_79'),
 (0.011569784606213988, 'Occupation_Maritime_Roles'),
 (0.014660972587677014, 'Operational_Process_Other_Surface_Facility'),
 (0.022374814748000092, 'Occupation_Explosives_Roles'),
 (0.023379551801293696, 'Occupation_Other_Role'),
 (0.02434686456946614, 'Age_Under_20'),
 (0.024671358264174156, 'Sex_Male'),
 (0.026560453053544122, 'Sex_Female'),
 (0.027013159670048525, 'Operational_Process_Office_Located_on_Mine'),
 (0.027237066539501434, 'Occupation_Driller_and_Support_Roles'),
 (0.03233812910808721, 'Occupation_Technical_Services_Roles'),
 (0.032757211349518076, 'Operational_Process_Dredging_Activities'),
 (0.035048798977230476, 'Occupation_Mine_Supervisory_and_Management_Roles'),


## Model 2 - Injured Body Part (IBP)

In [13]:
IBP_target = df["Injured_Body_Part"]
data = df[["Sex","Age", "Occupation", "Operational_Process"]]
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational_Process
0,Female,20_29,Labourer_Roles,Dredging_Activities
1,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
2,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
3,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
4,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...


In [14]:
data_dummies = pd.get_dummies(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_dummies, IBP_target)

In [15]:
# Import Random Forrest, then provide a score for the model accuracy
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.24896608767576509

In [16]:
# This is how we predict and example based on default value
IBP_Prediction = rf.predict([[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]])

# Export the Output Headers to Excel
X_train.to_csv("Machine Learning Output - Injured Body Part.csv", index=False)

# Export the model using pickle
with open('../IBP_rf.pickle', 'wb') as file:
    pickle.dump(rf, file)

# Show the importance of the columns
sorted(zip(rf.feature_importances_,data_dummies.columns))

[(0.0034016978398285404,
  'Operational_Process_Culm_Banks_Activities_Coal_Mining_Only'),
 (0.004274736017486355, 'Occupation_Apprentice_or_Trainee_Roles'),
 (0.004873317813073845, 'Operational_Process_Non_Mine_Workshops_and_Yards'),
 (0.004875332599875107, 'Age_80_89'),
 (0.01167123879749271, 'Age_70_79'),
 (0.014365755234880987, 'Operational_Process_Other_Surface_Facility'),
 (0.016183008283974558, 'Occupation_Maritime_Roles'),
 (0.01919566948249146, 'Occupation_Other_Role'),
 (0.019281096128429473, 'Age_Under_20'),
 (0.020573444020756837, 'Occupation_Explosives_Roles'),
 (0.028360434403889724, 'Operational_Process_Office_Located_on_Mine'),
 (0.029800899010214134, 'Operational_Process_Dredging_Activities'),
 (0.03247393704970036, 'Occupation_Driller_and_Support_Roles'),
 (0.03376093297880412, 'Occupation_Technical_Services_Roles'),
 (0.03460573080623559, 'Sex_Female'),
 (0.03487081600577125, 'Sex_Male'),
 (0.035858867938841944, 'Occupation_Mine_Supervisory_and_Management_Roles'),
 (0

## Model 3 - Source of Injury (SOI)

In [17]:
SOI_target = df["Source_of_Injury"]
data = df[["Sex","Age", "Occupation", "Operational_Process"]]
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational_Process
0,Female,20_29,Labourer_Roles,Dredging_Activities
1,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
2,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
3,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
4,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...


In [18]:
data_dummies = pd.get_dummies(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_dummies, SOI_target)

In [19]:
# Import Random Forrest, then provide a score for the model accuracy
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.13895781637717122

In [20]:
# This is how we predict and example based on default value
SOI_Prediction = rf.predict([[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]])

# Export the Output Headers to Excel
X_train.to_csv("Machine Learning Output - Source of Injury.csv", index=False)

# Export the model using pickle
with open('../SOI_rf.pickle', 'wb') as file:
    pickle.dump(rf, file)

# Show Model Value importances
sorted(zip(rf.feature_importances_,data_dummies.columns))

[(0.002169329012941814, 'Occupation_Apprentice_or_Trainee_Roles'),
 (0.0023123200443384783,
  'Operational_Process_Culm_Banks_Activities_Coal_Mining_Only'),
 (0.003374137671427144, 'Age_80_89'),
 (0.0040839857663219, 'Operational_Process_Non_Mine_Workshops_and_Yards'),
 (0.008791132412862024, 'Operational_Process_Other_Surface_Facility'),
 (0.011606087703066752, 'Occupation_Maritime_Roles'),
 (0.011824762250664665, 'Age_70_79'),
 (0.017681817491452385, 'Occupation_Explosives_Roles'),
 (0.018431231002001756, 'Occupation_Other_Role'),
 (0.01959815204657135, 'Operational_Process_Office_Located_on_Mine'),
 (0.021687516165270315, 'Operational_Process_Dredging_Activities'),
 (0.024327764892919868, 'Occupation_Driller_and_Support_Roles'),
 (0.026535003337324024, 'Age_Under_20'),
 (0.030018677414630712, 'Sex_Male'),
 (0.030172136744450188, 'Occupation_Technical_Services_Roles'),
 (0.03063009266223306, 'Sex_Female'),
 (0.030743244919661655, 'Occupation_Mine_Supervisory_and_Management_Roles'),
 

## Model 4 - Activity (ACT)

In [21]:
ACT_target = df["Activity"]
data = df[["Sex","Age", "Occupation", "Operational_Process"]]
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational_Process
0,Female,20_29,Labourer_Roles,Dredging_Activities
1,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
2,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
3,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
4,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...


In [22]:
data_dummies = pd.get_dummies(data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_dummies, ACT_target)

In [23]:
# Import Random Forrest, then provide a score for the model accuracy
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.22167080231596362

In [24]:
rf

RandomForestClassifier(n_estimators=200)

In [25]:
# This is how we predict and example based on default value
ACT_Prediction = rf.predict([[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]])

# Export the Output Headers to Excel
X_train.to_csv("Machine Learning Output - Activity.csv", index=False)

# Export the model using pickle
with open('../ACT_rf.pickle', 'wb') as file:
    pickle.dump(rf, file)

# Show Model Value importances
sorted(zip(rf.feature_importances_,data_dummies.columns))

[(0.0011874389699767461,
  'Operational_Process_Culm_Banks_Activities_Coal_Mining_Only'),
 (0.0027542226341480606, 'Occupation_Apprentice_or_Trainee_Roles'),
 (0.0032136427951106462, 'Operational_Process_Non_Mine_Workshops_and_Yards'),
 (0.005115758201941739, 'Age_80_89'),
 (0.008193919576596913, 'Occupation_Maritime_Roles'),
 (0.009094520100006073, 'Operational_Process_Other_Surface_Facility'),
 (0.010431246934023839, 'Age_70_79'),
 (0.014515387958455826, 'Occupation_Other_Role'),
 (0.01743410666602112, 'Occupation_Explosives_Roles'),
 (0.020378850653619097, 'Occupation_Driller_and_Support_Roles'),
 (0.021493414654163344, 'Operational_Process_Office_Located_on_Mine'),
 (0.02203135148155171, 'Operational_Process_Dredging_Activities'),
 (0.024836267388732663, 'Sex_Female'),
 (0.02496527406233027, 'Age_Under_20'),
 (0.026517664175478337, 'Sex_Male'),
 (0.027618767085754673, 'Occupation_Technical_Services_Roles'),
 (0.028011903487656906, 'Occupation_Mine_Supervisory_and_Management_Roles')

# Summary of Prediction

In [26]:
# Summarise the model results
print(f"Model Predictions:")
print(f"")
print(f"Nature of injury: {NOI_Prediction[0]}")
print(f"Injured Body Part: {IBP_Prediction[0]}")
print(f"Source of Injury: {SOI_Prediction[0]}")
print(f"Activity: {ACT_Prediction[0]}")

Model Predictions:

Nature of injury: Sprain, strains
Injured Body Part: Wrist,_Hand_and_Fingers
Source of Injury: Boilers
Activity: Handling supplies or material, load and unload
