In [21]:
import os
import pickle

import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

In [22]:
# Connect to the DB server, retrieve the incident table and export it to CSV.
# The connection URL (which embeds the username and password) is read from the
# INCIDENT_DB_URL environment variable so that credentials are never stored in
# the notebook. Expected form:
#   postgresql+psycopg2://<user>:<password>@<host>/<database>
db_url = os.environ.get("INCIDENT_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the INCIDENT_DB_URL environment variable to the SQLAlchemy "
        "connection URL of the incident database before running this cell."
    )

engine = create_engine(db_url)
df = pd.read_sql_query('select * from "incident_data"', con=engine)

# Export the full table to CSV
df.to_csv('final_data.csv')

# Preview the first row to confirm the load succeeded
df.head(1)

Unnamed: 0,mine_id,Operational_Process,Date,State,Resource_Mined,Underground_Method,Accident_Injury_Illness_Classification,Accident_Type,Injury_Count,Sex,...,Days_Lost,Description_1,Description_2,Experience_Total,Experience_Mine,Experience_Job,Injury_Classification,Operator_Contractor,Commodity_WorkStatus,General_Incident_Type
0,1200945,Dredging_Activities,13/10/2020,Indiana,Sand & gravel,0,Handling material,Caught in-under-between a moving and stationar...,1,Female,...,0.0,EE WAS MOVING A 2X3X.75 STEEL PLATE OFF OF A P...,HE TIME OF THE INCIDENT.,1.15,1.15,1.15,NDL (No days lost),Operator,Sand & gravel operator,Handling material


## Model 1 - Nature of Injury (NOI)

In [23]:
# Target for Model 1: the "Nature_of_Injury" column.
NOI_target = df.loc[:, "Nature_of_Injury"]

# Predictors: the Sex, Age, Occupation and Operational_Process columns.
data = df.loc[:, ["Sex", "Age", "Occupation", "Operational_Process"]]

# Preview the selected predictor columns
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational_Process
0,Female,20_29,Labourer_Roles,Dredging_Activities
1,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
2,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
3,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
4,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...


In [24]:
# One-hot encode the predictor columns so the classifier receives numeric input.
data_dummies = pd.get_dummies(data)

# Hold out a test set (scikit-learn's default split). random_state is fixed so
# the split, and therefore the reported score, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_dummies, NOI_target, random_state=42
)

In [25]:
# Fit a random forest (200 trees) and report its mean accuracy on the test set.
# random_state pins the forest's internal randomness so the score is repeatable.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.30851943755169564

In [26]:
# Predict a single example from a default one-hot vector. The vector is
# positional, so it is wrapped in a DataFrame carrying the training column
# names: this matches the named input the model was fitted on (avoiding
# scikit-learn's feature-name warning) and raises immediately if the number of
# dummy columns no longer matches the vector.
NOI_example = pd.DataFrame(
    [[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]],
    columns=X_train.columns,
)
NOI_Prediction = rf.predict(NOI_example)

# Export the training features to CSV; the header row records the dummy column order
X_train.to_csv("Machine Learning Output - Nature of Injury.csv", index=False)

# Export the model using pickle
with open('../NOI_rf.pickle', 'wb') as file:
    pickle.dump(rf, file)

# Show the importance of the columns, sorted from least to most important
sorted(zip(rf.feature_importances_, data_dummies.columns))

[(0.0031391719706777795,
  'Operational_Process_Culm_Banks_Activities_Coal_Mining_Only'),
 (0.0033966384537739763, 'Operational_Process_Non_Mine_Workshops_and_Yards'),
 (0.005185504759329169, 'Occupation_Apprentice_or_Trainee_Roles'),
 (0.006002669266999006, 'Age_80_89'),
 (0.008191147658943484, 'Age_70_79'),
 (0.012279358914734561, 'Occupation_Maritime_Roles'),
 (0.012348783305340555, 'Operational_Process_Other_Surface_Facility'),
 (0.02185682436835386, 'Sex_Female'),
 (0.02334432708463975, 'Age_Under_20'),
 (0.02351795186745267, 'Occupation_Explosives_Roles'),
 (0.02368093779713613, 'Occupation_Other_Role'),
 (0.023874867576065863, 'Sex_Male'),
 (0.02667330702424612, 'Occupation_Driller_and_Support_Roles'),
 (0.027088262874975146, 'Operational_Process_Office_Located_on_Mine'),
 (0.030288902868823103, 'Operational_Process_Dredging_Activities'),
 (0.03298395040604942, 'Occupation_Technical_Services_Roles'),
 (0.037816575339885444, 'Occupation_Mine_Supervisory_and_Management_Roles'),
 (

## Model 2 - Injured Body Part (IBP)

In [27]:
# Target for Model 2: the "Injured_Body_Part" column.
IBP_target = df.loc[:, "Injured_Body_Part"]

# Predictors: the Sex, Age, Occupation and Operational_Process columns.
data = df.loc[:, ["Sex", "Age", "Occupation", "Operational_Process"]]

# Preview the selected predictor columns
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational_Process
0,Female,20_29,Labourer_Roles,Dredging_Activities
1,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
2,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
3,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
4,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...


In [28]:
# One-hot encode the predictor columns so the classifier receives numeric input.
data_dummies = pd.get_dummies(data)

# Hold out a test set (scikit-learn's default split). random_state is fixed so
# the split, and therefore the reported score, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_dummies, IBP_target, random_state=42
)

In [29]:
# Fit a random forest (200 trees) and report its mean accuracy on the test set.
# random_state pins the forest's internal randomness so the score is repeatable.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.2704714640198511

In [30]:
# Predict a single example from a default one-hot vector. The vector is
# positional, so it is wrapped in a DataFrame carrying the training column
# names: this matches the named input the model was fitted on (avoiding
# scikit-learn's feature-name warning) and raises immediately if the number of
# dummy columns no longer matches the vector.
IBP_example = pd.DataFrame(
    [[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]],
    columns=X_train.columns,
)
IBP_Prediction = rf.predict(IBP_example)

# Export the training features to CSV; the header row records the dummy column order
X_train.to_csv("Machine Learning Output - Injured Body Part.csv", index=False)

# Export the model using pickle
with open('../IBP_rf.pickle', 'wb') as file:
    pickle.dump(rf, file)

# Show the importance of the columns, sorted from least to most important
sorted(zip(rf.feature_importances_, data_dummies.columns))

[(0.002237250419629438,
  'Operational_Process_Culm_Banks_Activities_Coal_Mining_Only'),
 (0.002943354452636807, 'Occupation_Apprentice_or_Trainee_Roles'),
 (0.004784950584163204, 'Operational_Process_Non_Mine_Workshops_and_Yards'),
 (0.005687834025165016, 'Age_80_89'),
 (0.01075584271266901, 'Age_70_79'),
 (0.013833454777028686, 'Operational_Process_Other_Surface_Facility'),
 (0.015813452726043897, 'Occupation_Maritime_Roles'),
 (0.019171891186110516, 'Age_Under_20'),
 (0.019879113067042733, 'Occupation_Other_Role'),
 (0.021261085879426482, 'Occupation_Explosives_Roles'),
 (0.026161129029450832, 'Operational_Process_Office_Located_on_Mine'),
 (0.02727238854512954, 'Occupation_Driller_and_Support_Roles'),
 (0.031953811341562374, 'Operational_Process_Dredging_Activities'),
 (0.03231636902438782, 'Sex_Female'),
 (0.0329104559456185, 'Occupation_Mine_Supervisory_and_Management_Roles'),
 (0.034114849398160015, 'Occupation_Technical_Services_Roles'),
 (0.03558354754061728, 'Sex_Male'),
 (0.

## Model 3 - Source of Injury (SOI)

In [31]:
# Target for Model 3: the "Source_of_Injury" column.
SOI_target = df.loc[:, "Source_of_Injury"]

# Predictors: the Sex, Age, Occupation and Operational_Process columns.
data = df.loc[:, ["Sex", "Age", "Occupation", "Operational_Process"]]

# Preview the selected predictor columns
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational_Process
0,Female,20_29,Labourer_Roles,Dredging_Activities
1,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
2,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
3,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
4,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...


In [32]:
# One-hot encode the predictor columns so the classifier receives numeric input.
data_dummies = pd.get_dummies(data)

# Hold out a test set (scikit-learn's default split). random_state is fixed so
# the split, and therefore the reported score, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_dummies, SOI_target, random_state=42
)

In [33]:
# Fit a random forest (200 trees) and report its mean accuracy on the test set.
# random_state pins the forest's internal randomness so the score is repeatable.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.1381306865177833

In [34]:
# Predict a single example from a default one-hot vector. The vector is
# positional, so it is wrapped in a DataFrame carrying the training column
# names: this matches the named input the model was fitted on (avoiding
# scikit-learn's feature-name warning) and raises immediately if the number of
# dummy columns no longer matches the vector.
SOI_example = pd.DataFrame(
    [[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]],
    columns=X_train.columns,
)
SOI_Prediction = rf.predict(SOI_example)

# Export the training features to CSV; the header row records the dummy column order
X_train.to_csv("Machine Learning Output - Source of Injury.csv", index=False)

# Export the model using pickle
with open('../SOI_rf.pickle', 'wb') as file:
    pickle.dump(rf, file)

# Show the importance of the columns, sorted from least to most important
sorted(zip(rf.feature_importances_, data_dummies.columns))

[(0.0019021231007183919,
  'Operational_Process_Culm_Banks_Activities_Coal_Mining_Only'),
 (0.003936010298165008, 'Occupation_Apprentice_or_Trainee_Roles'),
 (0.004012531392903511, 'Operational_Process_Non_Mine_Workshops_and_Yards'),
 (0.004479278879380603, 'Age_80_89'),
 (0.008393388177678736, 'Operational_Process_Other_Surface_Facility'),
 (0.011936933884137468, 'Occupation_Maritime_Roles'),
 (0.013811689568943474, 'Age_70_79'),
 (0.01584221351111064, 'Occupation_Explosives_Roles'),
 (0.01945083719541405, 'Operational_Process_Office_Located_on_Mine'),
 (0.020809344273050187, 'Occupation_Other_Role'),
 (0.022899674199086683, 'Operational_Process_Dredging_Activities'),
 (0.02352930319546391, 'Occupation_Driller_and_Support_Roles'),
 (0.024291542472299622, 'Age_Under_20'),
 (0.028492828182563126, 'Sex_Male'),
 (0.029194989743537195, 'Sex_Female'),
 (0.030511124715007366,
  'Operational_Process_Underground_Mine_Surface_Workshops_and_Yards'),
 (0.03193502277694616, 'Occupation_Technical_S

## Model 4 - Activity (ACT)

In [35]:
# Target for Model 4: the "Activity" column.
ACT_target = df.loc[:, "Activity"]

# Predictors: the Sex, Age, Occupation and Operational_Process columns.
data = df.loc[:, ["Sex", "Age", "Occupation", "Operational_Process"]]

# Preview the selected predictor columns
data.head()

Unnamed: 0,Sex,Age,Occupation,Operational_Process
0,Female,20_29,Labourer_Roles,Dredging_Activities
1,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
2,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
3,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...
4,Female,30_39,Labourer_Roles,Crushing_or_Processing_Facility_Incl_Associate...


In [36]:
# One-hot encode the predictor columns so the classifier receives numeric input.
data_dummies = pd.get_dummies(data)

# Hold out a test set (scikit-learn's default split). random_state is fixed so
# the split, and therefore the reported score, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_dummies, ACT_target, random_state=42
)

In [37]:
# Fit a random forest (200 trees) and report its mean accuracy on the test set.
# random_state pins the forest's internal randomness so the score is repeatable.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.23325062034739455

In [38]:
# Display the repr of the estimator currently bound to `rf` to confirm its settings.
rf

RandomForestClassifier(n_estimators=200)

In [39]:
# Predict a single example from a default one-hot vector. The vector is
# positional, so it is wrapped in a DataFrame carrying the training column
# names: this matches the named input the model was fitted on (avoiding
# scikit-learn's feature-name warning) and raises immediately if the number of
# dummy columns no longer matches the vector.
ACT_example = pd.DataFrame(
    [[0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]],
    columns=X_train.columns,
)
ACT_Prediction = rf.predict(ACT_example)

# Export the training features to CSV; the header row records the dummy column order
X_train.to_csv("Machine Learning Output - Activity.csv", index=False)

# Export the model using pickle
with open('../ACT_rf.pickle', 'wb') as file:
    pickle.dump(rf, file)

# Show the importance of the columns, sorted from least to most important
sorted(zip(rf.feature_importances_, data_dummies.columns))

[(0.0021440106352040656, 'Operational_Process_Non_Mine_Workshops_and_Yards'),
 (0.0024502302364373583, 'Occupation_Apprentice_or_Trainee_Roles'),
 (0.0028080179094200153,
  'Operational_Process_Culm_Banks_Activities_Coal_Mining_Only'),
 (0.005911252520307671, 'Age_80_89'),
 (0.00783783377723168, 'Age_70_79'),
 (0.00923099006532065, 'Occupation_Maritime_Roles'),
 (0.010137386443668194, 'Operational_Process_Other_Surface_Facility'),
 (0.017176204435970253, 'Occupation_Explosives_Roles'),
 (0.017242385033561278, 'Operational_Process_Office_Located_on_Mine'),
 (0.017245549451342868, 'Occupation_Other_Role'),
 (0.02065040885764053, 'Occupation_Driller_and_Support_Roles'),
 (0.02465947834092045, 'Operational_Process_Dredging_Activities'),
 (0.025844179685183953, 'Age_Under_20'),
 (0.026042518429694187, 'Occupation_Technical_Services_Roles'),
 (0.026404738228028263, 'Occupation_Mine_Supervisory_and_Management_Roles'),
 (0.028358774620476056, 'Sex_Female'),
 (0.028553171011834244, 'Sex_Male'),

# Summary of Predictions

In [40]:
# Print the example prediction from each of the four models, one per line,
# under a heading followed by a blank line.
print(
    "Model Predictions:",
    "",
    f"Nature of injury: {NOI_Prediction[0]}",
    f"Injured Body Part: {IBP_Prediction[0]}",
    f"Source of Injury: {SOI_Prediction[0]}",
    f"Activity: {ACT_Prediction[0]}",
    sep="\n",
)

Model Predictions:

Nature of injury: Fracture, chip
Injured Body Part: Wrist, Hand and Fingers
Source of Injury: Rubber, glass, plastic
Activity: Handling supplies or material, load and unload
