# Classification

_Erin Cameron_

---

In [23]:
# Install scikit-learn if it is not already available in the kernel's environment
# (left commented out on purpose; uncomment the line below to run it once)
# %pip install scikit-learn

In [24]:
import pandas as pd

In [25]:
# Display settings for the notebook: passing None lifts the corresponding limit,
# so every row and column is shown and long cell content is not truncated.
for option_name in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, None)

## 1.0) Format and Load Dataset

In [26]:
# Load the formatted triage CSV into a DataFrame, announcing progress around the read
csv_path = "./data/emergency_triage_data.csv"
print("Reading in data...")
data = pd.read_csv(csv_path)
print("Data read complete.")

Reading in data...
Data read complete.


In [27]:
# Preview the first rows of the freshly loaded data to check the columns and values
display(data.head())

Unnamed: 0.1,Unnamed: 0,Group,Sex,Age,Patients number per hour,Arrival mode,Injury,Chief_complain,Mental,Pain,NRS_pain,SBP,DBP,HR,RR,BT,Saturation,KTAS_RN,Diagnosis in ED,Disposition,KTAS_expert,Error_group,Length of stay_min,KTAS duration_min,mistriage,CC_pain_trauma,CC_respiratory,CC_cardiovascular,CC_neurological,CC_gastrointestinal,CC_systemic_infectious,CC_other,DG_cardiovascular,DG_respiratory,DG_neurological,DG_gastrointestinal,DG_musculoskeletal_trauma,DG_infectious,DG_genitourinary,DG_other_systemic,DG_other,mistriage_original
0,0,2,2,71,3,3,2,right ocular pain,1,1,2,160.0,100.0,84.0,18.0,36.6,100.0,2,corneal abrasion,1,4,2,86,5.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
1,1,1,1,56,12,3,2,right forearm burn,1,1,2,137.0,75.0,60.0,20.0,36.5,,4,burn of hand first degree dorsum,1,5,4,64,3.95,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,2,2,1,68,8,2,2,arm pain left,1,1,2,130.0,80.0,102.0,20.0,36.6,98.0,4,fracture of surgical neck of humerus closed,2,5,4,862,1.0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,3,1,2,71,8,1,1,ascites tapping,1,1,3,139.0,94.0,88.0,20.0,36.5,,4,alcoholic liver cirrhosis with ascites,1,5,6,108,9.83,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
4,4,1,2,58,4,3,1,distension abdominal,1,1,3,91.0,67.0,93.0,18.0,36.5,,4,ascites,1,5,8,109,6.6,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1


In [28]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV).
# The column is dropped by name rather than by position so this cell is safe to
# re-run: a second run is a no-op instead of silently removing the next column.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [29]:
# Preview again to confirm the "Unnamed: 0" column is no longer present
display(data.head())

Unnamed: 0,Group,Sex,Age,Patients number per hour,Arrival mode,Injury,Chief_complain,Mental,Pain,NRS_pain,SBP,DBP,HR,RR,BT,Saturation,KTAS_RN,Diagnosis in ED,Disposition,KTAS_expert,Error_group,Length of stay_min,KTAS duration_min,mistriage,CC_pain_trauma,CC_respiratory,CC_cardiovascular,CC_neurological,CC_gastrointestinal,CC_systemic_infectious,CC_other,DG_cardiovascular,DG_respiratory,DG_neurological,DG_gastrointestinal,DG_musculoskeletal_trauma,DG_infectious,DG_genitourinary,DG_other_systemic,DG_other,mistriage_original
0,2,2,71,3,3,2,right ocular pain,1,1,2,160.0,100.0,84.0,18.0,36.6,100.0,2,corneal abrasion,1,4,2,86,5.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
1,1,1,56,12,3,2,right forearm burn,1,1,2,137.0,75.0,60.0,20.0,36.5,,4,burn of hand first degree dorsum,1,5,4,64,3.95,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,2,1,68,8,2,2,arm pain left,1,1,2,130.0,80.0,102.0,20.0,36.6,98.0,4,fracture of surgical neck of humerus closed,2,5,4,862,1.0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,1,2,71,8,1,1,ascites tapping,1,1,3,139.0,94.0,88.0,20.0,36.5,,4,alcoholic liver cirrhosis with ascites,1,5,6,108,9.83,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
4,1,2,58,4,3,1,distension abdominal,1,1,3,91.0,67.0,93.0,18.0,36.5,,4,ascites,1,5,8,109,6.6,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1


## 2.0) Define ML Train Features

Based on the relationship with the target variable (mistriage), the following features will be selected for modelling:

| Feature Name    | Type    | Predictor Strength | Key Observation from Data |
|---------------|-------|--------------------|---------------------------|
| KTAS_RN | Categorical/Ordinal | Strong | The proportion of mistriage varies sharply with the triage level: it is highest at KTAS Level 1 (42.3%) and falls to between 8.6% and 18.0% at all other KTAS levels. This is a critical variable. |
| HR | Continuous | Strong | The mean pulse rate is noticeably higher for mistriaged patients (85.0) compared to correctly triaged patients (80.0). |
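| Injury | Categorical (1 vs. 2) | Strong | Patients coded Injury = 1 had a higher proportion of mistriage (15.7%) than patients coded Injury = 2 (10.7%). Note: in the preview rows above, the burn and fracture cases are coded Injury = 2, so confirm which code denotes an injury before interpreting this difference. |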
| Diagnosis in ED | Categorical/Nominal | Strong | The proportion of mistriage varies widely across the top 10 diagnoses, from as low as 4.8% (Ischaemic chest pain) to over 26% (Unspecified abdominal pain). This variance indicates high predictive power, but requires careful encoding. |
| Age | Continuous | Weak | The difference in mean age is minor (58.0 for mistriaged vs 56.5 for correctly triaged), but still present. |
| Other Physiological Variables    | Continuous | Weak | Variables like SBP, DBP, RR, and BT have very similar, or even identical, mean values for both groups (e.g., RR is 20.0 vs 20.0). They may be redundant or add noise; this will be determined during the classification steps. |
| Time Variables | Continuous | Weak | KTAS duration_min and Length of stay_min show very small differences in means. They may be redundant or add noise; this will be determined during the classification steps. |

In [30]:
# Model input columns, assembled family by family so each group is easy to audit.
# The concatenation order matches the column order used downstream.
triage_and_vitals_cols = [
    'KTAS_RN', 'HR', 'Injury', 'Age',
    'SBP', 'DBP', 'RR', 'BT',
]
timing_cols = ['KTAS duration_min', 'Length of stay_min']
# CC_* columns
cc_cols = [
    'CC_pain_trauma',
    'CC_respiratory',
    'CC_cardiovascular',
    'CC_neurological',
    'CC_gastrointestinal',
    'CC_systemic_infectious',
    'CC_other',
]
# DG_* columns
dg_cols = [
    'DG_cardiovascular',
    'DG_respiratory',
    'DG_neurological',
    'DG_gastrointestinal',
    'DG_musculoskeletal_trauma',
    'DG_infectious',
    'DG_genitourinary',
    'DG_other_systemic',
    'DG_other',
]
features = triage_and_vitals_cols + timing_cols + cc_cols + dg_cols

In [31]:
# Target column to predict; wrapped in a list so it can be concatenated
# with the feature list when selecting columns.
target_column = 'mistriage'
prediction_class = [target_column]

## 3.0) Subset Data

In [32]:
# Keep only the selected feature columns plus the prediction class column
selected_columns = features + prediction_class
data = data[selected_columns]
# Show the first rows of the reduced frame
data.head()

Unnamed: 0,KTAS_RN,HR,Injury,Age,SBP,DBP,RR,BT,KTAS duration_min,Length of stay_min,CC_pain_trauma,CC_respiratory,CC_cardiovascular,CC_neurological,CC_gastrointestinal,CC_systemic_infectious,CC_other,DG_cardiovascular,DG_respiratory,DG_neurological,DG_gastrointestinal,DG_musculoskeletal_trauma,DG_infectious,DG_genitourinary,DG_other_systemic,DG_other,mistriage
0,2,84.0,2,71,160.0,100.0,18.0,36.6,5.0,86,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
1,4,60.0,2,56,137.0,75.0,20.0,36.5,3.95,64,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,4,102.0,2,68,130.0,80.0,20.0,36.6,1.0,862,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,4,88.0,1,71,139.0,94.0,20.0,36.5,9.83,108,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
4,4,93.0,1,58,91.0,67.0,18.0,36.5,6.6,109,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1


In [33]:
# Count the rows in each target class to check how balanced the classes are
print(data['mistriage'].value_counts())

mistriage
0    1066
1     184
Name: count, dtype: int64
