In [None]:
# Import Required Libraries (standard library first, then 3rd-party)
import pickle

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

In [None]:
# Load the injuries-by-industry-and-incident-type table; the first CSV row holds the column names.
data = pd.read_csv('./data/workplace-injuries-by-industry-and-incident-types.csv', header=0)

In [None]:
# Preview the first rows of the freshly loaded table.
data.head()

In [None]:
# Summary statistics from describe(), rounded to two decimals for readability.
data.describe().round(2)

In [None]:
# Describe what is left once the 'year' and 'no._of_injuries' columns are set aside.
data.drop(columns=['year', 'no._of_injuries']).describe()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Give every column a presentation-friendly name (used as axis labels in the plots below).
data.rename(
    columns={
        'year': 'Year',
        'degree_of_injury': 'Severity',
        'industry': 'Industry',
        'sub_industry': 'Sub-Industry',
        'incident_type': 'Type of Incident',
        'incident_agent': 'Cause of Incident',
        'incident_agent_sub_type': 'Specific Cause of Incident',
        'no._of_injuries': 'Number of Injuries',
    },
    inplace=True,
)

In [None]:
# Total injuries per year.
# Only the count column is summed: summing the whole frame would also
# "sum" (concatenate) every string column. as_index=False keeps 'Year' as a
# real column so the plot does not depend on index-level lookup.
injuries_by_year = data.groupby(by='Year', as_index=False)['Number of Injuries'].sum()
sns.lineplot(
    data=injuries_by_year,
    x='Year',
    y='Number of Injuries'
).set_ylim((0, 14000))

In [None]:
# Total injuries per year, one line per severity level.
# Aggregate just the count column and keep the grouping keys as columns so
# x and hue refer to explicit columns instead of MultiIndex levels.
injuries_by_year_severity = (
    data.groupby(by=['Year', 'Severity'], as_index=False)['Number of Injuries'].sum()
)
sns.lineplot(
    data=injuries_by_year_severity,
    x='Year',
    y='Number of Injuries',
    hue='Severity'
)

In [None]:
# Fatal injuries per year (own axis range, since fatal counts are far smaller).
# Only the count column is aggregated; string columns are left out of the sum.
fatal_by_year = (
    data[data['Severity'] == 'Fatal']
    .groupby(by=['Year', 'Severity'], as_index=False)['Number of Injuries']
    .sum()
)
sns.lineplot(
    data=fatal_by_year,
    x='Year',
    y='Number of Injuries',
    hue='Severity'
).set_ylim((0, 80))

In [None]:
# Total injuries per industry, largest first.
# Sorting has to happen AFTER the aggregation: groupby re-orders rows by
# group key, so sorting the raw rows beforehand has no effect on the plot.
injuries_by_industry = (
    data.groupby(by='Industry', as_index=False)['Number of Injuries']
    .sum()
    .sort_values(by='Number of Injuries', ascending=False)
)
sns.barplot(
    data=injuries_by_industry,
    y='Industry',
    x='Number of Injuries'
)

In [None]:
# Injuries per record, broken down by cause. The un-aggregated rows are passed,
# so seaborn's estimator (not a plain total) determines each bar's length.
sns.barplot(x='Number of Injuries', y='Cause of Incident', data=data)

In [None]:
# Injuries per record, broken down by incident type. The un-aggregated rows are
# passed, so seaborn's estimator (not a plain total) determines each bar's length.
sns.barplot(x='Number of Injuries', y='Type of Incident', data=data)

In [None]:
# Discard the two finest-grained categorical columns.
# NOTE: the drop is in place, so re-running this cell alone raises KeyError.
data.drop(columns=['Sub-Industry', 'Specific Cause of Incident'], inplace=True)
data.head()

In [None]:
# Rows recorded per industry: non-null count of every column within each group,
# reduced to the largest count per group, shown in ascending order.
industry_counts = data.groupby(by='Industry').count()
industry_counts.max(axis=1).sort_values()

In [None]:
# Merge industries into broader groups. Each key below is the new group name,
# each list holds the original labels folded into it.
industry_groups = {
    'Physical Labour': [
        'Mining & Quarrying',
        'Construction',
        'Agriculture & Fishing',
    ],
    'Utilities': [
        'Electricity, Gas and Air-Conditioning Supply',
        'Water Supply, Sewerage & Waste Management',
    ],
    'Infrastructure': [
        'Transportation & Storage',
        'Information & Communications',
    ],
    'Administrative': [
        'Financial & Insurance Services',
        'Administrative & Support Services',
    ],
    'Business': [
        'Real Estate Activities',
        'Wholesale & Retail Trade',
    ],
}
# Invert to the {original label: group} form that Series.replace expects.
industry_mapping = {
    original: group
    for group, originals in industry_groups.items()
    for original in originals
}
data['Industry'] = data['Industry'].replace(to_replace=industry_mapping)
# Number of distinct industry labels left after merging.
len(pd.unique(data['Industry']))

In [None]:
# Rows recorded per cause of incident (largest per-column non-null count), ascending.
cause_counts = data.groupby(by='Cause of Incident').count()
cause_counts.max(axis=1).sort_values()

In [None]:
# Number of distinct incident types before merging them.
len(pd.unique(data['Type of Incident']))

In [None]:
# Rows recorded per incident type (largest per-column non-null count), ascending.
incident_type_counts = data.groupby(by='Type of Incident').count()
incident_type_counts.max(axis=1).sort_values()

In [None]:
# Merge incident types into broader groups. Each key is the new group name,
# each list holds the original labels folded into it.
# NOTE(review): 'Oxygen Defiiciency in Confined Space' is spelled with a double
# "i"; presumably this mirrors the dataset's own spelling — verify before "fixing".
incident_type_groups = {
    'Suffocation/Drowning': [
        'Oxygen Defiiciency in Confined Space',
        'Suffocation',
        'Drowning',
    ],
    'Structural Collapse': [
        'Cave-in of excavation, tunnel, etc',
        'Collapse of formwork/Failure of its supports',
        'Collapse/Failure of Structure & Equipment',
    ],
    'Electrocution': [
        'Exposure to Electric current',
    ],
    'Struck by/against Objects': [
        'Struck by Falling Objects from Heights',
        'Cut/Stabbed by Objects',
        'Struck by Moving Objects',
        'Striking against Objects',
        'Struck by Falling Objects',
        'Stepping on Objects',
    ],
    'Falls': [
        'Falls - Falls from Height',
        'Falls - Slips, Trips & Falls',
    ],
    'Exposure to Hazardous Substances': [
        'Exposure to Biological Materials',
    ],
    'Others': [
        'Crane-related',
    ],
}
# Invert to the {original label: group} form that Series.replace expects.
incident_type_mapping = {
    original: group
    for group, originals in incident_type_groups.items()
    for original in originals
}
data['Type of Incident'] = data['Type of Incident'].replace(to_replace=incident_type_mapping)

In [None]:
# Toy table (three students of one class) used to illustrate groupby aggregation.
eg = pd.DataFrame(
    {
        'Class': ['C', 'C', 'C'],
        'Student Name': ['John', 'Mary', 'Joe'],
        'English Marks': [87, 91, 67],
        'Mathematics Marks': [90, 80, 100],
    },
    index=[1, 2, 3],
)
eg

In [None]:
# Per-class average marks. numeric_only=True keeps the text column
# 'Student Name' out of the mean; without it pandas >= 2.0 raises TypeError.
eg.groupby(by='Class').mean(numeric_only=True).round(2)

In [None]:
def get_raw(df: pd.DataFrame) -> pd.DataFrame:
    """Expand an aggregated table into one row per individual injury.

    Every row of ``df`` is repeated as many times as its 'Number of Injuries'
    value; rows whose count is zero (or negative) contribute nothing.

    The LAST column of ``df`` is dropped from the result. In this notebook
    'Number of Injuries' is the last column, so that is the one removed.
    Index labels are carried over unchanged, so they repeat in the result.

    Implemented with a single vectorised positional take instead of growing a
    frame row by row with ``DataFrame.append`` (removed in pandas 2.0 and
    quadratic in the number of output rows).

    Parameters
    ----------
    df : pd.DataFrame
        Aggregated records; must contain an integer-valued
        'Number of Injuries' column.

    Returns
    -------
    pd.DataFrame
        A new frame with all but the last column of ``df`` and
        ``sum(counts)`` rows.
    """
    counts = df['Number of Injuries'].astype('int64').to_numpy()
    # range(n) in the original loop yielded nothing for n <= 0; keep that.
    counts = counts.clip(min=0)
    # Positional (not label-based) repetition, so duplicate index labels are safe.
    row_positions = np.repeat(np.arange(len(df)), counts)
    return df.iloc[row_positions, :-1]
# data_raw = get_raw(data)

In [None]:
# Load the per-injury table from disk; the first CSV column is used as the index.
# NOTE(review): presumably this file is the saved output of get_raw(data) — confirm.
data_raw = pd.read_csv('./data/raw.csv', index_col=0)
data_raw.head()

In [None]:
# Build the modelling table: 'Year' and the 'Severity' target are kept as they
# are, the three categorical columns are one-hot encoded.
categorical_columns = ['Industry', 'Type of Incident', 'Cause of Incident']
one_hot = pd.get_dummies(data_raw[categorical_columns])
data_encoded = pd.concat([data_raw[['Year', 'Severity']], one_hot], axis=1)

data_encoded

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
train, test = train_test_split(data_encoded, test_size=0.25, random_state=7)

In [None]:
# Display the training partition.
train

In [None]:
# Separate the 'Severity' target from the feature columns of the training partition.
y_train = train['Severity']
X_train = train.drop(columns='Severity')

In [None]:
# Disabled hyper-parameter search definition (RBF-kernel SVC, 3x3 grid over
# gamma and C, 3-fold CV). The notebook loads `best_algo` from 'best_algo.p'
# in the next cell instead of running this.
# best_algo = GridSearchCV(
#     estimator=SVC(),
#     param_grid={
#             'kernel': ['rbf'],
#             'gamma': np.logspace(-10, -5, 3),
#             'C': np.logspace(-5, 5, 3)
#     },
#     cv=3
# )

In [None]:
# Restore the stored search object from disk. The context manager closes the
# file handle, which the bare open(...) call left open.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load 'best_algo.p' if it comes from a trusted source.
with open('best_algo.p', 'rb') as pickle_file:
    best_algo = pickle.load(pickle_file)

In [None]:
# Show the parameter combination stored on the loaded search object.
best_algo.best_params_

In [None]:
# RBF-kernel support vector classifier with fixed hyper-parameters.
# NOTE(review): C and gamma are hard-coded; presumably they mirror
# best_algo.best_params_ shown above — confirm they stay in sync.
model = SVC(C=1e5, gamma=1e-5, kernel='rbf')

In [None]:
# Train the classifier on the training features and severity labels.
model.fit(X=X_train, y=y_train)

In [None]:
# Separate the 'Severity' target from the feature columns of the test partition.
y_test = test['Severity']
X_test = test.drop(columns='Severity')

In [None]:
# Predict a severity label for every test row and display the predictions.
y_pred = model.predict(X=X_test)
y_pred

In [None]:
# Bar chart of how often each severity label was predicted.
sns.countplot(x=y_pred, palette='Accent')

In [None]:
# Same information as the count plot, as numbers: predictions per severity label.
y_pred_s = pd.Series(y_pred, name='Predicted Injury Outcomes')
y_pred_s.groupby(y_pred_s).count()

In [None]:
# Actual severities of the test rows that the model predicted as 'Fatal'.
y_test[y_pred == 'Fatal']

In [None]:
# Features of the test rows predicted as 'Fatal', reduced to the columns that
# are non-zero for at least one of those rows.
fatal_pred = X_test[y_pred == 'Fatal']
fatal_mask = fatal_pred != 0
# Keep every selected row (`:`) rather than a hard-coded `[True] * 6` mask,
# which only worked when exactly six rows were predicted fatal.
fatal_pred = fatal_pred.loc[:, fatal_mask.any(axis=0).to_numpy()]
# Show only the year and cause-of-incident columns. The pattern has no capture
# groups, so str.contains does not emit its "match groups" UserWarning.
fatal_pred.loc[:, fatal_pred.columns.str.contains(pat='Year|Cause')]

In [None]:
# Features of the test rows predicted as 'Major', reduced to the columns that
# are non-zero for at least one of those rows. Stored under major_* names so
# the 'Fatal' subset computed in the previous cell is not overwritten.
major_pred = X_test[y_pred == 'Major']
major_mask = major_pred != 0
major_pred = major_pred.loc[:, major_mask.any(axis=0).to_numpy()]
# Show only the year and cause-of-incident columns. The pattern has no capture
# groups, so str.contains does not emit its "match groups" UserWarning.
major_pred.loc[:, major_pred.columns.str.contains(pat='Year|Cause')]

In [None]:
# Confusion matrix and per-class metrics on the test set.
# The label order is passed explicitly so the row/column captions are
# guaranteed to line up with the matrix, instead of relying on the sorted
# order of whichever labels happen to occur.
severity_labels = ['Fatal', 'Major', 'Minor']

conf_mat = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred, labels=severity_labels),
    index=[f'Actual {label}' for label in severity_labels],
    # The first caption used to read just 'Predicted'; it is now 'Predicted Fatal'.
    columns=[f'Predicted {label}' for label in severity_labels]
)

print(classification_report(y_test, y_pred), '\n\n', conf_mat)

In [None]:
# Load the overall injuries table and give the count column a readable name.
tot = pd.read_csv('./data/workplace-injuries.csv', header=0).rename(
    columns={'no._of_workplace_injuries': 'Number of Workplace Injuries'}
)

In [None]:
# Injury-count column; the plotting cell below draws its mean and median.
me = tot['Number of Workplace Injuries']

In [None]:
# Bar chart of workplace injuries per year with mean / median reference lines,
# saved to ./img/workplace-injuries.png.
with sns.axes_style(style='darkgrid'):
    with sns.color_palette(palette='turbo'):
        # Derive a NEW frame for plotting. The previous `toa = tot` was only an
        # alias, so adding columns silently modified `tot` as well. The unused
        # '_dummy' helper column is no longer created.
        toa = tot.assign(Year=tot['year'])
        ax = sns.barplot(data=toa, x='Year', y='Number of Workplace Injuries', color='tab:green')

        # Horizontal reference lines spanning the full x-range of the bars.
        mean_injuries, median_injuries = me.mean(), me.median()
        sns.lineplot(
            x=list(ax.get_xlim()) * 2,
            y=[mean_injuries, mean_injuries, median_injuries, median_injuries],
            hue=['mean', 'mean', 'median', 'median'],
            palette='dark',
            ax=ax
        )
        # sns.lineplot(x=ax.get_xlim(), y=[10000, 10000], color='grey')

        # Format y ticks as thousands ("10k") with a formatter, so labels stay
        # correct if the ticks are recomputed. Fixed label strings on an
        # automatic locator can drift out of sync with the ticks.
        ax.yaxis.set_major_formatter(lambda value, _pos: f'{int(value / 1000)}k')

        # Two-digit year labels ('06, '07, ...) looked up from the data instead
        # of assuming the first bar is 2006. Bar i sits at tick position i and
        # numeric categories are drawn in ascending order.
        years = np.sort(toa['Year'].unique())
        ax.set(
            title='Workplace Injuries in Singapore (2006 ~ 2018)',
            xticklabels=[f"'{int(years[int(tick)]) % 100:02d}" for tick in ax.get_xticks()]
        )
        # Save through the Axes' own figure: `plt` is never imported in this
        # notebook, so plt.savefig raised NameError.
        ax.figure.savefig('./img/workplace-injuries.png')