In [664]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [665]:
# Columns removed from the incident frame in the load cell below.
# "date" is (re)built there from year/month/day right before it is dropped.
cols_to_drop = ["country", "date", "title"]

In [666]:
# Load the training incidents and reduce them to the recall text plus the four
# target columns, as one non-mutating chain.
# NOTE(review): the "date" column is assembled from year/month/day and then
# removed again because "date" is listed in cols_to_drop, so the conversion
# has no lasting effect on the frame (presumably it only fails loudly on
# malformed dates - confirm whether it is still wanted).
incident_data = (
    pd.read_csv("data/incidents_train.csv", index_col="Unnamed: 0")
    .assign(date=lambda frame: pd.to_datetime(frame[["year", "month", "day"]]))
    .drop(columns=["year", "month", "day", *cols_to_drop])
    .rename(columns={"text": "recall"})
)

In [667]:
# Preview the frame after the drop/rename step (targets are still strings here).
incident_data

Unnamed: 0,recall,hazard-category,product-category,hazard,product
0,Case Number: 024-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,smoked sausage
1,Case Number: 033-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria spp,sausage
2,Case Number: 014-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,ham slices
3,Case Number: 009-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,thermal processed pork meat
4,Case Number: 001-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,chicken breast
...,...,...,...,...,...
5979,Imported biscuit may contain allergen (peanuts...,allergens,cereals and bakery products,peanuts and products thereof,biscuits
5980,023-2022\n\n \n High - Class I\n\n Produc...,fraud,prepared dishes and snacks,inspection issues,pizza
5981,"FRESNO, Calif. – July 28, 2022 – Lyons Magnus ...",biological,non-alcoholic beverages,cronobacter spp,non-alcoholic beverages
5982,025-2022\n\n \n High - Class I\n\n Misbra...,allergens,"meat, egg and dairy products",eggs and products thereof,frozen beef products


In [668]:
# Target columns: each one gets its own LabelEncoder and is later used as a
# prediction label.
keys = ['hazard-category', 'product-category', 'hazard', 'product']

In [669]:

# One independent (still unfitted) LabelEncoder per target column, keyed by
# column name so predictions can be mapped back to the original strings.
encoder_dict = {key: LabelEncoder() for key in keys}

In [670]:
# Show the encoder mapping (one LabelEncoder per target column).
encoder_dict

{'hazard-category': LabelEncoder(),
 'product-category': LabelEncoder(),
 'hazard': LabelEncoder(),
 'product': LabelEncoder()}

In [671]:
# Replace each target column with its integer class codes, fitting the
# matching encoder so the codes can be decoded again at evaluation time.
for column, encoder in encoder_dict.items():
    # Idempotency guard: if this cell is re-run, the column already holds
    # integer codes. Re-fitting on those codes would overwrite the encoder's
    # classes with numbers, and inverse_transform would then return numbers
    # instead of the original names.
    if pd.api.types.is_integer_dtype(incident_data[column]):
        continue

    incident_data[column] = encoder.fit_transform(incident_data[column])

In [672]:
# Preview the frame again: the four target columns now hold integer codes.
incident_data

Unnamed: 0,recall,hazard-category,product-category,hazard,product
0,Case Number: 024-94 \n Date Opene...,1,13,55,858
1,Case Number: 033-94 \n Date Opene...,1,13,56,825
2,Case Number: 014-94 \n Date Opene...,1,13,55,511
3,Case Number: 009-94 \n Date Opene...,4,13,90,933
4,Case Number: 001-94 \n Date Opene...,4,13,90,168
...,...,...,...,...,...
5979,Imported biscuit may contain allergen (peanuts...,0,1,85,73
5980,023-2022\n\n \n High - Class I\n\n Produc...,5,18,52,712
5981,"FRESNO, Calif. – July 28, 2022 – Lyons Magnus ...",1,14,27,628
5982,025-2022\n\n \n High - Class I\n\n Misbra...,0,13,34,397


In [673]:
# `df` is the module-level name that train_and_eval_model_for_label hands to
# get_data. It is the same object as incident_data (an alias, not a copy).
df = incident_data

In [674]:
def get_data(df: pd.DataFrame, label: str, test_size: float = 0.2, random_state=None):
    """Build a PyG ``Data`` object with train/test masks for one target column.

    Every row of ``df`` becomes one node. Node features are the concatenated
    one-hot encodings of two columns chosen by ``label``:

    * label contains ``'category'`` -> features/edges from ``'product'`` and
      ``'hazard'``;
    * otherwise -> features/edges from ``'product-category'`` and
      ``'hazard-category'``.

    Args:
        df: Frame whose target columns are already integer-encoded.
        label: Name of the column used as the node label ``y``.
        test_size: Fraction of nodes placed in the test mask (default 0.2,
            the value that was previously hard-coded).
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced. ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        A ``Data`` instance with ``x``, ``edge_index``, ``y`` and boolean
        ``train_mask`` / ``test_mask`` attributes.
    """
    import warnings  # local import: only needed for the stratification fallback

    if 'category' in label:
        first_col, second_col = 'product', 'hazard'
    else:
        first_col, second_col = 'product-category', 'hazard-category'

    # NOTE(review): edge_index is filled with label-encoded class ids, while the
    # node set is the rows of `df` (x and y have one entry per row). PyG reads
    # edge_index entries as node indices, so each edge links row #<first id> to
    # row #<second id> rather than incidents that are actually related.
    # Confirm this is intended before relying on the graph structure.
    # np.stack builds a single (2, N) array first; a Python list of ndarrays
    # takes torch's slow conversion path.
    edge_index = torch.tensor(
        np.stack([df[first_col].to_numpy(), df[second_col].to_numpy()]),
        dtype=torch.long,
    )

    x_product = OneHotEncoder().fit_transform(df[[first_col]]).toarray()
    x_hazard = OneHotEncoder().fit_transform(df[[second_col]]).toarray()

    # np.concatenate works on every NumPy version; the np.concat alias only
    # exists from NumPy 2.0 onwards.
    x = np.concatenate((x_product, x_hazard), axis=1)
    x = torch.tensor(x, dtype=torch.float)

    y = torch.tensor(df[label].values, dtype=torch.long)

    data = Data(x=x, edge_index=edge_index, y=y)

    num_nodes = df.shape[0]
    node_ids = np.arange(num_nodes)
    try:
        train_idx, test_idx = train_test_split(
            node_ids, stratify=df[label], test_size=test_size, random_state=random_state
        )
    except ValueError as err:
        # Stratification fails with ValueError, e.g. when a class has a single
        # member. Fall back to a plain random split, but say so instead of
        # hiding every possible error behind a bare except.
        warnings.warn(
            f"Stratified split on {label!r} failed ({err}); using an unstratified split."
        )
        train_idx, test_idx = train_test_split(
            node_ids, test_size=test_size, random_state=random_state
        )

    data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    data.train_mask[torch.as_tensor(train_idx, dtype=torch.long)] = True
    data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    data.test_mask[torch.as_tensor(test_idx, dtype=torch.long)] = True

    return data

In [675]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional classifier.

    ``forward`` returns per-node log-probabilities (log-softmax over the
    class dimension), which pairs with ``torch.nn.NLLLoss``.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 -> ReLU -> conv2 and normalise with log-softmax."""
        hidden = F.relu(self.conv1(x, edge_index))
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)

In [676]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [677]:
def train_model(model, optimizer, criterion, data, epochs = 500, log_every = 10):
    """Run full-batch training and return the trained model.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, targets)``.
        data: Object exposing ``x``, ``edge_index``, ``y`` and ``train_mask``.
        epochs: Number of optimisation steps.
        log_every: Print the loss every ``log_every`` epochs (default 10, the
            interval that was previously hard-coded). Pass ``0`` or ``None``
            to train silently.

    Returns:
        The same ``model`` instance, left in training mode.
    """
    model.train()

    # The mask and the training targets do not change between epochs, so
    # index them once instead of on every step.
    train_mask = data.train_mask
    train_targets = data.y[train_mask]

    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = criterion(out[train_mask], train_targets)
        loss.backward()
        optimizer.step()
        if log_every and epoch % log_every == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')

    return model

In [678]:
def eval_model(model, data, label, encoder_dict):
    """Score the model on the test-mask nodes and return a text report.

    Predictions are the highest-scoring class per node. Both predictions and
    ground truth are decoded through ``encoder_dict[label]`` so the report
    lists the original class names rather than integer codes.

    Returns:
        The value of ``classification_report`` (zero_division=0.0).
    """
    model.eval()
    with torch.inference_mode():
        scores = model(data.x, data.edge_index)
        pred = scores.max(dim=1).indices

    held_out = data.test_mask.cpu().numpy()
    predicted_ids = pred.detach().cpu().numpy()[held_out]
    true_ids = data.y.detach().cpu().numpy()[held_out]

    decoder = encoder_dict[label]
    predicted_names = decoder.inverse_transform(predicted_ids)
    true_names = decoder.inverse_transform(true_ids)

    return classification_report(y_true=true_names, y_pred=predicted_names, zero_division=0.0)

In [679]:
def train_and_eval_model_for_label(label: str, encoder_dict: dict,
                                   hidden_channels: int = 64, lr: float = 0.01,
                                   epochs: int = 500):
    """Train a GCN for one target column and return its classification report.

    Reads the module-level ``df`` (the encoded incident frame) and ``DEVICE``.

    Args:
        label: Target column to predict.
        encoder_dict: Mapping from column name to the fitted encoder used to
            decode class ids in the report.
        hidden_channels: Width of the hidden GCN layer (default 64, the value
            that was previously hard-coded).
        lr: Adam learning rate (default 0.01, previously hard-coded).
        epochs: Number of training epochs (default 500, matching
            ``train_model``'s default).

    Returns:
        The report produced by ``eval_model`` for the test split.
    """
    data = get_data(df, label=label)
    # Rebind the result so the device move does not depend on `.to()`
    # mutating the object in place.
    data = data.to(DEVICE)

    # Class ids are contiguous codes starting at 0, so max + 1 is the number
    # of output units needed.
    num_classes = int(data.y.max().item()) + 1
    model = GCN(
        in_channels=data.x.shape[1],
        hidden_channels=hidden_channels,
        out_channels=num_classes,
    ).to(DEVICE)
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    model = train_model(model, optimizer, criterion, data, epochs=epochs)

    return eval_model(model, data, label, encoder_dict)

In [680]:
# Train and evaluate on the coarse 'product-category' target; cr1 is the text report.
cr1 = train_and_eval_model_for_label(label='product-category', encoder_dict=encoder_dict)

Epoch 0, Loss: 3.0952842235565186
Epoch 10, Loss: 1.7714102268218994
Epoch 20, Loss: 0.606390655040741
Epoch 30, Loss: 0.19332003593444824
Epoch 40, Loss: 0.10853993147611618
Epoch 50, Loss: 0.07013975828886032
Epoch 60, Loss: 0.047755807638168335
Epoch 70, Loss: 0.0336003340780735
Epoch 80, Loss: 0.025779595598578453
Epoch 90, Loss: 0.021138116717338562
Epoch 100, Loss: 0.018015919253230095
Epoch 110, Loss: 0.01570763625204563
Epoch 120, Loss: 0.013905257917940617
Epoch 130, Loss: 0.012446459382772446
Epoch 140, Loss: 0.011234039440751076
Epoch 150, Loss: 0.010203688405454159
Epoch 160, Loss: 0.00931520201265812
Epoch 170, Loss: 0.008541933260858059
Epoch 180, Loss: 0.007859923876821995
Epoch 190, Loss: 0.007256597280502319
Epoch 200, Loss: 0.0067197224125266075
Epoch 210, Loss: 0.006237604655325413
Epoch 220, Loss: 0.005801772698760033
Epoch 230, Loss: 0.0054051573388278484
Epoch 240, Loss: 0.005042803008109331
Epoch 250, Loss: 0.004711811430752277
Epoch 260, Loss: 0.0044111986644566

In [681]:
# The report is a pre-formatted string, so print() is used to render its line breaks.
print(cr1)

                                                   precision    recall  f1-score   support

                              alcoholic beverages       0.92      1.00      0.96        12
                      cereals and bakery products       0.89      0.93      0.91       134
     cocoa and cocoa preparations, coffee and tea       0.88      0.90      0.89        42
                                    confectionery       0.97      0.88      0.92        34
dietetic foods, food supplements, fortified foods       0.92      0.88      0.90        26
                                    fats and oils       1.00      0.25      0.40         4
                                   feed materials       1.00      1.00      1.00         1
                   food additives and flavourings       1.00      1.00      1.00         2
                           food contact materials       1.00      1.00      1.00         1
                            fruits and vegetables       0.71      0.95      0.82       10

In [682]:
# Train and evaluate on the coarse 'hazard-category' target; cr2 is the text report.
cr2 = train_and_eval_model_for_label(label='hazard-category', encoder_dict=encoder_dict)

Epoch 0, Loss: 2.304474353790283
Epoch 10, Loss: 0.9384625554084778
Epoch 20, Loss: 0.29519301652908325
Epoch 30, Loss: 0.1325840801000595
Epoch 40, Loss: 0.0827619880437851
Epoch 50, Loss: 0.05696141719818115
Epoch 60, Loss: 0.04196888580918312
Epoch 70, Loss: 0.032489825040102005
Epoch 80, Loss: 0.02640814520418644
Epoch 90, Loss: 0.022075917571783066
Epoch 100, Loss: 0.018881892785429955
Epoch 110, Loss: 0.01645774394273758
Epoch 120, Loss: 0.014558665454387665
Epoch 130, Loss: 0.013022882863879204
Epoch 140, Loss: 0.01174929179251194
Epoch 150, Loss: 0.010678325779736042
Epoch 160, Loss: 0.009764589369297028
Epoch 170, Loss: 0.00897537637501955
Epoch 180, Loss: 0.008288758806884289
Epoch 190, Loss: 0.007692429702728987
Epoch 200, Loss: 0.00717469397932291
Epoch 210, Loss: 0.006718858145177364
Epoch 220, Loss: 0.0063180034048855305
Epoch 230, Loss: 0.005959402769804001
Epoch 240, Loss: 0.00563934538513422
Epoch 250, Loss: 0.0053501189686357975
Epoch 260, Loss: 0.005086170043796301
E

In [683]:
# Show the hazard-category report (print() renders the string's line breaks).
print(cr2)

                                precision    recall  f1-score   support

                     allergens       0.98      0.98      0.98       371
                    biological       0.95      0.95      0.95       348
                      chemical       0.86      0.86      0.86        57
food additives and flavourings       0.71      1.00      0.83         5
                foreign bodies       0.89      0.95      0.92       112
                         fraud       0.91      0.91      0.91        74
                     migration       1.00      1.00      1.00         1
          organoleptic aspects       0.88      0.64      0.74        11
                  other hazard       1.00      0.74      0.85        27
              packaging defect       0.83      0.91      0.87        11

                      accuracy                           0.94      1017
                     macro avg       0.90      0.89      0.89      1017
                  weighted avg       0.94      0.94      0.94 

In [684]:
# Train and evaluate on the fine-grained 'hazard' target; cr3 is the text report.
# NOTE(review): in the saved output the loss flattens out around 1.61 and the
# visible report rows are all 0.00 - worth investigating before trusting this model.
cr3 = train_and_eval_model_for_label(label='hazard', encoder_dict=encoder_dict)

Epoch 0, Loss: 4.852696895599365
Epoch 10, Loss: 3.3973803520202637
Epoch 20, Loss: 2.517347812652588
Epoch 30, Loss: 2.1000161170959473
Epoch 40, Loss: 1.9218454360961914
Epoch 50, Loss: 1.8276537656784058
Epoch 60, Loss: 1.766427993774414
Epoch 70, Loss: 1.7251580953598022
Epoch 80, Loss: 1.6975746154785156
Epoch 90, Loss: 1.6778879165649414
Epoch 100, Loss: 1.6632657051086426
Epoch 110, Loss: 1.6522129774093628
Epoch 120, Loss: 1.6437969207763672
Epoch 130, Loss: 1.6373289823532104
Epoch 140, Loss: 1.632362961769104
Epoch 150, Loss: 1.62852942943573
Epoch 160, Loss: 1.625535011291504
Epoch 170, Loss: 1.623153567314148
Epoch 180, Loss: 1.6212289333343506
Epoch 190, Loss: 1.6196531057357788
Epoch 200, Loss: 1.618338942527771
Epoch 210, Loss: 1.617220401763916
Epoch 220, Loss: 1.6162490844726562
Epoch 230, Loss: 1.6153911352157593
Epoch 240, Loss: 1.6146245002746582
Epoch 250, Loss: 1.6139318943023682
Epoch 260, Loss: 1.6132997274398804
Epoch 270, Loss: 1.6127172708511353
Epoch 280, Lo

In [685]:
# Show the hazard report (print() renders the string's line breaks).
print(cr3)

                                                   precision    recall  f1-score   support

                                        Aflatoxin       0.00      0.00      0.00         2
                                   abnormal smell       0.00      0.00      0.00         1
                                  alcohol content       0.00      0.00      0.00         0
                                        alkaloids       0.00      0.00      0.00         1
                                        allergens       0.00      0.00      0.00         3
                                           almond       0.00      0.00      0.00        13
             altered organoleptic characteristics       0.00      0.00      0.00         1
                                        amygdalin       0.00      0.00      0.00         1
                           antibiotics, vet drugs       0.00      0.00      0.00         1
                                    bacillus spp.       0.00      0.00      0.00         

In [686]:
# Train and evaluate on the fine-grained 'product' target; cr4 is the text report.
# NOTE(review): in the saved output the loss flattens out around 2.96 and most
# visible report rows are 0.00 - worth investigating before trusting this model.
cr4 = train_and_eval_model_for_label(label='product', encoder_dict=encoder_dict)

Epoch 0, Loss: 6.932307243347168
Epoch 10, Loss: 5.560369491577148
Epoch 20, Loss: 4.402565956115723
Epoch 30, Loss: 3.567417860031128
Epoch 40, Loss: 3.223235845565796
Epoch 50, Loss: 3.0862696170806885
Epoch 60, Loss: 3.0261213779449463
Epoch 70, Loss: 2.9988696575164795
Epoch 80, Loss: 2.985868453979492
Epoch 90, Loss: 2.978632926940918
Epoch 100, Loss: 2.9744555950164795
Epoch 110, Loss: 2.971348524093628
Epoch 120, Loss: 2.969273567199707
Epoch 130, Loss: 2.9677157402038574
Epoch 140, Loss: 2.966475009918213
Epoch 150, Loss: 2.96547532081604
Epoch 160, Loss: 2.964646816253662
Epoch 170, Loss: 2.9639503955841064
Epoch 180, Loss: 2.963353157043457
Epoch 190, Loss: 2.9628374576568604
Epoch 200, Loss: 2.9623866081237793
Epoch 210, Loss: 2.9619905948638916
Epoch 220, Loss: 2.961639404296875
Epoch 230, Loss: 2.961325168609619
Epoch 240, Loss: 2.961042642593384
Epoch 250, Loss: 2.9607863426208496
Epoch 260, Loss: 2.960554599761963
Epoch 270, Loss: 2.960341453552246
Epoch 280, Loss: 2.960

In [687]:
# Show the product report (print() renders the string's line breaks).
print(cr4)

                                                         precision    recall  f1-score   support

                                 Catfishes (freshwater)       0.40      1.00      0.57         2
                                  Fishes not identified       0.29      0.80      0.42         5
                               Not classified pork meat       0.00      0.00      0.00         2
                    Precooked cooked pork meat products       0.00      0.00      0.00         1
                                          Veggie Burger       0.00      0.00      0.00         2
                                        adobo seasoning       0.00      0.00      0.00         1
                                     after dinner mints       0.00      0.00      0.00         1
                                        alfalfa sprouts       0.00      0.00      0.00         3
                                                  algae       0.00      0.00      0.00         1
                             