 ## Explore data

In [34]:
import pandas as pd
import re
import spacy

# Caricare il modello di spaCy per l'analisi sintattica e semantica
nlp = spacy.load("en_core_web_sm")

# Definire le relazioni esistenti nel dataset
RELATIONS = [
    "Cause-Effect", "Instrument-Agency", "Product-Producer", "Content-Container",
    "Entity-Origin", "Entity-Destination", "Component-Whole", "Member-Collection",
    "Message-Topic", "Other"
]

def load_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for i in range(0, len(lines), 2):  # Ogni esempio è su 2 righe (frase + relazione)
        if '"' in lines[i]:  
            sentence = re.findall(r'"(.*?)"', lines[i])[0]
            
            # Estrai la relazione senza la parte con le parentesi
            relation = re.sub(r'\(.*?\)', '', lines[i + 1]).strip() if i + 1 < len(lines) else "Other"
            
            # Estrai le entità
            entities = re.findall(r'<e1>(.*?)</e1>', lines[i]) + re.findall(r'<e2>(.*?)</e2>', lines[i])
            data.append({'sentence': sentence, 'entities': entities, 'relation': relation})

    return pd.DataFrame(data)

# Caricare i dati
train_df = load_data('data/TRAIN_FILE.TXT')

# Mostra le prime righe per verifica
print(train_df.head())


                                            sentence  \
0  The system as described above has its greatest...   
1  The <e1>child</e1> was carefully wrapped and b...   
2  The <e1>author</e1> of a keygen uses a <e2>dis...   
3  A misty <e1>ridge</e1> uprises from the <e2>su...   
4  The <e1>student</e1> <e2>association</e2> is t...   

                    entities           relation  
0  [configuration, elements]    Component-Whole  
1            [child, cradle]              Other  
2     [author, disassembler]  Instrument-Agency  
3             [ridge, surge]              Other  
4     [student, association]  Member-Collection  


# Rule based approach

### l'idea che ho avuto è di prendere il verbo e di controllarne il prefisso

In [49]:
import spacy

# Caricare il modello di spaCy per il POS tagging
nlp = spacy.load("en_core_web_sm")

# Lista di prefissi e le loro corrispondenti relazioni
prefix_to_relation = {
    "cau": "Cause-Effect",  # Cause, causare
    "lead": "Cause-Effect",  # Lead
    "resu": "Cause-Effect",  # Result
    "use": "Instrument-Agency",  # Use
    "oper": "Instrument-Agency",  # Operate
    "make": "Product-Producer",  # Make
    "prod": "Product-Producer",  # Produce, Produce
    "manu": "Product-Producer",  # Manufacture
    "crea": "Product-Producer",  # Create
    "cont": "Content-Container",  # Contain
    "insi": "Content-Container",  # Inside
    "orig": "Entity-Origin",  # Originate
    "from": "Entity-Origin",  # From
    "move": "Entity-Destination",  # Move
    "to": "Entity-Destination",  # To
    "part": "Component-Whole",  # Part
    "incl": "Component-Whole",  # Include
    "belon": "Member-Collection",  # Belong
    "membe": "Member-Collection",  # Member of
    "talk": "Message-Topic",  # Talk
    "topi": "Message-Topic",  # Topic
}

def classify_relation_using_verb_with_prefixes(sentence, entities):
    # Controlla se entities contiene esattamente due elementi
    if len(entities) != 2:
        return "Other"  # Se non ci sono esattamente due entità, ritorna "Other"

    # Analizzare la frase con spaCy
    doc = nlp(sentence)

    # Estrarre e1 ed e2 dalle entità
    e1, e2 = entities

    # Trova il verbo tra e1 ed e2
    verb = None
    for token in doc:
        if token.pos_ == "VERB":
            verb = token.lemma_  # Usare la forma lemmatizzata del verbo
            break  # Prendiamo il primo verbo che troviamo tra e1 ed e2

    # Se c'è un verbo, controlliamo i prefissi
    if verb:
        verb_prefix = verb[:4]  # Prendi le prime 4 lettere del verbo
        if verb_prefix in prefix_to_relation:
            return prefix_to_relation[verb_prefix]  # Restituisci la relazione basata sul prefisso

    return "Other"  # Se nessun prefisso corrisponde

# Applicare la funzione al training set
train_df['predicted_relation'] = train_df.apply(lambda row: classify_relation_using_verb_with_prefixes(row['sentence'], row['entities']), axis=1)

# Visualizza i risultati
print(train_df[['sentence', 'entities', 'predicted_relation']].head())


                                               sentence              entities  \
4554  Kevin Kim is surprised by the arrival of an an...     [archer, century]   
322   The <e1>name</e1> of the aquarium was derived ...          [name, vote]   
6871  The injured <e1>worker</e1> fell from an eleva...     [worker, surface]   
5756  A spaghetti <e1>pie</e1> served at a church di...      [pie, poisoning]   
1147  Using a teleport, the <e1>protagonist</e1> tra...  [protagonist, place]   

     predicted_relation  
4554              Other  
322               Other  
6871              Other  
5756              Other  
1147  Instrument-Agency  


In [50]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Calcolare l'accuratezza
accuracy = accuracy_score(train_df['relation'], train_df['predicted_relation'])
print(f"Accuracy: {accuracy * 100:.2f}%")

# Calcolare Precision, Recall e F1-Score per ciascuna classe
precision, recall, f1, support = precision_recall_fscore_support(
    train_df['relation'], train_df['predicted_relation'], average=None, labels=train_df['relation'].unique()
)

# Visualizzare i risultati delle metriche per ogni classe
metrics_df = pd.DataFrame({
    'Class': train_df['relation'].unique(),
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'Support': support
})

print(metrics_df)

# Calcolare la F1-Score macro-media
macro_f1 = f1.mean()
print(f"Macro F1-Score: {macro_f1:.2f}")

# Calcolare la F1-Score pesata
weighted_f1 = (precision * support).sum() / support.sum()
print(f"Weighted F1-Score: {weighted_f1:.2f}")


Accuracy: 19.25%
                 Class  Precision    Recall  F1-Score  Support
0        Entity-Origin   1.000000  0.027027  0.052632       74
1         Cause-Effect   0.500000  0.011364  0.022222       88
2   Entity-Destination   1.000000  0.042254  0.081081       71
3     Product-Producer   0.250000  0.112903  0.155556       62
4                Other   0.177953  0.904000  0.297368      125
5      Component-Whole   0.250000  0.012048  0.022989       83
6    Member-Collection   0.000000  0.000000  0.000000       53
7    Instrument-Agency   0.333333  0.093023  0.145455       43
8        Message-Topic   0.000000  0.000000  0.000000       44
9                        0.000000  0.000000  0.000000       10
10   Content-Container   0.333333  0.069767  0.115385       43
Macro F1-Score: 0.08
Weighted F1-Score: 0.40


  _warn_prf(average, modifier, msg_start, len(result))


# Feature vector + machine learning solution based

### Creazione del feature vector

In [51]:
import spacy
import pandas as pd
import re

# Caricare il modello di spaCy per il POS tagging
nlp = spacy.load("en_core_web_sm")

# Lista di prefissi e le loro corrispondenti relazioni
prefix_to_relation = {
    "cau": "Cause-Effect",  # Cause, causare
    "lead": "Cause-Effect",  # Lead
    "resu": "Cause-Effect",  # Result
    "use": "Instrument-Agency",  # Use
    "oper": "Instrument-Agency",  # Operate
    "make": "Product-Producer",  # Make
    "prod": "Product-Producer",  # Produce, Produce
    "manu": "Product-Producer",  # Manufacture
    "crea": "Product-Producer",  # Create
    "cont": "Content-Container",  # Contain
    "insi": "Content-Container",  # Inside
    "orig": "Entity-Origin",  # Originate
    "from": "Entity-Origin",  # From
    "move": "Entity-Destination",  # Move
    "to": "Entity-Destination",  # To
    "part": "Component-Whole",  # Part
    "incl": "Component-Whole",  # Include
    "belon": "Member-Collection",  # Belong
    "membe": "Member-Collection",  # Member of
    "talk": "Message-Topic",  # Talk
    "topi": "Message-Topic",  # Topic
}

# Funzione per estrarre il verbo e creare un feature vector basato sui prefissi
def extract_features_based_on_verb_with_prefixes(sentence, entities):
    # Controlla se entities contiene esattamente due entità
    if len(entities) != 2:
        return None  # Se non ci sono esattamente due entità, non possiamo estrarre caratteristiche

    # Rimuovere i tag delle entità dalla frase
    sentence = re.sub(r'<e1>|</e1>|<e2>|</e2>', '', sentence)

    # Analizzare la frase con spaCy
    doc = nlp(sentence)

    # Estrarre e1 ed e2 dalle entità
    e1, e2 = entities

    # Trova il verbo tra e1 ed e2
    verb = None
    for token in doc:
        if token.pos_ == "VERB":
            verb = token.lemma_  # Usare la forma lemmatizzata del verbo
            break  # Prendiamo il primo verbo che troviamo tra e1 ed e2

    # Se non è stato trovato un verbo tra le entità, ritorna None
    if not verb:
        return None

    # Estrarre il prefisso del verbo (le prime 4 lettere)
    verb_prefix = verb[:4]  # Prendi le prime 4 lettere del verbo

    # Creazione di un feature vector
    feature_vector = {
        "sentence_length": len(sentence.split()),  # Numero di parole nella frase
        "has_verb_prefix_cause": 1 if verb_prefix == "cau" else 0,
        "has_verb_prefix_lead": 1 if verb_prefix == "lead" else 0,
        "has_verb_prefix_resu": 1 if verb_prefix == "resu" else 0,
        "has_verb_prefix_use": 1 if verb_prefix == "use" else 0,
        "has_verb_prefix_oper": 1 if verb_prefix == "oper" else 0,
        "has_verb_prefix_make": 1 if verb_prefix == "make" else 0,
        "has_verb_prefix_prod": 1 if verb_prefix == "prod" else 0,
        "has_verb_prefix_manu": 1 if verb_prefix == "manu" else 0,
        "has_verb_prefix_crea": 1 if verb_prefix == "crea" else 0,
        "has_verb_prefix_cont": 1 if verb_prefix == "cont" else 0,
        "has_verb_prefix_insi": 1 if verb_prefix == "insi" else 0,
        "has_verb_prefix_orig": 1 if verb_prefix == "orig" else 0,
        "has_verb_prefix_from": 1 if verb_prefix == "from" else 0,
        "has_verb_prefix_move": 1 if verb_prefix == "move" else 0,
        "has_verb_prefix_to": 1 if verb_prefix == "to" else 0,
        "has_verb_prefix_part": 1 if verb_prefix == "part" else 0,
        "has_verb_prefix_incl": 1 if verb_prefix == "incl" else 0,
        "has_verb_prefix_belon": 1 if verb_prefix == "belon" else 0,
        "has_verb_prefix_membe": 1 if verb_prefix == "membe" else 0,
        "has_verb_prefix_talk": 1 if verb_prefix == "talk" else 0,
        "has_verb_prefix_topi": 1 if verb_prefix == "topi" else 0,
    }

    return feature_vector

# Applicare la funzione al training set per creare il feature vector
train_features = []
for _, row in train_df.iterrows():
    sentence = row['sentence']
    entities = row['entities']
    feature_vector = extract_features_based_on_verb_with_prefixes(sentence, entities)
    
    if feature_vector:  # Aggiungi solo se c'è un feature vector
        feature_vector['actual_relation'] = row['relation']
        train_features.append(feature_vector)

# Creazione del dataframe con i feature vectors
train_feature_df = pd.DataFrame(train_features)

# Visualizza i primi 5 feature vector
print(train_feature_df.head())
print(train_feature_df.size)



   sentence_length  has_verb_prefix_cause  has_verb_prefix_lead  \
0               16                      0                     0   
1               14                      0                     0   
2               21                      0                     0   
3               16                      0                     0   
4               10                      0                     0   

   has_verb_prefix_resu  has_verb_prefix_use  has_verb_prefix_oper  \
0                     0                    0                     0   
1                     0                    0                     0   
2                     0                    0                     0   
3                     0                    0                     0   
4                     0                    1                     0   

   has_verb_prefix_make  has_verb_prefix_prod  has_verb_prefix_manu  \
0                     0                     0                     0   
1                     0           

 ### SVM

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assicurati che la colonna di relazione (target) sia separata
X = train_feature_df.drop(columns=['actual_relation'])  # Le caratteristiche
y = train_feature_df['actual_relation']  # Le etichette (relazioni effettive)

# Suddividere i dati in training e validation set (80% / 20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Creare il modello SVM
svm_model = SVC(kernel='linear', random_state=42)

# Allenare il modello
svm_model.fit(X_train, y_train)

# Predire le etichette sul set di validazione
y_pred = svm_model.predict(X_val)

# Calcolare l'accuratezza
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Matrice di confusione
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Report di classificazione
print("\nClassification Report:")
print(classification_report(y_val, y_pred))



Accuracy: 0.1908
Confusion Matrix:
[[ 0  4  0  0  0  0  0  0 16  0]
 [ 0  2  0  0  0  1  0  0  8  0]
 [ 0  0  0  0  0  0  0  0  7  0]
 [ 0  0  0  1  0  0  0  0 11  0]
 [ 0  2  0  0  0  0  0  0  9  2]
 [ 0  1  0  0  0  3  0  0  4  0]
 [ 0  1  0  0  0  0  0  0  9  0]
 [ 0  0  0  0  0  0  0  0 16  0]
 [ 0  3  0  1  0  0  0  0 18  1]
 [ 0  3  0  0  0  0  0  0  7  1]]

Classification Report:
                    precision    recall  f1-score   support

      Cause-Effect       0.00      0.00      0.00        20
   Component-Whole       0.12      0.18      0.15        11
 Content-Container       0.00      0.00      0.00         7
Entity-Destination       0.50      0.08      0.14        12
     Entity-Origin       0.00      0.00      0.00        13
 Instrument-Agency       0.75      0.38      0.50         8
 Member-Collection       0.00      0.00      0.00        10
     Message-Topic       0.00      0.00      0.00        16
             Other       0.17      0.78      0.28        23
  Product

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Neural Network

In [48]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.cuda.amp import autocast, GradScaler  # For mixed precision

# Load the DistilBERT tokenizer and model (smaller model)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Label encoding for relations
label_encoder = LabelEncoder()
train_df['relation_label'] = label_encoder.fit_transform(train_df['relation'])

# Split train_df into train (80%), val (10%), and test (10%)
train_df, test_val_df = train_test_split(train_df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_val_df, test_size=0.5, random_state=42)

# Custom Dataset for BERT tokenization
class RelationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=32):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        sentence = row['sentence']
        relation_label = row['relation_label']

        # Tokenize the sentence
        encoding = self.tokenizer(
            sentence, 
            add_special_tokens=True, 
            truncation=True, 
            padding='max_length', 
            max_length=self.max_length, 
            return_attention_mask=True, 
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(relation_label, dtype=torch.long)
        }

# Instantiate Datasets and DataLoaders
train_dataset = RelationDataset(train_df, tokenizer)
val_dataset = RelationDataset(val_df, tokenizer)
test_dataset = RelationDataset(test_df, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

class SelfAttentionModel(nn.Module):
    def __init__(self, hidden_dim, num_labels):
        super(SelfAttentionModel, self).__init__()
        
        # Caricamento di un modello più piccolo (DistilBERT) per ridurre la memoria
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        
        # Layer Fully Connected per la classificazione
        self.fc = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        # Ottenere l'output di DistilBERT (last hidden state)
        distilbert_output = self.distilbert(input_ids, attention_mask=attention_mask)
        
        # Estrarre l'output dell'ultimo stato nascosto
        hidden_state = distilbert_output.last_hidden_state
        
        # Usare l'output del token [CLS] per la classificazione (primo token)
        cls_token_output = hidden_state[:, 0, :]  # Prendere il primo token (CLS)

        # Passare attraverso un layer fully connected per ottenere le predizioni
        logits = self.fc(cls_token_output)  # La forma ora è (batch_size, num_labels)
        
        return logits



# Model, Loss, Optimizer
model = SelfAttentionModel(hidden_dim=768, num_labels=len(label_encoder.classes_))
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Mixed Precision Scaler
scaler = GradScaler()

# Training Loop
def train_epoch(model, dataloader, loss_fn, optimizer):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass with mixed precision
        optimizer.zero_grad()
        with autocast():  # Use mixed precision
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
        
        total_loss += loss.item()

        # Backward pass with mixed precision
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Calculate accuracy
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions.double() / len(dataloader.dataset)
    return avg_loss, accuracy

# Validation Loop
def eval_epoch(model, dataloader, loss_fn):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()

            # Calculate accuracy
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions.double() / len(dataloader.dataset)
    return avg_loss, accuracy

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Training and Evaluation
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    
    # Train for one epoch
    train_loss, train_accuracy = train_epoch(model, train_dataloader, loss_fn, optimizer)
    print(f"Training Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}")
    
    # Evaluate on validation data
    val_loss, val_accuracy = eval_epoch(model, val_dataloader, loss_fn)
    print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")
    
    # Evaluate on test data
    test_loss, test_accuracy = eval_epoch(model, test_dataloader, loss_fn)
    print(f"Test Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}")

    # Clear GPU memory after each epoch
    torch.cuda.empty_cache()


  scaler = GradScaler()
  with autocast():  # Use mixed precision


Epoch 1/3
Training Loss: 2.2629, Accuracy: 0.1825
Validation Loss: 2.0538, Accuracy: 0.2874
Test Loss: 2.1287, Accuracy: 0.2299
Epoch 2/3
Training Loss: 1.8115, Accuracy: 0.3736
Validation Loss: 1.5244, Accuracy: 0.5057
Test Loss: 1.6548, Accuracy: 0.4828
Epoch 3/3
Training Loss: 1.2332, Accuracy: 0.6609
Validation Loss: 1.1375, Accuracy: 0.6322
Test Loss: 1.1666, Accuracy: 0.6667
