 ## Explore data

In [87]:
import pandas as pd
import re
import spacy

# Caricare il modello di spaCy per l'analisi sintattica e semantica
nlp = spacy.load("en_core_web_sm")

# Definire le relazioni esistenti nel dataset
RELATIONS = [
    "Cause-Effect", "Instrument-Agency", "Product-Producer", "Content-Container",
    "Entity-Origin", "Entity-Destination", "Component-Whole", "Member-Collection",
    "Message-Topic", "Other"
]

def load_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for i in range(0, len(lines), 2):  # Ogni esempio è su 2 righe (frase + relazione)
        if '"' in lines[i]:  
            sentence = re.findall(r'"(.*?)"', lines[i])[0]
            relation = lines[i + 1].strip() if i + 1 < len(lines) else "Other"
            entities = re.findall(r'<e1>(.*?)</e1>', lines[i]) + re.findall(r'<e2>(.*?)</e2>', lines[i])
            data.append({'sentence': sentence, 'entities': entities, 'relation': relation})

    return pd.DataFrame(data)

# Caricare i dati
train_df = load_data('data/TRAIN_FILE.TXT')
test_df = load_data('data/TEST_FILE.txt')

# Mostra le prime righe per verifica
print(test_df.head())


                                            sentence            entities  \
0  The most common <e1>audits</e1> were about <e2...     [audits, waste]   
1  The school <e1>master</e1> teaches the lesson ...     [master, stick]   
2  Avian <e1>influenza</e1> is an infectious dise...  [influenza, virus]   
3  A child is told a <e1>lie</e1> for several yea...      [lie, parents]   
4  The disgusting scene was retaliation against h...       [room, house]   

                                            relation  
0  8002\t"The <e1>company</e1> fabricates plastic...  
1  8004\t"The suspect dumped the dead <e1>body</e...  
2  8006\t"The <e1>ear</e1> of the African <e2>ele...  
3  8008\t"Skype, a free software, allows a <e1>ho...  
4  8010\t"This <e1>thesis</e1> defines the <e2>cl...  


# Rule based approach

### l'idea che ho avuto è di prendere il verbo e di controllarne il prefisso

In [90]:
import spacy

# Caricare il modello di spaCy per il POS tagging
nlp = spacy.load("en_core_web_sm")

def classify_relation_using_verb(sentence, entities):
    # Controllare se entities contiene esattamente due elementi
    if len(entities) != 2:
        return "Other"  # Se non ci sono esattamente due entità, ritorna "Other"

    # Analizzare la frase con spaCy
    doc = nlp(sentence)

    # Estrarre e1 ed e2 dalle entità
    e1, e2 = entities

    # Trova il verbo tra e1 e e2
    verb = None
    for token in doc:
        if token.pos_ == "VERB":
            verb = token.lemma_
            break  # Prendiamo il primo verbo che troviamo tra e1 ed e2

    # Controllare il verbo per determinare la relazione
    if verb:
        if verb in ["cause", "lead", "result"]:
            return "Cause-Effect"
        elif verb in ["use", "operate"]:
            return "Instrument-Agency"
        elif verb in ["make", "produce", "manufacture", "create"]:
            return "Product-Producer"
        elif verb in ["contain", "inside"]:
            return "Content-Container"
        elif verb in ["originate", "from"]:
            return "Entity-Origin"
        elif verb in ["move", "to"]:
            return "Entity-Destination"
        elif verb in ["part of", "include"]:
            return "Component-Whole"
        elif verb in ["belong", "member of"]:
            return "Member-Collection"
        elif verb in ["talk", "topic"]:
            return "Message-Topic"
    
    return "Other"  # Se nessun verbo corrisponde

# Applicare la funzione al training set
train_df['predicted_relation'] = train_df.apply(lambda row: classify_relation_using_verb(row['sentence'], row['entities']), axis=1)

# Visualizza i risultati
print(train_df[['sentence', 'entities', 'predicted_relation']].head())



                                            sentence  \
0  The system as described above has its greatest...   
1  The <e1>child</e1> was carefully wrapped and b...   
2  The <e1>author</e1> of a keygen uses a <e2>dis...   
3  A misty <e1>ridge</e1> uprises from the <e2>su...   
4  The <e1>student</e1> <e2>association</e2> is t...   

                    entities predicted_relation  
0  [configuration, elements]              Other  
1            [child, cradle]              Other  
2     [author, disassembler]  Instrument-Agency  
3             [ridge, surge]              Other  
4     [student, association]              Other  


In [91]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Calcolare l'accuratezza
accuracy = accuracy_score(train_df['relation'], train_df['predicted_relation'])
print(f"Accuracy: {accuracy * 100:.2f}%")

# Calcolare Precision, Recall e F1-Score per ciascuna classe
precision, recall, f1, support = precision_recall_fscore_support(
    train_df['relation'], train_df['predicted_relation'], average=None, labels=train_df['relation'].unique()
)

# Visualizzare i risultati delle metriche per ogni classe
metrics_df = pd.DataFrame({
    'Class': train_df['relation'].unique(),
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'Support': support
})

print(metrics_df)

# Calcolare la F1-Score macro-media
macro_f1 = f1.mean()
print(f"Macro F1-Score: {macro_f1:.2f}")

# Calcolare la F1-Score pesata
weighted_f1 = (precision * support).sum() / support.sum()
print(f"Weighted F1-Score: {weighted_f1:.2f}")


Accuracy: 16.15%
                        Class  Precision    Recall  F1-Score  Support
0      Component-Whole(e2,e1)   0.000000  0.000000  0.000000      471
1                       Other   0.181102  0.929787  0.303156     1410
2    Instrument-Agency(e2,e1)   0.000000  0.000000  0.000000      407
3    Member-Collection(e1,e2)   0.000000  0.000000  0.000000       78
4         Cause-Effect(e2,e1)   0.000000  0.000000  0.000000      659
5   Entity-Destination(e1,e2)   0.000000  0.000000  0.000000      844
6    Content-Container(e1,e2)   0.000000  0.000000  0.000000      374
7        Message-Topic(e1,e2)   0.000000  0.000000  0.000000      490
8     Product-Producer(e2,e1)   0.000000  0.000000  0.000000      394
9    Member-Collection(e2,e1)   0.000000  0.000000  0.000000      612
10       Entity-Origin(e1,e2)   0.000000  0.000000  0.000000      568
11        Cause-Effect(e1,e2)   0.000000  0.000000  0.000000      344
12                              0.000000  0.000000  0.000000      118
13 

  _warn_prf(average, modifier, msg_start, len(result))


# Feature vector + machine learning solution based

### Creazione del feature vector

In [92]:
import spacy
import pandas as pd

# Caricare il modello di spaCy per il POS tagging
nlp = spacy.load("en_core_web_sm")

# Funzione per estrarre il verbo e creare un feature vector
def extract_features_based_on_verb(sentence, entities):
    # Controllare se entities contiene esattamente due entità
    if len(entities) != 2:
        return None  # Se non ci sono esattamente due entità, non possiamo estrarre caratteristiche

    sentence = re.sub(r'<e1>|</e1>|<e2>|</e2>', '', sentence)
    # Analizzare la frase con spaCy
    doc = nlp(sentence)

    # Estrarre e1 ed e2 dalle entità
    e1, e2 = entities

    # Trova il verbo tra e1 e e2
    verb = None
    for token in doc:
        if token.pos_ == "VERB":
            verb = token.lemma_
            break  # Prendiamo il primo verbo che troviamo tra e1 ed e2

    # Se non è stato trovato un verbo tra le entità, ritorna None
    if not verb:
        return None

    # Creazione di un feature vector
    feature_vector = {
        #"e1": e1,  --------> not numerical feature
        #"e2": e2,
        #"verb": verb,
        #"verb_lemma": token.lemma_,
        #"verb_pos": token.pos_,
        "sentence_length": len(sentence.split()),  # Numero di parole nella frase
        "has_verb_prefix_cause": 1 if verb in ["cause", "lead", "result"] else 0,
        "has_verb_prefix_use": 1 if verb in ["use", "operate"] else 0,
        "has_verb_prefix_make": 1 if verb in ["make", "produce", "manufacture", "create"] else 0,
        "has_verb_prefix_contain": 1 if verb in ["contain", "inside"] else 0,
        "has_verb_prefix_originate": 1 if verb in ["originate", "from"] else 0,
        "has_verb_prefix_move": 1 if verb in ["move", "to"] else 0,
        "has_verb_prefix_part": 1 if verb in ["part of", "include"] else 0,
        "has_verb_prefix_belong": 1 if verb in ["belong", "member of"] else 0,
        "has_verb_prefix_talk": 1 if verb in ["talk", "topic"] else 0
    }

    return feature_vector

# Applicare la funzione al training set per creare il feature vector
train_features = []
for _, row in train_df.iterrows():
    sentence = row['sentence']
    entities = row['entities']
    feature_vector = extract_features_based_on_verb(sentence, entities)
    
    if feature_vector:  # Aggiungi solo se c'è un feature vector
        feature_vector['actual_relation'] = row['relation']
        train_features.append(feature_vector)

# Creazione del dataframe con i feature vectors
train_feature_df = pd.DataFrame(train_features)

# Visualizza i primi 5 feature vector
print(train_feature_df.head())
print(train_feature_df.size)


   sentence_length  has_verb_prefix_cause  has_verb_prefix_use  \
0               16                      0                    0   
1               15                      0                    0   
2               15                      0                    1   
3               12                      0                    0   
4               35                      1                    0   

   has_verb_prefix_make  has_verb_prefix_contain  has_verb_prefix_originate  \
0                     0                        0                          0   
1                     0                        0                          0   
2                     0                        0                          0   
3                     0                        0                          0   
4                     0                        0                          0   

   has_verb_prefix_move  has_verb_prefix_part  has_verb_prefix_belong  \
0                     0                     0          

 ### SVM

In [93]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assicurati che la colonna di relazione (target) sia separata
X = train_feature_df.drop(columns=['actual_relation'])  # Le caratteristiche
y = train_feature_df['actual_relation']  # Le etichette (relazioni effettive)

# Suddividere i dati in training e validation set (80% / 20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Creare il modello SVM
svm_model = SVC(kernel='linear', random_state=42)

# Allenare il modello
svm_model.fit(X_train, y_train)

# Predire le etichette sul set di validazione
y_pred = svm_model.predict(X_val)

# Calcolare l'accuratezza
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Matrice di confusione
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Report di classificazione
print("\nClassification Report:")
print(classification_report(y_val, y_pred))



Accuracy: 0.2339
Confusion Matrix:
[[  0  38   0   0   0   2   7   0   0   0   0   0   0   0   0  29   0   0]
 [  0  32   0   1   0   0   8   0   0   0   0   0   0   0   0  74   0   0]
 [  0   0   0   0   0   1  13   0   0   0   4   0   0   0   0  66   0   0]
 [  0   0   0   5   0   6  17   0   0   0   3   0   0   0   0  61   0   0]
 [  0   0   0   1   0   8   7   0   0   0   1   0   0   0   0  38   0   0]
 [  0   0   0   2   0   3   4   0   0   0   0   0   0   0   0  18   0   0]
 [  0   0   0   0   0   0 118   0   0   0   0   0   0   0   0  40   0   0]
 [  0   0   0   0   0   0  32   2   0   0   0   0   0   0   0  69   0   0]
 [  0   0   0   1   0   0   3   0   0   0   3   0   0   0   0  14   0   0]
 [  0   0   0   0   0   0   2   0   0   0   5   0   0   0   0   9   0   0]
 [  0   0   0   0   0   0  25   0   0   0   7   0   0   0   0  50   0   0]
 [  0   0   0   0   0   0   1   0   0   0   1   0   0   0   0  10   0   0]
 [  0   2   0   0   0   1  10   0   0   0   3   0   0   0   0 103

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
