In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
import json
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import spacy
import Utility as util
from Utility import REDataset_entities, REModelWithAttention

In [2]:
df_fewRel = pd.read_csv('fewRel_entities.csv')

In [3]:
df_aug_falcon = pd.read_csv('aug_falcon_entities.csv')

In [4]:
df_aug_falcon = util.sampleData(df_aug_falcon,'relation') # this code is for sampling

In [4]:
print(f'orignal dataset = {len(df_fewRel)} and augumented data = {len(df_aug_falcon)}')

orignal dataset = 44800 and augumented data = 3200


In [3]:
# Hold out 20% of df_fewRel as the test set; random_state fixes the split so it is reproducible.
train_df, test_df = train_test_split(df_fewRel, test_size=0.2, random_state=42) # The test set is taken from the original data only
print(f'Train set = {len(train_df)} and test set = {len(test_df)}')

Train set = 35840 and test set = 8960


In [7]:
# NOTE(review): the dataset-building cell constructs REDataset from train_df, not df_combined —
# confirm which frame each run is meant to train on.
df_combined = pd.concat([train_df, df_aug_falcon], ignore_index=True) # combine the augmented data with the original FewRel training data
print(len(df_combined))

39040


In [4]:
# Remove the 'entities' column from both splits.
# NOTE: the results are bound back to the same names, so running this cell a second time
# raises KeyError because the column is already gone.
train_df = train_df.drop(columns=['entities'])
test_df = test_df.drop(columns=['entities'])

In [5]:
# Encode the relation labels of both splits as integer ids.
# The encoder is fitted on the training relations only and then reused for the test
# split, so both splits share one relation -> id mapping and label_encoder.classes_
# describes the classes the model is trained on. Re-fitting on the test split would
# build a second mapping that can differ whenever the two splits do not contain
# exactly the same set of relations.
# transform() raises ValueError if the test split contains a relation unseen in training.
label_encoder = LabelEncoder()
train_df['relation_id'] = label_encoder.fit_transform(train_df['relation'])
test_df['relation_id'] = label_encoder.transform(test_df['relation'])

In [6]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Choose the same model as in your model
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [8]:
# Passed to util.REDataset as its third argument; presumably the tokenizer
# truncation/padding length — confirm in Utility.
max_seq_length = 128
train_dataset = util.REDataset(train_df, tokenizer, max_seq_length)
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = util.REDataset(test_df, tokenizer, max_seq_length)
# Evaluation does not benefit from shuffling; a fixed order keeps batch composition
# (and therefore per-batch averaged losses) identical from run to run.
# NOTE(review): this loader is also handed to util.train as the validation loader,
# so early stopping is decided on the test set — consider a separate validation split.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [9]:
model = REModelWithAttention(tokenizer, num_classes=len(label_encoder.classes_)).to(device)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


In [13]:
train_loss, valid_loss = util.train(model, train_loader, test_loader, criterion, optimizer, device, patience=4, num_epochs=20) # 7 % with entities

Epoch [1/20] - Training Loss: 2.2417- Validation Loss: 1.3332
Epoch [2/20] - Training Loss: 1.1481- Validation Loss: 1.0907
Epoch [3/20] - Training Loss: 0.7950- Validation Loss: 1.0404
Epoch [4/20] - Training Loss: 0.5755- Validation Loss: 1.0393
Epoch [5/20] - Training Loss: 0.4227- Validation Loss: 1.0597
Epoch [6/20] - Training Loss: 0.3233- Validation Loss: 1.1274
Epoch [7/20] - Training Loss: 0.2529- Validation Loss: 1.1750
Epoch [8/20] - Training Loss: 0.2034- Validation Loss: 1.2572
Early stopping after 8 epochs without improvement.


In [21]:
# device is passed explicitly, matching the other evaluate_model calls in this notebook
evaluate_model(model, test_loader, device) # 7 % with entities

Accuracy: 0.7079
Precision: 0.7132
Recall: 0.7059
F1 Score: 0.7055


In [14]:
train_loss, valid_loss = util.train(model, train_loader, test_loader, criterion, optimizer, device, patience=4, num_epochs=20) # 3 % with entities

Epoch [1/20] - Training Loss: 2.1620- Validation Loss: 1.3198
Epoch [2/20] - Training Loss: 1.1457- Validation Loss: 1.1031
Epoch [3/20] - Training Loss: 0.8042- Validation Loss: 1.0131
Epoch [4/20] - Training Loss: 0.5729- Validation Loss: 1.0141
Epoch [5/20] - Training Loss: 0.4173- Validation Loss: 1.0388
Epoch [6/20] - Training Loss: 0.3083- Validation Loss: 1.0921
Epoch [7/20] - Training Loss: 0.2330- Validation Loss: 1.1676
Early stopping after 7 epochs without improvement.


In [18]:
evaluate_model(model, test_loader,device) # 3 % with entities

Accuracy: 0.7213
Precision: 0.7215
Recall: 0.7201
F1 Score: 0.7167


In [8]:
train_loss, valid_loss = util.train(model, train_loader, test_loader, criterion, optimizer, device, patience=4, num_epochs=20) # Original data

Epoch [1/20] - Training Loss: 2.1018- Validation Loss: 1.3351
Epoch [2/20] - Training Loss: 1.0879- Validation Loss: 1.0731
Epoch [3/20] - Training Loss: 0.7618- Validation Loss: 0.9812
Epoch [4/20] - Training Loss: 0.5373- Validation Loss: 1.0099
Epoch [5/20] - Training Loss: 0.3895- Validation Loss: 1.0136
Epoch [6/20] - Training Loss: 0.2866- Validation Loss: 1.0684
Epoch [7/20] - Training Loss: 0.2177- Validation Loss: 1.1077
Early stopping after 7 epochs without improvement.


In [11]:
evaluate_model(model, test_loader,device) # original data

Accuracy: 0.7215
Precision: 0.7239
Recall: 0.7191
F1 Score: 0.7169


In [15]:
train_loss, valid_loss = util.train(model, train_loader, test_loader, criterion, optimizer, device, patience=4, num_epochs=20) # without entities 7%

Epoch [1/20] - Training Loss: 2.2260- Validation Loss: 1.3595
Epoch [2/20] - Training Loss: 1.1679- Validation Loss: 1.1595
Epoch [3/20] - Training Loss: 0.8275- Validation Loss: 1.1447
Epoch [4/20] - Training Loss: 0.6153- Validation Loss: 1.2201
Epoch [5/20] - Training Loss: 0.4624- Validation Loss: 1.2791
Epoch [6/20] - Training Loss: 0.3590- Validation Loss: 1.3769
Epoch [7/20] - Training Loss: 0.2815- Validation Loss: 1.4314
Early stopping after 7 epochs without improvement.


In [17]:
evaluate_model(model, test_loader,device) # without entities 7%

Accuracy: 0.6616
Precision: 0.6609
Recall: 0.6601
F1 Score: 0.6539


In [10]:
train_loss, valid_loss = util.train(model, train_loader, test_loader, criterion, optimizer, device, patience=4, num_epochs=20) # without entities, original data

Epoch [1/20] - Training Loss: 2.1552- Validation Loss: 1.3565
Epoch [2/20] - Training Loss: 1.1240- Validation Loss: 1.1456
Epoch [3/20] - Training Loss: 0.8066- Validation Loss: 1.1516
Epoch [4/20] - Training Loss: 0.5917- Validation Loss: 1.1909
Epoch [5/20] - Training Loss: 0.4344- Validation Loss: 1.2529
Epoch [6/20] - Training Loss: 0.3204- Validation Loss: 1.3646
Early stopping after 6 epochs without improvement.


In [12]:
evaluate_model(model, test_loader,device) # without entities, original data

Accuracy: 0.6675
Precision: 0.6655
Recall: 0.6655
F1 Score: 0.6614


In [11]:
def evaluate_model(model, data_loader, device=None):
    """Print accuracy and macro-averaged precision, recall and F1 for ``model``.

    The model is switched to eval mode and run without gradient tracking over
    every batch of ``data_loader``; the predicted class of each sample is the
    index of the largest score along dimension 1 of the model output.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        data_loader: iterable yielding ``(input_ids, attention_mask, targets)``
            tensor triples.
        device: device the inputs are moved to before the forward pass.
            Optional; when omitted, the device of the model's first parameter
            is used (CPU if the model has no parameters).

    Returns:
        None. The four metrics are printed with four decimal places.
    """
    if device is None:
        # Fall back to wherever the model lives so callers may omit the argument.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for input_ids, attention_mask, targets in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(input_ids, attention_mask)

            # Only the arg-max index is needed; the max score itself is discarded.
            _, predicted = torch.max(outputs, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    accuracy = accuracy_score(all_targets, all_predictions)
    # 'macro' averages the per-class scores with equal weight for every class.
    precision = precision_score(all_targets, all_predictions, average='macro')
    recall = recall_score(all_targets, all_predictions, average='macro')
    f1 = f1_score(all_targets, all_predictions, average='macro')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
import json
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import spacy
import Utility as util
from Utility import REDataset_entities, REModelWithAttention

In [2]:
df_fewRel = pd.read_csv('fewRel_entities.csv')

In [3]:
df_aug_falcon = pd.read_csv('aug_falcon_entities.csv')

In [4]:
df_aug_falcon = util.sampleData(df_aug_falcon,'relation') # this code is for sampling

In [4]:
print(f'orignal dataset = {len(df_fewRel)} and augumented data = {len(df_aug_falcon)}')

orignal dataset = 44800 and augumented data = 3200


In [3]:
# Hold out 20% of df_fewRel as the test set; random_state fixes the split so it is reproducible.
train_df, test_df = train_test_split(df_fewRel, test_size=0.2, random_state=42) # The test set is taken from the original data only
print(f'Train set = {len(train_df)} and test set = {len(test_df)}')

Train set = 35840 and test set = 8960


In [7]:
# NOTE(review): the dataset-building cell constructs REDataset from train_df, not df_combined —
# confirm which frame each run is meant to train on.
df_combined = pd.concat([train_df, df_aug_falcon], ignore_index=True) # combine the augmented data with the original FewRel training data
print(len(df_combined))

39040


In [4]:
# Remove the 'entities' column from both splits.
# NOTE: the results are bound back to the same names, so running this cell a second time
# raises KeyError because the column is already gone.
train_df = train_df.drop(columns=['entities'])
test_df = test_df.drop(columns=['entities'])

In [5]:
# Encode the relation labels of both splits as integer ids.
# The encoder is fitted on the training relations only and then reused for the test
# split, so both splits share one relation -> id mapping and label_encoder.classes_
# describes the classes the model is trained on. Re-fitting on the test split would
# build a second mapping that can differ whenever the two splits do not contain
# exactly the same set of relations.
# transform() raises ValueError if the test split contains a relation unseen in training.
label_encoder = LabelEncoder()
train_df['relation_id'] = label_encoder.fit_transform(train_df['relation'])
test_df['relation_id'] = label_encoder.transform(test_df['relation'])

In [6]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Choose the same model as in your model
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [8]:
# Passed to util.REDataset as its third argument; presumably the tokenizer
# truncation/padding length — confirm in Utility.
max_seq_length = 128
train_dataset = util.REDataset(train_df, tokenizer, max_seq_length)
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = util.REDataset(test_df, tokenizer, max_seq_length)
# Evaluation does not benefit from shuffling; a fixed order keeps batch composition
# (and therefore per-batch averaged losses) identical from run to run.
# NOTE(review): this loader is also handed to util.train as the validation loader,
# so early stopping is decided on the test set — consider a separate validation split.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [9]:
model = REModelWithAttention(tokenizer, num_classes=len(label_encoder.classes_)).to(device)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


In [13]:
train_loss, valid_loss = util.train(model, train_loader, test_loader, criterion, optimizer, device, patience=4, num_epochs=20) # 7 % with entities

Epoch [1/20] - Training Loss: 2.2417- Validation Loss: 1.3332
Epoch [2/20] - Training Loss: 1.1481- Validation Loss: 1.0907
Epoch [3/20] - Training Loss: 0.7950- Validation Loss: 1.0404
Epoch [4/20] - Training Loss: 0.5755- Validation Loss: 1.0393
Epoch [5/20] - Training Loss: 0.4227- Validation Loss: 1.0597
Epoch [6/20] - Training Loss: 0.3233- Validation Loss: 1.1274
Epoch [7/20] - Training Loss: 0.2529- Validation Loss: 1.1750
Epoch [8/20] - Training Loss: 0.2034- Validation Loss: 1.2572
Early stopping after 8 epochs without improvement.


In [21]:
# device is passed explicitly, matching the other evaluate_model calls in this notebook
evaluate_model(model, test_loader, device) # 7 % with entities

Accuracy: 0.7079
Precision: 0.7132
Recall: 0.7059
F1 Score: 0.7055


In [14]:
train_loss, valid_loss = util.train(model, train_loader, test_loader, criterion, optimizer, device, patience=4, num_epochs=20) # 3 % with entities

Epoch [1/20] - Training Loss: 2.1620- Validation Loss: 1.3198
Epoch [2/20] - Training Loss: 1.1457- Validation Loss: 1.1031
Epoch [3/20] - Training Loss: 0.8042- Validation Loss: 1.0131
Epoch [4/20] - Training Loss: 0.5729- Validation Loss: 1.0141
Epoch [5/20] - Training Loss: 0.4173- Validation Loss: 1.0388
Epoch [6/20] - Training Loss: 0.3083- Validation Loss: 1.0921
Epoch [7/20] - Training Loss: 0.2330- Validation Loss: 1.1676
Early stopping after 7 epochs without improvement.


In [18]:
evaluate_model(model, test_loader,device) # 3 % with entities

Accuracy: 0.7213
Precision: 0.7215
Recall: 0.7201
F1 Score: 0.7167


In [8]:
train_loss, valid_loss = util.train(model, train_loader, test_loader, criterion, optimizer, device, patience=4, num_epochs=20) # Original data

Epoch [1/20] - Training Loss: 2.1018- Validation Loss: 1.3351
Epoch [2/20] - Training Loss: 1.0879- Validation Loss: 1.0731
Epoch [3/20] - Training Loss: 0.7618- Validation Loss: 0.9812
Epoch [4/20] - Training Loss: 0.5373- Validation Loss: 1.0099
Epoch [5/20] - Training Loss: 0.3895- Validation Loss: 1.0136
Epoch [6/20] - Training Loss: 0.2866- Validation Loss: 1.0684
Epoch [7/20] - Training Loss: 0.2177- Validation Loss: 1.1077
Early stopping after 7 epochs without improvement.


In [11]:
evaluate_model(model, test_loader,device) # original data

Accuracy: 0.7215
Precision: 0.7239
Recall: 0.7191
F1 Score: 0.7169


In [15]:
train_loss, valid_loss = util.train(model, train_loader, test_loader, criterion, optimizer, device, patience=4, num_epochs=20) # without entities 7%

Epoch [1/20] - Training Loss: 2.2260- Validation Loss: 1.3595
Epoch [2/20] - Training Loss: 1.1679- Validation Loss: 1.1595
Epoch [3/20] - Training Loss: 0.8275- Validation Loss: 1.1447
Epoch [4/20] - Training Loss: 0.6153- Validation Loss: 1.2201
Epoch [5/20] - Training Loss: 0.4624- Validation Loss: 1.2791
Epoch [6/20] - Training Loss: 0.3590- Validation Loss: 1.3769
Epoch [7/20] - Training Loss: 0.2815- Validation Loss: 1.4314
Early stopping after 7 epochs without improvement.


In [17]:
evaluate_model(model, test_loader,device) # without entities 7%

Accuracy: 0.6616
Precision: 0.6609
Recall: 0.6601
F1 Score: 0.6539


In [10]:
train_loss, valid_loss = util.train(model, train_loader, test_loader, criterion, optimizer, device, patience=4, num_epochs=20) # without entities, original data

Epoch [1/20] - Training Loss: 2.1552- Validation Loss: 1.3565
Epoch [2/20] - Training Loss: 1.1240- Validation Loss: 1.1456
Epoch [3/20] - Training Loss: 0.8066- Validation Loss: 1.1516
Epoch [4/20] - Training Loss: 0.5917- Validation Loss: 1.1909
Epoch [5/20] - Training Loss: 0.4344- Validation Loss: 1.2529
Epoch [6/20] - Training Loss: 0.3204- Validation Loss: 1.3646
Early stopping after 6 epochs without improvement.


In [12]:
evaluate_model(model, test_loader,device) # without entities, original data

Accuracy: 0.6675
Precision: 0.6655
Recall: 0.6655
F1 Score: 0.6614


In [11]:
def evaluate_model(model, data_loader, device=None):
    """Print accuracy and macro-averaged precision, recall and F1 for ``model``.

    The model is switched to eval mode and run without gradient tracking over
    every batch of ``data_loader``; the predicted class of each sample is the
    index of the largest score along dimension 1 of the model output.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        data_loader: iterable yielding ``(input_ids, attention_mask, targets)``
            tensor triples.
        device: device the inputs are moved to before the forward pass.
            Optional; when omitted, the device of the model's first parameter
            is used (CPU if the model has no parameters).

    Returns:
        None. The four metrics are printed with four decimal places.
    """
    if device is None:
        # Fall back to wherever the model lives so callers may omit the argument.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for input_ids, attention_mask, targets in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(input_ids, attention_mask)

            # Only the arg-max index is needed; the max score itself is discarded.
            _, predicted = torch.max(outputs, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    accuracy = accuracy_score(all_targets, all_predictions)
    # 'macro' averages the per-class scores with equal weight for every class.
    precision = precision_score(all_targets, all_predictions, average='macro')
    recall = recall_score(all_targets, all_predictions, average='macro')
    f1 = f1_score(all_targets, all_predictions, average='macro')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [9]:
data = """
Epoch [1/20] - Training Loss: 1.5920- Validation Loss: 0.5885
Epoch [2/20] - Training Loss: 0.4043- Validation Loss: 0.4213
Epoch [3/20] - Training Loss: 0.2274- Validation Loss: 0.4154
Epoch [4/20] - Training Loss: 0.1438- Validation Loss: 0.4207
Epoch [5/20] - Training Loss: 0.0939- Validation Loss: 0.4386
Epoch [6/20] - Training Loss: 0.0646- Validation Loss: 0.5011
Epoch [7/20] - Training Loss: 0.0483- Validation Loss: 0.5251
"""

# Tokenise every log line on whitespace. Token 5 is the training loss with a
# trailing '-' attached (e.g. "1.5920-"); the last token is the validation loss.
epoch_tokens = [row.split() for row in data.strip().split('\n')]

# One float per epoch, in log order
training_losses = [float(tokens[5].split('-')[0]) for tokens in epoch_tokens]
validation_losses = [float(tokens[-1]) for tokens in epoch_tokens]

# Show both series
print("Training Losses:", training_losses)
print("Validation Losses:", validation_losses)


Training Losses: [1.592, 0.4043, 0.2274, 0.1438, 0.0939, 0.0646, 0.0483]
Validation Losses: [0.5885, 0.4213, 0.4154, 0.4207, 0.4386, 0.5011, 0.5251]
