<a href="https://colab.research.google.com/github/dattali18/IR_Assignments/blob/main/Assignment.04/IR_04_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import warnings

warnings.filterwarnings("ignore")

In [4]:
!pip install --upgrade pip



In [5]:
!pip install pandas numpy torch transformers datasets scikit-learn tqdm



In [6]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [8]:
link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.04/sentences.csv?raw=true"
df = pd.read_csv(link)
texts = df['sentence'].values
labels = df['label'].values

In [9]:
# take a subset of the df 100 from each of the 5 classes
df = df.groupby('label').apply(lambda x: x.sample(n=500, random_state=42)).reset_index(drop=True)
texts = df['sentence'].values
labels = df['label'].values

In [10]:
# check if the df contains 500 instances
df.groupby('label').count()

Unnamed: 0_level_0,id,sentence,type
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,500,476,500
1,500,500,500
2,500,500,500
3,500,500,500
4,500,500,500


In [11]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.15, random_state=42
)

In [12]:
# Create dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [13]:
# Create model class
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes=5):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.drop = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(output[0][:, 0, :])
        return self.fc(output)

In [14]:
# Initialize tokenizer and create datasets
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

In [15]:
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [16]:
# Initialize model and move to GPU
model = SentimentClassifier()
model = model.to(device)

In [17]:
# Training parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
n_epochs = 5

In [18]:
# Training loop
def train_model():
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc='Training'):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

In [19]:
# Evaluation loop
def evaluate_model():
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            _, predictions = torch.max(outputs, dim=1)

            total_loss += loss.item()
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = np.mean(np.array(all_predictions) == np.array(all_labels))
    return total_loss / len(val_loader), accuracy

In [20]:
# Training
for epoch in range(n_epochs):
    print(f'\nEpoch {epoch + 1}/{n_epochs}')
    train_loss = train_model()
    val_loss, val_accuracy = evaluate_model()

    print(f'Training Loss: {train_loss:.4f}')
    print(f'Validation Loss: {val_loss:.4f}')
    print(f'Validation Accuracy: {val_accuracy:.4f}')


Epoch 1/5


Training: 100%|██████████| 133/133 [00:26<00:00,  4.95it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.37it/s]


Training Loss: 0.7387
Validation Loss: 0.1756
Validation Accuracy: 0.9493

Epoch 2/5


Training: 100%|██████████| 133/133 [00:22<00:00,  5.79it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.36it/s]


Training Loss: 0.1003
Validation Loss: 0.1437
Validation Accuracy: 0.9573

Epoch 3/5


Training: 100%|██████████| 133/133 [00:23<00:00,  5.66it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.79it/s]


Training Loss: 0.0402
Validation Loss: 0.1156
Validation Accuracy: 0.9680

Epoch 4/5


Training: 100%|██████████| 133/133 [00:23<00:00,  5.54it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 13.52it/s]


Training Loss: 0.0237
Validation Loss: 0.1606
Validation Accuracy: 0.9547

Epoch 5/5


Training: 100%|██████████| 133/133 [00:24<00:00,  5.43it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.36it/s]

Training Loss: 0.0076
Validation Loss: 0.1725
Validation Accuracy: 0.9627





In [21]:
# Save the model
torch.save(model.state_dict(), 'sentiment_model_v2.pth')

In [22]:
# use the model for testing a sentence

e1 = "The Hamas terroriest have launch a rocket attack to major isreali cities"

In [23]:
# import torch
# from transformers import DistilBertTokenizer

# Assuming the SentimentClassifier class is already defined as provided

# Initialize the model and tokenizer
# model = SentimentClassifier()
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Function to classify a sentence and get probabilities
def classify_sentence_prob(sentence):
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=512)

    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get the model's output
    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'])

    # Apply softmax to get probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=1)

    return probabilities

# Example usage
# sentence = "This is a sample sentence for classification."
probabilities = classify_sentence_prob(e1)

print("Probabilities for each class:", probabilities)

Probabilities for each class: tensor([[7.5548e-05, 6.6228e-05, 8.5338e-05, 3.5502e-05, 9.9974e-01]],
       device='cuda:0')


In [24]:
# import torch
# from transformers import DistilBertTokenizer

# Assuming the SentimentClassifier class is already defined as provided

# Initialize the model and tokenizer
# model = SentimentClassifier()
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Function to classify a sentence and get one-hot encoded vector
def classify_sentence_class(sentence):
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get the model's output
    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'])

    # Apply softmax to get probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=1)

    # Get the predicted class (index of the max probability)
    predicted_class = torch.argmax(probabilities, dim=1).item()

    # Create a one-hot encoded vector
    one_hot_vector = torch.zeros(probabilities.size(1))
    one_hot_vector[predicted_class] = 1

    return one_hot_vector

# Example usage
# sentence = "This is a sample sentence for classification."
one_hot_vector = classify_sentence_class(e1)

print("One-hot encoded vector for the predicted class:", one_hot_vector)

One-hot encoded vector for the predicted class: tensor([0., 0., 0., 0., 1.])


In [25]:
def one_hot_to_class(one_hot_vector, map_class):
    # Find the index of the maximum value in the one-hot encoded vector
    class_index = torch.argmax(one_hot_vector).item()

    # Get the string representation of the class from the map_class dictionary
    class_string = map_class[class_index]

    return class_string

map_class = {0: 'pro-israeli', 1: 'pro-palestinan', 2: 'neutral', 3: 'anti-isreali', 4: 'anti-palestinian'}

one_hot_to_class(one_hot_vector, map_class)

'anti-palestinian'

In [26]:
def classify_sentence(sentence):
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Get the model's output
    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'])

    # Apply softmax to get probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=1)

    # get the str of the calss based on the one-hot-encoded vector
    predicted_class = torch.argmax(probabilities, dim=1).item()

    # Create a one-hot encoded vector
    one_hot_vector = torch.zeros(probabilities.size(1))
    one_hot_vector[predicted_class] = 1

    map_class = {0: 'pro-israeli', 1: 'pro-palestinan', 2: 'neutral', 3: 'anti-isreali', 4: 'anti-palestinian'}


    return probabilities, one_hot_to_class(one_hot_vector, map_class)

In [27]:
# get the data from the github repository
aj_url = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word/A_J_word.csv?raw=true"
bbc_url = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word/BBC_word.csv?raw=true"
jp_url = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word/J_P_word.csv?raw=true"
nyt_url = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.01/data/word/NYT_word.csv?raw=true"

In [28]:
import pandas as pd

# load the data
aj_df = pd.read_csv(aj_url)
bbc_df = pd.read_csv(bbc_url)
jp_df = pd.read_csv(jp_url)
nyt_df = pd.read_csv(nyt_url)

In [29]:
import re


def clean_text(text):
    # Normalize all types of single and double quotation marks to standard forms
    text = re.sub(r"[‘’`]", "'", text)  # Convert all single quote variations to '
    text = re.sub(r"[“”]", '"', text)  # Convert all double quote variations to "

    # remove any and all special characters since it will not be useful for our analysis
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    return text

def extract_all_sentences(df):
    # this will return a dict with key the id of the article "aj_1" for example
    # and a list of all the sentences in the article

    all_sentences = []

    for index, row in df.iterrows():
        text = row["document"]
        # TODO - ask gpt for a smarter sentence extratctor
        sentences = re.split(r"[.!?]", text)
        sentences = [sentence for sentence in sentences if sentence != ""]
        # clean the sentences
        sentences = [clean_text(sentence) for sentence in sentences]

        # for all sentence in sentences add to df
        for sentence in sentences:
            all_sentences.append({"id": row["id"], "document": sentence})

    return all_sentences

In [30]:
aj_sentences = extract_all_sentences(aj_df)
bbc_sentences = extract_all_sentences(bbc_df)
jp_sentences = extract_all_sentences(jp_df)
nyt_sentences = extract_all_sentences(nyt_df)

In [31]:
aj_df = pd.DataFrame(aj_sentences)
bbc_df = pd.DataFrame(bbc_sentences)
jp_df = pd.DataFrame(jp_sentences)
nyt_df = pd.DataFrame(nyt_sentences)

In [32]:
df = pd.DataFrame(columns=["id", "document", "pro-israeli", "pro-palestinan", "neutral", "anti-isreali", "anti-palestinian", "majority_class"])
df = pd.concat([df, aj_df], ignore_index=True)
df = pd.concat([df, bbc_df], ignore_index=True)
df = pd.concat([df, jp_df], ignore_index=True)
df = pd.concat([df, nyt_df], ignore_index=True)

df[["pro-israeli", "pro-palestinan", "neutral", "anti-isreali", "anti-palestinian"]] = 0
df['majority_class'] = ''

df.head()

Unnamed: 0,id,document,pro-israeli,pro-palestinan,neutral,anti-isreali,anti-palestinian,majority_class
0,aj_1,pope renews call for gaza ceasefire release o...,0,0,0,0,0,
1,aj_1,pope francis has renewed calls for an immedia...,0,0,0,0,0,
2,aj_2,biden is still the best us president israel co...,0,0,0,0,0,
3,aj_2,united states president ronald reagans order ...,0,0,0,0,0,
4,aj_3,israeli air strikes continue across gaza as tr...,0,0,0,0,0,


In [33]:
# rename document to sentnce in df

df = df.rename(columns={"document": "sentence"})

In [34]:

# now we wil use the model to classify each sentences for each article
# and we will classify the article based on the majority of the sentences

# TODO: load the model

# df is the new dataframe with id, class1, ..., calss5, majority_class
# the classi will contain the number of sentences in the article that belong to that class

# loop through every items in df
for index, row in df.iterrows():
  sentence = row['sentence']

  probs, cls = classify_sentence(sentence)
  # put the values in the df
  df.at[index, 'majority_class'] = cls
  # unpack the values in probs (len 5) to the 5 classes of ["pro-israeli", "pro-palestinan", "neutral", "anti-isreali", "anti-palestinian"]
  # map index to key
  map_class = {0: 'pro-israeli', 1: 'pro-palestinan', 2: 'neutral', 3: 'anti-isreali', 4: 'anti-palestinian'}

  for i in range(5):
    df.at[index, map_class[i]] = probs[0][i].item()

  # print at interval of 100 indexs
  if index % 100 == 0:
    print(f"processing index {index}")


df.head()

processing index 0
processing index 100
processing index 200
processing index 300
processing index 400
processing index 500
processing index 600
processing index 700
processing index 800
processing index 900
processing index 1000
processing index 1100
processing index 1200
processing index 1300
processing index 1400
processing index 1500
processing index 1600
processing index 1700
processing index 1800
processing index 1900
processing index 2000
processing index 2100
processing index 2200
processing index 2300
processing index 2400
processing index 2500
processing index 2600
processing index 2700
processing index 2800
processing index 2900
processing index 3000
processing index 3100
processing index 3200
processing index 3300
processing index 3400
processing index 3500
processing index 3600
processing index 3700
processing index 3800
processing index 3900
processing index 4000
processing index 4100
processing index 4200
processing index 4300
processing index 4400
processing index 4500


Unnamed: 0,id,sentence,pro-israeli,pro-palestinan,neutral,anti-isreali,anti-palestinian,majority_class
0,aj_1,pope renews call for gaza ceasefire release o...,0.999303,8.6e-05,0.000212,0.00031,8.8e-05,pro-israeli
1,aj_1,pope francis has renewed calls for an immedia...,0.999188,0.000102,0.000255,0.000333,0.000122,pro-israeli
2,aj_2,biden is still the best us president israel co...,7.5e-05,0.999781,6e-05,5.1e-05,3.3e-05,pro-palestinan
3,aj_2,united states president ronald reagans order ...,0.000434,9.7e-05,0.000106,0.999193,0.00017,anti-isreali
4,aj_3,israeli air strikes continue across gaza as tr...,0.000137,6.5e-05,0.000929,0.998556,0.000313,anti-isreali


In [35]:
# save the df into a csv file


df.to_csv("sentences_with_class.csv", index=False)