# IR Assignment 03

## Sentiment Analysis 

In this assignment we are asked to perform sentiment analysis on the article we were given for assignment 01. This means that in stage 1 we built a corpus of positive (in our context) and negative words. In stage 2 we will use this corpus to perform sentiment analysis on the article.

> Install the required libraries

```bash
pip install tensorflow tensorflow_hub tensorflow_text transformers
```

In [None]:
# import libraries

import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, BertTokenizer

In [None]:
# Load the labelled sentences (CSV hosted in the course repository).
github_link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.03/extracted_sentences.csv?raw=true"

df = pd.read_csv(github_link)

# Map each textual label onto the integer class id used for training.
label_mapping = { 'pro-israeli': 0, 'pro-palestinian': 1, 'neutral': 2 }
df['label_int'] = df['label'].map(label_mapping)

# Labels that are not keys of `label_mapping` come back from `.map` as NaN.
# Drop those rows, then cast back to an integer dtype: the NaNs silently turn
# the column into float64, and class ids should be integers, not 0.0/1.0/2.0.
df = df.dropna(subset=['label_int']).astype({'label_int': 'int64'})

In [None]:
# Draw a reproducible random subset of at most 1000 rows.
# `DataFrame.sample` raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap n at the rows that are actually left
# after the unmapped labels were dropped.
df_subset = df.sample(n=min(1000, len(df)), random_state=1)

In [None]:
# Hold out 30% of the sampled sentences for testing.
from sklearn.model_selection import train_test_split

# Features are the raw sentences; targets are the integer-encoded labels.
X, y = df_subset['sentence'], df_subset['label_int']

# A fixed random_state makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Load the tokenizer that belongs to the checkpoint being fine-tuned.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_sentences(sentences, tokenizer, max_length):
    """Tokenize *sentences* and return only the model input fields.

    Args:
        sentences: Iterable of strings; it is materialized into a list before
            being handed to the tokenizer.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)`` whose
            result can be indexed by ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length passed to the tokenizer; padding to ``max_length``
            and truncation are both requested.

    Returns:
        A plain dict with exactly two keys, ``"input_ids"`` and
        ``"attention_mask"``, taken from the tokenizer output. TensorFlow
        tensors are requested via ``return_tensors="tf"``.
    """
    texts = list(sentences)
    encoded = tokenizer(
        texts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="tf",
    )

    # Keep only the fields the classifier is fed; drop anything else returned.
    wanted_keys = ("input_ids", "attention_mask")
    return {key: encoded[key] for key in wanted_keys}

# Encode both splits with the same tokenizer and the same fixed length
# (training split first, then the test split).
max_length = 128
X_train_tokenized, X_test_tokenized = (
    tokenize_sentences(split, tokenizer, max_length)
    for split in (X_train, X_test)
)

In [None]:
batch_size = 16

# Pair the tokenized inputs with their integer labels.
# Only the training set is shuffled (re-shuffled each epoch by default) so the
# batches differ between epochs; the seed keeps runs reproducible. The buffer
# covers the whole training set, giving a full shuffle.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(X_train_tokenized), y_train))
    .shuffle(buffer_size=len(y_train), seed=42)
    .batch(batch_size)
)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(X_test_tokenized), y_test)).batch(batch_size)

# Load the pre-trained checkpoint with a 3-way classification head
# (one output per entry in the label mapping).
model_name = "bert-base-uncased"
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Compile the model.
# NOTE: `model.compute_loss` must not be passed as the loss. On TF >= 2.8 that
# attribute resolves to the Keras `Model.compute_loss(x, y, y_pred, ...)`
# method, not a `(y_true, y_pred)` loss function. The classification head
# returns raw logits, hence `from_logits=True`.
# The metric is explicitly named "accuracy" so the training history exposes
# the 'accuracy' / 'val_accuracy' keys.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
)

In [None]:
# Fine-tune for a fixed number of epochs; the test split doubles as the
# validation set, so its metrics are recorded after every epoch.
EPOCHS = 5
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=EPOCHS,
    verbose=1,
)

# Final loss / accuracy on the held-out split.
# (Unpacking two values relies on the model being compiled with one metric.)
loss, accuracy = model.evaluate(test_dataset)
print(f"Test accuracy: {accuracy}")
print(f"Test loss: {loss}")

# Store the fine-tuned weights and the tokenizer files in the same directory.
model_name_save = "model_1"
model.save_pretrained(model_name_save)
tokenizer.save_pretrained(model_name_save)

In [None]:
# Draw training vs. validation accuracy per epoch on a single chart.
import matplotlib.pyplot as plt

# Each curve is labelled with the history key it was read from.
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric], label=metric)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0, 1])  # accuracy is a fraction, so pin the axis to the full range
plt.legend(loc='lower right')
plt.show()