## Classifier - Try 1

Classify if a article has the Morality Frame or not using just the article as input.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
  os.chdir('drive/MyDrive/Git/MasterThesis/data')
else:
  os.chdir('../../data/')

labels_path = "data/en/train-labels-subtask-2.txt"
articles_path = "data/en/train-articles-subtask-2/"

In [3]:
import pandas as pd

# Read the dev-labels-subtask-2.txt file
labels_df = pd.read_csv(labels_path, sep="\t")

# Rename the columns for easier processing
labels_df.columns = ["article_id", "frames"]


labels_df.head()

Unnamed: 0,article_id,frames
0,832959523,"Morality,Security_and_defense,Policy_prescript..."
1,833039623,"Political,Crime_and_punishment,External_regula..."
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq..."
3,814777937,"Political,Morality,Fairness_and_equality,Exter..."
4,821744708,"Policy_prescription_and_evaluation,Political,L..."


In [4]:
# A function to read the article text given its ID
def get_article_content(article_id):
    try:
        with open(f"{articles_path}/article{article_id}.txt", "r") as f:
            return f.read()
    except FileNotFoundError:
        return None

df = labels_df

# Apply the function to get the article content
df["content"] = df["article_id"].apply(get_article_content)

# Drop rows where content could not be found
df.dropna(subset=["content"], inplace=True)

df.head()


Unnamed: 0,article_id,frames,content
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...


In [5]:
# Split the frames column into a list of frames
df["frames_list"] = df["frames"].str.split(",")

# create for each frame a new column with the frame as name and 1 if the frame is present in the article and 0 if not
for frame in df["frames_list"].explode().unique():
    df[frame] = df["frames_list"].apply(lambda x: 1 if frame in x else 0)

df.head()

Unnamed: 0,article_id,frames,content,frames_list,Morality,Security_and_defense,Policy_prescription_and_evaluation,Legality_Constitutionality_and_jurisprudence,Economic,Political,Crime_and_punishment,External_regulation_and_reputation,Public_opinion,Fairness_and_equality,Capacity_and_resources,Quality_of_life,Cultural_identity,Health_and_safety
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...,"[Morality, Security_and_defense, Policy_prescr...",1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...,"[Political, Crime_and_punishment, External_reg...",0,0,1,1,0,1,1,1,1,0,0,0,0,0
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...,"[Political, Crime_and_punishment, Fairness_and...",0,0,0,1,0,1,1,1,0,1,0,0,0,0
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...,"[Political, Morality, Fairness_and_equality, E...",1,1,0,0,1,1,0,1,1,1,0,0,0,0
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...,"[Policy_prescription_and_evaluation, Political...",0,0,1,1,0,1,0,1,0,0,0,0,0,0


In [6]:
X = df["content"]
y = df.drop(columns=["article_id", "frames", "frames_list", "content"])

In [7]:
X.head()

0    How Theresa May Botched\n\nThose were the time...
1    Robert Mueller III Rests His Case—Dems NEVER W...
2    Robert Mueller Not Recommending Any More Indic...
3    The Far Right Is Trying to Co-opt the Yellow V...
4    ‘Special place in hell’ for those who promoted...
Name: content, dtype: object

In [8]:
y.head()

Unnamed: 0,Morality,Security_and_defense,Policy_prescription_and_evaluation,Legality_Constitutionality_and_jurisprudence,Economic,Political,Crime_and_punishment,External_regulation_and_reputation,Public_opinion,Fairness_and_equality,Capacity_and_resources,Quality_of_life,Cultural_identity,Health_and_safety
0,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,1,1,1,1,0,0,0,0,0
2,0,0,0,1,0,1,1,1,0,1,0,0,0,0
3,1,1,0,0,1,1,0,1,1,1,0,0,0,0
4,0,0,1,1,0,1,0,1,0,0,0,0,0,0


In [9]:
# modify y to binary classification morality or Security_and_defense
y = y[["Morality"]]
y.head()

Unnamed: 0,Morality
0,1
1,0
2,0
3,1
4,0


In [10]:
y.value_counts()

Morality
0           230
1           202
dtype: int64

In [11]:
len(X), len(y)

(432, 432)

### Create the PyTorch Model

In [12]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/7.7 MB[0m [31m4.4 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/7.7 MB[0m [31m26.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.7/7.7 MB[0m [31m76.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/295.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW

In [14]:
# Tokenize
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [15]:
class ArticleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long).squeeze()
        }

In [32]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='transformers')

In [27]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Create DataLoaders for train and test sets
BATCH_SIZE = 16

train_dataset = ArticleDataset(X_train, y_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = ArticleDataset(X_test, y_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


### 2. Model Definition

In [25]:
# Clear previous models from GPU
if 'model' in locals():
    model.cpu()
    model = None
    del model
    torch.cuda.empty_cache()

# Define the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the device
model = model.to(device)

Using bos_token, but it is not set yet.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using eos_token, but it is not set yet.


In [26]:
def compute_accuracy(logits, labels):
    _, preds = torch.max(logits, dim=1)
    correct = (preds == labels).float().sum()
    accuracy = correct / len(labels)
    return accuracy.item()

### 3. Training

In [28]:
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss().to(device)

# Training loop
for epoch in range(EPOCHS):
    total_loss = 0
    total_accuracy = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        accuracy = compute_accuracy(outputs.logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_accuracy += accuracy

        # Clear GPU memory
        del input_ids
        del attention_mask
        del labels
        torch.cuda.empty_cache()

    avg_loss = total_loss / len(train_loader)
    avg_accuracy = total_accuracy / len(train_loader)

    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {avg_loss:.4f} | Accuracy: {avg_accuracy:.4f}")


Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Epoch 1/5 | Loss: 0.6087 | Accuracy: 0.6725


Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Epoch 2/5 | Loss: 0.4154 | Accuracy: 0.8225


Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Epoch 3/5 | Loss: 0.3049 | Accuracy: 0.8875


Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Epoch 4/5 | Loss: 0.1777 | Accuracy: 0.9550


Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Epoch 5/5 | Loss: 0.0801 | Accuracy: 0.9775


### Validate model

In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Place model in evaluation mode
model.eval()

all_preds = []
all_labels = []

# Use torch.no_grad() to turn off gradient computation for faster inference
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.7500
Precision: 0.7273
Recall: 0.5000
F1 Score: 0.5926


Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


In [29]:
def predict_article(article, model, tokenizer, device):
    model.eval()  # Set the model to evaluation mode

    encoding = tokenizer.encode_plus(
        article,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

    return "Morality" if preds[0].item() == 1 else "Not Morality"


In [30]:
article = """EU Profits From Trading With UK While London Loses Money – Political Campaigner

With the Parliamentary vote on British Prime Minister Theresa May’s Brexit plan set to be held next month; President of the European Commission Jean Claude Juncker has criticised the UK’s preparations for their departure from the EU.
But is there any chance that May's deal will make it through parliament and if it fails, how could this ongoing political deadlock finally come to an end?
Sputnik spoke with political campaigner Michael Swadling for more…
Sputnik: Does Theresa May have any chance of getting her deal through Parliament on the 14th January?
Michael Swadling: I guess her only chance is if Labour decides that they want to dishonour democracy and effectively keep us in the EU.
© AP Photo / Pablo Martinez Monsivais UK 'In Need of Leadership', May's Brexit Deal Unwelcome to Trump - US Ambassador
There is a chance; as unfortunately there are many MPs who don't respect the vote and may just turn on it, but short of that I don't see any way the Conservatives would vote for it, and the majority is slender as it is, as the DUP is bitterly against it, and I can't see the Lib Dems voting for it, so it will only be if there are enough, what I can describe as remoaner MPs, that the deal won't be dead in the water.
Sputnik: What could be a solution to the political chaos if the Prime Minister's deal is not approved?
Michael Swadling: The EU withdrawal act is in place; we'll leave and revert to WTO terms and that works, that's fine.
I often use the example of an iPhone to people; that's a piece of technology which is manufactured in China, uses American technology and these are two countries we deal with on WTO terms, this isn't a fantasy, stuck in a port somewhere, there isn't a massive tariff, this is the world that really exists today.
When we exit the EU on WTO terms; that will be fine for whatever trading we do with the EU, just as well as it does for our trade in China.READ MORE: UK Finance Chief Bashed for Failing to Unlock Money for No-Deal Brexit — Reports
Sputnik: Do you think that the EU needs the UK more than the UK needs the EU?
Michael Swadling: The EU makes a profit on its trade with the UK; the UK makes a loss on its trade with the EU.
They have a financial incentive to ensure that good trading relations continue far more than we do.
© REUTERS / Toby Melville UK Trade Minister Says '50-50' Chance Brexit Will Not Happen – Reports
The lifeblood and cash flow that keeps manufacturing in Europe going, comes from the city of London.
If someone in a city in Germany wants to do a deal with someone in Japan; the financial services of that are probably going through the city of London, they're not going through Frankfurt and Paris.
Views and opinions, expressed in the article are those of Michael Swadling and do not necessarily reflect those of Sputnik

"""
prediction = predict_article(article, model, tokenizer, device)
print(prediction)

Not Morality




### Other Memory Stuff

In [22]:
!nvidia-smi

Fri Oct  6 20:06:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   78C    P0    71W /  70W |  10131MiB / 15360MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [47]:
print(f"Allocated Memory: {torch.cuda.memory_allocated() / (1024**2)} MB")
print(f"Reserved Memory: {torch.cuda.memory_reserved() / (1024**2)} MB")

Allocated Memory: 14121.36328125 MB
Reserved Memory: 14226.0 MB


In [35]:
torch.cuda.empty_cache()

In [33]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in list(
                          locals().items())), key= lambda x: -x[1]):
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                     labels_df:  4.7 MiB
                            df:  4.7 MiB
                             X:  4.5 MiB
                            _5: 50.7 KiB
                            _4: 49.5 KiB
                            _7: 48.5 KiB
                             y:  3.5 KiB
                 BertTokenizer:  2.0 KiB
 BertForSequenceClassification:  2.0 KiB
                    DataLoader:  1.4 KiB
                         AdamW:  1.2 KiB
                            _3:  1.1 KiB
                       Dataset:  1.0 KiB
                ArticleDataset:  1.0 KiB
                          _i15: 1005.0 B
                          _i22:  984.0 B
                          _i24:  982.0 B
                          _i29:  982.0 B
                          _i20:  775.0 B
                            _8:  704.0 B
                          _i19:  656.0 B
                          _i33:  590.0 B
                           _i4:  522.0 B
                          _iii:  452.0 B
                

In [38]:
pip install numba



In [39]:
from numba import cuda
device = cuda.get_current_device()
device.reset()