In [1]:
!pip install torch transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [2]:
import sys
import pandas as pd
from tqdm.auto import tqdm
import random
import numpy as np
import random
import torch
import gc
import os
import csv
import torch.nn as nn
import evaluate
from datetime import datetime
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_metric, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification,get_scheduler

In [3]:
# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Name of the pretrained transformer checkpoint; the tokenizer below and the
# model loaded later are both created from it.
CHECKPOINT = "xlm-roberta-base"  # transformer model checkpoint
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

# Show which device was selected.
print(DEVICE)

Downloading config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading sentencepiece.bpe.model:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

cuda:0


In [4]:
# Set the cuBLAS workspace configuration for this kernel process. PyTorch's
# reproducibility notes list this variable as a prerequisite for deterministic
# cuBLAS results; deterministic algorithms are switched on in the next cell.
%env CUBLAS_WORKSPACE_CONFIG = :4096:8
# Read the value back so the cell output confirms what was actually stored.
os.getenv('CUBLAS_WORKSPACE_CONFIG')

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


':4096:8'

In [5]:
# --- Determinism switches -------------------------------------------------
# Make CUDA kernel launches synchronous, stop cuDNN from auto-tuning its
# algorithm choice, and ask PyTorch to use deterministic algorithms.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)

# --- Global RNG seeds -----------------------------------------------------
# Seed the three process-wide generators (Python, NumPy legacy, PyTorch).
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# --- Dedicated generators -------------------------------------------------
# `g` is handed to the DataLoaders; `rng` drives the dataset shuffles.
g = torch.Generator().manual_seed(0)
rng = np.random.default_rng(seed=0)

In [6]:
def seed_worker(worker_id):
    """Seed NumPy and `random` inside a DataLoader worker process.

    Intended to be passed as ``worker_init_fn`` to a ``DataLoader``. The seed
    is derived from ``torch.initial_seed()``, which inside a worker already
    differs per worker and is reproducible from the DataLoader's generator.
    The previous hard-coded seed of 0 would have given every worker the same
    NumPy/`random` stream.

    Args:
        worker_id: Index of the worker being initialised. Unused directly,
            because the per-worker offset is already part of the torch seed.
    """
    # np.random.seed only accepts 32-bit values, so fold the torch seed down.
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [7]:
def tokenize_function(examples):
    """Tokenize the ``"prompt"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"prompt"`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The output of the module-level ``tokenizer`` for those prompts, with
        truncation enabled.
    """
    prompts = examples["prompt"]
    return tokenizer(prompts, truncation=True)

In [13]:
def load_train_data(fileName):
    """Build the training DataLoader from a JSON-lines file of prompts and labels.

    The file is expected to provide a ``prompt`` column (text) and a
    ``completion`` column (label); the class balance for labels 0 (ham) and
    1 (spam) is printed before the data is shuffled and tokenized.

    Args:
        fileName: Path (non-empty ``str`` or ``os.PathLike``) of the JSON-lines
            file to read. Any other value, such as the placeholder ``0``,
            selects the default French training set.

    Returns:
        A ``DataLoader`` yielding padded batches of up to 16 tokenized examples
        whose ``completion`` column has been renamed to ``labels``.
    """
    default_path = "/datasets/jsonlemails/FrenchPrompts_prepared_train.jsonl"
    # Honour the argument when it is a usable path; it used to be ignored.
    if isinstance(fileName, (str, os.PathLike)) and fileName:
        path = fileName
    else:
        path = default_path

    df = pd.read_json(path, lines=True)

    ham = int((df['completion'] == 0).sum())
    spam = int((df['completion'] == 1).sum())

    total = ham + spam
    print(f"Total = {total}")
    # The ratios are informational only; do not crash when neither label occurs.
    hamratio = ham / total if total else 0.0
    spamratio = spam / total if total else 0.0

    print(f"Ham:{ham}")
    print(f"Spam:{spam}")
    print(f"Ham Ration:{hamratio}")
    print(f"Spam Ration:{spamratio}")

    raw_datasets = Dataset.from_pandas(df)

    # Shuffle once here with the shared NumPy generator; the DataLoader itself
    # then iterates in this fixed order.
    raw_datasets = raw_datasets.shuffle(generator=rng)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

    # Keep only the tokenizer outputs plus the label column, under the name
    # ``labels``, so batches can be passed to the model as keyword arguments.
    tokenized_datasets = tokenized_datasets.remove_columns("prompt")
    tokenized_datasets = tokenized_datasets.rename_column("completion", "labels")
    tokenized_datasets.set_format("torch")

    # Drop the pandas index column if the conversion from a DataFrame added one.
    if "__index_level_0__" in tokenized_datasets.column_names:
        tokenized_datasets = tokenized_datasets.remove_columns("__index_level_0__")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainloader = DataLoader(
        tokenized_datasets,
        batch_size=16,
        collate_fn=data_collator,
        worker_init_fn=seed_worker,
        generator=g,
    )

    return trainloader

In [9]:
def load_test_data(fileName):
    """Build the evaluation DataLoader from a JSON-lines file of prompts and labels.

    The file is expected to provide a ``prompt`` column (text) and a
    ``completion`` column (label); the class balance for labels 0 (ham) and
    1 (spam) is printed before the data is shuffled and tokenized.

    Args:
        fileName: Path (non-empty ``str`` or ``os.PathLike``) of the JSON-lines
            file to read. Any other value, such as the placeholder ``0``,
            selects the default Enron validation set.

    Returns:
        A ``DataLoader`` yielding padded batches of up to 16 tokenized examples
        whose ``completion`` column has been renamed to ``labels``.
    """
    default_path = "/datasets/jsonlemails/enron1_prepared_valid.jsonl"
    # Honour the argument when it is a usable path; it used to be ignored.
    if isinstance(fileName, (str, os.PathLike)) and fileName:
        path = fileName
    else:
        path = default_path

    df = pd.read_json(path, lines=True)

    ham = int((df['completion'] == 0).sum())
    spam = int((df['completion'] == 1).sum())

    total = ham + spam
    print(f"Total = {total}")
    # The ratios are informational only; do not crash when neither label occurs.
    hamratio = ham / total if total else 0.0
    spamratio = spam / total if total else 0.0

    print(f"Ham:{ham}")
    print(f"Spam:{spam}")
    print(f"Ham Ration:{hamratio}")
    print(f"Spam Ration:{spamratio}")

    raw_datasets = Dataset.from_pandas(df)
    raw_datasets = raw_datasets.shuffle(generator=rng)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    # Keep only the tokenizer outputs plus the label column, under the name
    # ``labels``, so batches can be passed to the model as keyword arguments.
    tokenized_datasets = tokenized_datasets.remove_columns("prompt")
    tokenized_datasets = tokenized_datasets.rename_column("completion", "labels")
    tokenized_datasets.set_format("torch")

    # Drop the pandas index column if the conversion from a DataFrame added one.
    if "__index_level_0__" in tokenized_datasets.column_names:
        tokenized_datasets = tokenized_datasets.remove_columns("__index_level_0__")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    testloader = DataLoader(
        tokenized_datasets,
        batch_size=16,
        collate_fn=data_collator,
        worker_init_fn=seed_worker,
        generator=g,
    )
    return testloader

In [10]:
def train(net, trainloader, epochs):
    """Fine-tune ``net`` in place on ``trainloader`` for ``epochs`` passes.

    Uses AdamW (learning rate 5e-5) with a linear schedule and no warm-up.
    Prints the loss and wall-clock time of every step, then the mean loss over
    all steps.

    Args:
        net: Model that accepts a batch as keyword arguments and returns an
            object with a ``loss`` attribute.
        trainloader: Iterable of batches (mappings of tensors) with a length.
        epochs: Number of full passes over ``trainloader``.

    Returns:
        None.
    """
    total_steps = len(trainloader) * epochs
    optimizer = AdamW(net.parameters(), lr=5e-5)

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    progress_bar = tqdm(range(total_steps))

    total_train_loss = 0
    net.train()
    for epoch in range(epochs):
        for batch in trainloader:
            start_time = datetime.now()
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = net(**batch)
            loss = outputs.loss
            # Read the scalar once and reuse it for both the sum and the log line.
            batch_loss = loss.item()
            total_train_loss += batch_loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            end_time = datetime.now()
            print("Epoch: " + str(epoch + 1) + "\tTime: " + str(end_time - start_time) + "\tLoss: " + str(batch_loss))

    # The loss is summed over every epoch, so the mean must be taken over all
    # steps; dividing by len(trainloader) alone inflated it by `epochs`.
    avg_train_loss = total_train_loss / total_steps if total_steps else 0.0
    print("Average training loss: {0:.2f}".format(avg_train_loss))

In [11]:
def test(net, testloader):
    """Evaluate ``net`` on ``testloader`` and compute classification metrics.

    Accuracy is computed once; precision, recall and F1 are each computed
    twice, treating label 0 and then label 1 as the positive class.

    Args:
        net: Model that accepts a batch as keyword arguments and returns an
            object with a ``logits`` attribute.
        testloader: Iterable of batches (mappings of tensors) that include a
            ``"labels"`` entry.

    Returns:
        A 7-tuple ``(accuracy, precision0, precision1, recall0, recall1,
        f1_0, f1_1)`` holding the result of each metric's ``compute()`` call.
    """
    accuracy_metric = evaluate.load("accuracy")

    # One metric object per (metric name, positive label) pair, created in the
    # same order as the values are returned.
    per_class_metrics = {
        (metric_name, pos_label): evaluate.load(metric_name)
        for metric_name in ("precision", "recall", "f1")
        for pos_label in (0, 1)
    }

    net.eval()
    for batch in testloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.no_grad():
            outputs = net(**batch)

        # Only the logits are needed here; the loss was previously read into an
        # unused local, forcing a device sync on every batch.
        predictions = torch.argmax(outputs.logits, dim=-1)
        references = batch["labels"]

        accuracy_metric.add_batch(predictions=predictions, references=references)
        for metric in per_class_metrics.values():
            metric.add_batch(predictions=predictions, references=references)

    accuracy = accuracy_metric.compute()

    # dicts preserve insertion order, so this unpacks as
    # precision(0), precision(1), recall(0), recall(1), f1(0), f1(1).
    precision0, precision1, recall0, recall1, f1_0, f1_1 = [
        metric.compute(pos_label=pos_label)
        for (_, pos_label), metric in per_class_metrics.items()
    ]

    return accuracy, precision0, precision1, recall0, recall1, f1_0, f1_1

In [14]:
# Build both loaders up front; each call prints the class balance of its dataset.
# `0` is a placeholder rather than a real file name, so each loader reads its
# built-in dataset path.
trainloader = load_train_data(0)
testloader = load_test_data(0)


Total = 472
Ham:232
Spam:240
Ham Ration:0.4915254237288136
Spam Ration:0.5084745762711864


  0%|          | 0/1 [00:00<?, ?ba/s]

Total = 996
Ham:713
Spam:283
Ham Ration:0.7158634538152611
Spam Ration:0.28413654618473894


  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Load the pretrained checkpoint as a sequence classifier configured for two
# labels, then move it onto the selected device.
net = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=2)
net = net.to(DEVICE)

# Fine-tune for two passes over the training loader.
train(net, trainloader, epochs=2)

In [None]:
# Evaluate the fine-tuned model on the test loader and print every metric.
# Precision, recall and F1 are each reported twice: once with label 0 and once
# with label 1 treated as the positive class.
accuracy, precision0, precision1, recall0, recall1, f1_0, f1_1 = test(net, testloader)
print(f"""Final test set performance:
    \n\t{accuracy}\n\t

    \n\t(Pos Label 0){precision0}
    \n\t(Pos Label 1){precision1}\n\t

    \n\t(Pos Label 0){recall0}
    \n\t(Pos Label 1){recall1}\n\t

    \n\t(Pos Label 0){f1_0}
    \n\t(Pos Label 1){f1_1}\n\t
    """)

In [None]:
# List the contents of the downsampled-data directory. No other cell visible in
# this notebook reads from this location.
!ls /datasets/downsampleddata