In [4]:
!pip install torch transformers datasets evaluate

[0m

In [5]:
import sys
import pandas as pd
from tqdm.auto import tqdm
import random
import numpy as np
import random
import torch
import gc
import os
import csv
import torch.nn as nn
import evaluate
from datetime import datetime
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_metric, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification,get_scheduler, GPT2Tokenizer, GPT2Model

In [6]:
# Checkpoint name shared by the tokenizer and the classifier.
CHECKPOINT = "gpt2"  # transformer model checkpoint

# Use the first CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# Reuse the end-of-sequence token for padding and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

print(DEVICE)

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

cuda:0


In [7]:
# Export the cuBLAS workspace setting for this kernel, then read it back to
# confirm that the variable is visible to the process.
# NOTE(review): presumably required for torch.use_deterministic_algorithms(True)
# (enabled in the seeding cell) to work on CUDA; confirm against the PyTorch
# reproducibility notes.
%env CUBLAS_WORKSPACE_CONFIG = :4096:8
os.getenv('CUBLAS_WORKSPACE_CONFIG')

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


':4096:8'

In [8]:
# Seed the three global random number sources with 0.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# Deterministic-execution switches.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)

# Dedicated generators: `g` is passed to the DataLoaders, `rng` to Dataset.shuffle.
g = torch.Generator().manual_seed(0)
rng = np.random.default_rng(seed=0)

In [9]:
def seed_worker(worker_id):
    """Seed NumPy and Python's ``random`` from the current torch seed.

    Meant to be used as the ``worker_init_fn`` of a ``DataLoader``. The seed is
    read from ``torch.initial_seed()`` rather than hard-coded, so the NumPy and
    ``random`` streams follow whatever seed torch holds in the calling process
    instead of every worker sharing the constant 0.

    Args:
        worker_id: Index of the worker; unused, but required by the
            ``worker_init_fn`` callback signature.
    """
    # NumPy only accepts 32-bit seeds, so fold the torch seed into that range.
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [10]:
def tokenize_function(examples):
    """Tokenize the ``"prompt"`` entry of ``examples`` with the notebook tokenizer.

    Args:
        examples: Mapping holding the text to encode under the ``"prompt"`` key.

    Returns:
        The result of calling ``tokenizer`` on those prompts with truncation
        enabled.
    """
    prompts = examples["prompt"]
    encoded = tokenizer(prompts, truncation=True)
    return encoded

In [8]:
def load_train_data(fileName):
    """Build the training DataLoader from a JSON-lines file.

    The file is read with pandas, the ham/spam counts (``completion`` equal to
    0 / 1) are printed, and the rows are shuffled, tokenized, and wrapped in a
    ``DataLoader`` with batch size 1.

    Args:
        fileName: Path of the ``.jsonl`` file to load. Any value that is not a
            non-empty path (existing callers pass ``0``) selects the default
            FrenchRussian training file, which keeps old calls working.

    Returns:
        A ``DataLoader`` over the tokenized dataset, with ``completion``
        renamed to ``labels`` and the ``prompt`` column removed.
    """
    default_path = "/datasets/jsonlemails/FrenchRussianTrain_prepared.jsonl"
    # Honour the argument only when it is a usable path; otherwise keep the
    # historical hard-coded file.
    if isinstance(fileName, (str, os.PathLike)) and os.fspath(fileName):
        path = fileName
    else:
        path = default_path

    df = pd.read_json(path, lines=True)

    ham = int((df['completion'] == 0).sum())
    spam = int((df['completion'] == 1).sum())

    total = ham + spam
    print(f"Total = {total}")
    # Guard the ratios so a file without 0/1 labels does not raise.
    hamratio = ham / total if total else 0.0
    spamratio = spam / total if total else 0.0

    print(f"Ham:{ham}")
    print(f"Spam:{spam}")
    print(f"Ham Ration:{hamratio}")
    print(f"Spam Ration:{spamratio}")

    raw_datasets = Dataset.from_pandas(df)
    raw_datasets = raw_datasets.shuffle(generator=rng)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns("prompt")
    tokenized_datasets = tokenized_datasets.rename_column("completion", "labels")
    tokenized_datasets.set_format("torch")

    # Drop the pandas index column when Dataset.from_pandas carried it over.
    if "__index_level_0__" in tokenized_datasets.column_names:
        tokenized_datasets = tokenized_datasets.remove_columns("__index_level_0__")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainloader = DataLoader(
        tokenized_datasets,
        batch_size=1,
        collate_fn=data_collator,
        worker_init_fn=seed_worker,
        generator=g,
    )

    return trainloader

In [9]:
def load_test_data(fileName):
    """Build the evaluation DataLoader from a JSON-lines file.

    The file is read with pandas, the ham/spam counts (``completion`` equal to
    0 / 1) are printed, and the rows are shuffled, tokenized, and wrapped in a
    ``DataLoader`` with batch size 1.

    Args:
        fileName: Path of the ``.jsonl`` file to load. Any value that is not a
            non-empty path (existing callers pass ``0``) selects the default
            enron1 validation file, which keeps old calls working.

    Returns:
        A ``DataLoader`` over the tokenized dataset, with ``completion``
        renamed to ``labels`` and the ``prompt`` column removed.
    """
    default_path = "/datasets/jsonlemails/enron1_prepared_valid.jsonl"
    # Honour the argument only when it is a usable path; otherwise keep the
    # historical hard-coded file.
    if isinstance(fileName, (str, os.PathLike)) and os.fspath(fileName):
        path = fileName
    else:
        path = default_path

    df = pd.read_json(path, lines=True)

    ham = int((df['completion'] == 0).sum())
    spam = int((df['completion'] == 1).sum())

    total = ham + spam
    print(f"Total = {total}")
    # Guard the ratios so a file without 0/1 labels does not raise.
    hamratio = ham / total if total else 0.0
    spamratio = spam / total if total else 0.0

    print(f"Ham:{ham}")
    print(f"Spam:{spam}")
    print(f"Ham Ration:{hamratio}")
    print(f"Spam Ration:{spamratio}")

    raw_datasets = Dataset.from_pandas(df)
    raw_datasets = raw_datasets.shuffle(generator=rng)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns("prompt")
    tokenized_datasets = tokenized_datasets.rename_column("completion", "labels")
    tokenized_datasets.set_format("torch")

    # Drop the pandas index column when Dataset.from_pandas carried it over.
    if "__index_level_0__" in tokenized_datasets.column_names:
        tokenized_datasets = tokenized_datasets.remove_columns("__index_level_0__")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    testloader = DataLoader(
        tokenized_datasets,
        batch_size=1,
        collate_fn=data_collator,
        worker_init_fn=seed_worker,
        generator=g,
    )
    return testloader

In [10]:
def train(net, trainloader, epochs):
    """Fine-tune ``net`` on ``trainloader`` for ``epochs`` passes.

    Uses AdamW (learning rate 5e-5) with a linear schedule and no warmup. One
    line with the epoch, step duration, and loss is printed per step, followed
    by the average loss per optimisation step once training finishes.

    Args:
        net: Model whose forward pass accepts the batch as keyword arguments
            and returns an object exposing ``.loss``.
        trainloader: Iterable of batches (dicts of tensors) that supports
            ``len()``.
        epochs: Number of passes over ``trainloader``.
    """
    total_steps = len(trainloader) * epochs
    total_train_loss = 0.0
    optimizer = AdamW(net.parameters(), lr=5e-5)

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    progress_bar = tqdm(range(total_steps))

    net.train()
    for epoch in range(epochs):
        for batch in trainloader:
            start_time = datetime.now()
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = net(**batch)
            loss = outputs.loss
            # Read the scalar once and reuse it for the total and the log line.
            step_loss = loss.item()
            total_train_loss += step_loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            end_time = datetime.now()
            print("Epoch: " + str(epoch + 1) + "\tTime: " + str(end_time - start_time) + "\tLoss: " + str(step_loss))

    progress_bar.close()

    # The loss is summed over every step of every epoch, so average over the
    # total step count (dividing by len(trainloader) overstates it by `epochs`).
    avg_train_loss = total_train_loss / total_steps if total_steps else 0.0
    print("Average training loss: {0:.2f}".format(avg_train_loss))

In [11]:
def test(net, testloader):
    """Evaluate ``net`` on ``testloader`` with accuracy and per-class metrics.

    Predictions (argmax over the logits) and reference labels are collected
    once over the whole loader, then each metric is computed from those lists.
    Precision, recall, and F1 are each computed twice, with ``pos_label`` 0
    and 1.

    Args:
        net: Model whose forward pass accepts the batch as keyword arguments
            and returns an object exposing ``.logits``.
        testloader: Iterable of batches (dicts of tensors) that include a
            ``"labels"`` entry.

    Returns:
        Tuple ``(accuracy, precision0, precision1, recall0, recall1, f1_0,
        f1_1)``; each element is the dict returned by the matching metric's
        ``compute`` call, and the ``0``/``1`` suffix is the ``pos_label`` used.
    """
    accuracy_metric = evaluate.load("accuracy")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")

    all_predictions = []
    all_references = []

    net.eval()
    with torch.no_grad():
        for batch in testloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            logits = net(**batch).logits
            all_predictions.extend(torch.argmax(logits, dim=-1).tolist())
            all_references.extend(batch["labels"].tolist())

    def _score(metric, **kwargs):
        # Every metric is evaluated on the same collected predictions.
        return metric.compute(
            predictions=all_predictions, references=all_references, **kwargs
        )

    accuracy = _score(accuracy_metric)

    precision0 = _score(precision_metric, pos_label=0)
    precision1 = _score(precision_metric, pos_label=1)

    recall0 = _score(recall_metric, pos_label=0)
    recall1 = _score(recall_metric, pos_label=1)

    f1_0 = _score(f1_metric, pos_label=0)
    f1_1 = _score(f1_metric, pos_label=1)

    return accuracy, precision0, precision1, recall0, recall1, f1_0, f1_1

In [12]:
# Name the dataset files explicitly instead of passing a placeholder 0, so each
# call states which file it is meant to load.
trainloader = load_train_data("/datasets/jsonlemails/FrenchRussianTrain_prepared.jsonl")
testloader = load_test_data("/datasets/jsonlemails/enron1_prepared_valid.jsonl")


Total = 810
Ham:403
Spam:407
Ham Ration:0.49753086419753084
Spam Ration:0.5024691358024691


  0%|          | 0/1 [00:00<?, ?ba/s]

Total = 996
Ham:713
Spam:283
Ham Ration:0.7158634538152611
Spam Ration:0.28413654618473894


  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
# Load the checkpoint with a two-label sequence-classification head, move the
# model to DEVICE, and fine-tune it for two epochs.
# NOTE(review): the model config's pad token id is never set; this appears to
# rely on the loaders using batch_size=1. Confirm before raising the batch size.
net = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=2)
net = net.to(DEVICE)
train(net, trainloader, epochs=2)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1620 [00:00<?, ?it/s]

Epoch: 1	Time: 0:00:01.089461	Loss: 0.10557769238948822
Epoch: 1	Time: 0:00:00.235759	Loss: 0.09168857336044312
Epoch: 1	Time: 0:00:00.233342	Loss: 0.007470765151083469
Epoch: 1	Time: 0:00:00.222249	Loss: 0.013143331743776798
Epoch: 1	Time: 0:00:00.129212	Loss: 0.011142647825181484
Epoch: 1	Time: 0:00:00.145272	Loss: 0.26557084918022156
Epoch: 1	Time: 0:00:00.220997	Loss: 0.0036199060268700123
Epoch: 1	Time: 0:00:00.190202	Loss: 4.865379333496094
Epoch: 1	Time: 0:00:00.226547	Loss: 4.895878791809082
Epoch: 1	Time: 0:00:00.245702	Loss: 5.484696388244629
Epoch: 1	Time: 0:00:00.231937	Loss: 5.404824733734131
Epoch: 1	Time: 0:00:00.146477	Loss: 4.322720527648926
Epoch: 1	Time: 0:00:00.235830	Loss: 0.0036024453584104776
Epoch: 1	Time: 0:00:00.233916	Loss: 0.006379714701324701
Epoch: 1	Time: 0:00:00.150015	Loss: 0.005719131324440241
Epoch: 1	Time: 0:00:00.142430	Loss: 0.046454522758722305
Epoch: 1	Time: 0:00:00.234389	Loss: 0.030034109950065613
Epoch: 1	Time: 0:00:00.175126	Loss: 0.027911635

In [14]:
# Evaluate the fine-tuned model and unpack the metric dicts in the order that
# test() returns them.
(
    accuracy,
    precision0, precision1,
    recall0, recall1,
    f1_0, f1_1,
) = test(net, testloader)

# Build the report first, then print it in one go.
report = f"""Final test set performance:
    \n\t{accuracy}\n\t

    \n\t(Pos Label 0){precision0}
    \n\t(Pos Label 1){precision1}\n\t

    \n\t(Pos Label 0){recall0}
    \n\t(Pos Label 1){recall1}\n\t

    \n\t(Pos Label 0){f1_0}
    \n\t(Pos Label 1){f1_1}\n\t
    """
print(report)

Final test set performance:
    
	{'accuracy': 0.7238955823293173}
	

    
	(Pos Label 0){'precision': 0.7239263803680982}
    
	(Pos Label 1){'precision': 0.7222222222222222}
	

    
	(Pos Label 0){'recall': 0.9929873772791024}
    
	(Pos Label 1){'recall': 0.045936395759717315}
	

    
	(Pos Label 0){'f1': 0.8373743347131876}
    
	(Pos Label 1){'f1': 0.08637873754152824}
	
    


In [15]:
# List the dataset directory that the loader functions read their .jsonl files from.
!ls /datasets/jsonlemails

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
EnglishFrenchTraining_prepared.jsonl  RussianPrompts_prepared_train.jsonl
EnglishRussianTrain_prepared.jsonl    RussianPrompts_prepared_valid.jsonl
FrenchPrompts_prepared_train.jsonl    enron1_prepared_train.jsonl
FrenchPrompts_prepared_valid.jsonl    enron1_prepared_valid.jsonl
FrenchRussianTrain_prepared.jsonl
