In [1]:
import os
import sys
import torch

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sys.path.append('../../..')

from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from datasets import Dataset
from typing import Dict, List, Optional, Union
from dataclasses import dataclass
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, PreTrainedTokenizerFast, DataCollatorForLanguageModeling
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

from src.custom_training.model_utils import load_model

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report

2023-08-07 06:01:27.825455: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2023-08-07 06:01:29,848] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


## Load the model and data

In [2]:
# Checkpoint location handed to AutoModel.from_pretrained further down this cell.
model_dir = '/shared/3/projects/hiatus/models/pan20/roberta-test/last/'
# Alternative kept for reference: load the stock hub checkpoint instead.
# model_dir = 'roberta-base'
# Checkpoint name used only to fetch the tokenizer (passed to load_tokenizer).
pretrained_model = 'roberta-base'

def load_tokenizer(pretrained_model):
    """Fetch the tokenizer for ``pretrained_model`` and make sure it can pad.

    Args:
        pretrained_model: Name or path forwarded to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer. If it had no pad token, its EOS token is
        installed as the pad token before it is returned.
    """
    print(f"Loading in {pretrained_model} tokenizer")
    loaded = AutoTokenizer.from_pretrained(pretrained_model)

    # Nothing to patch when the tokenizer already defines a pad token.
    if loaded.pad_token is not None:
        return loaded

    # Padding batches requires a pad token; fall back to EOS.
    loaded.pad_token = loaded.eos_token
    return loaded

# Weights come from model_dir and are placed on the GPU in one step;
# the tokenizer is resolved separately from the base checkpoint name.
model = AutoModel.from_pretrained(model_dir).to('cuda')
tokenizer = load_tokenizer(pretrained_model)

Loading in roberta-base tokenizer


In [3]:
input_directory = '/shared/3/datasets/PAN/pan20-av-training-small/'

# Each split is a JSON-lines file named after the split, read in the order
# train, dev, test.
train, dev, test = (
    pd.read_json(f'{input_directory}{split}.jsonl', lines=True)
    for split in ('train', 'dev', 'test')
)
# Row counts of the three splits, space separated.
print(*map(len, (train, dev, test)))

42098 5263 5262


## Set up the dataset and loaders

In [4]:
@dataclass
class TextCollator(DataCollatorForLanguageModeling):
    """Collate text-pair records into two padded batches plus their labels.

    Every record is expected to expose ``record['pair']`` (indexed with 0 and
    1 for the two texts) and ``record['same']`` (the label). Calling the
    collator returns ``(batchA, batchB, labels)`` where the batches are the
    padded encodings of the first and second texts and ``labels`` is a plain
    Python list.
    """
    # Tokenizer used both to encode the raw text and to pad the batch.
    tokenizer: PreTrainedTokenizerFast
    # Forwarded as ``padding`` to ``tokenizer.pad``.
    padding: Union[bool, str] = True
    # Forwarded as ``return_attention_mask`` to ``tokenizer.pad``.
    return_attention_mask: Optional[bool] = True
    # Cap on token ids kept per text; also forwarded to ``tokenizer.pad``.
    max_length: Optional[int] = 350

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]):
        """Encode both sides of every pair, pad each side, and gather labels."""
        # Encode side 0 then side 1 before any padding takes place.
        encoded_sides = [self._encode_text(features, side) for side in (0, 1)]
        batchA, batchB = map(self._prepare_batch, encoded_sides)

        labels = [record['same'] for record in features]

        return batchA, batchB, labels

    def _encode_text(self, features, index):
        """Tokenize text ``index`` of every pair, keeping at most ``max_length`` ids."""
        encoded = []
        for record in features:
            token_ids = self.tokenizer(record['pair'][index])['input_ids']
            # Truncation is a plain slice applied after tokenization.
            # NOTE(review): for long texts this presumably cuts off any trailing
            # special token the tokenizer appended - confirm this matches how
            # the checkpoint was trained.
            encoded.append({'input_ids': token_ids[:self.max_length]})
        return encoded

    def _prepare_batch(self, sents):
        """Pad a list of ``{'input_ids': ...}`` dicts into PyTorch tensors."""
        pad_options = dict(
            padding=self.padding,
            max_length=self.max_length,
            return_attention_mask=self.return_attention_mask,
            return_tensors="pt",
        )
        return self.tokenizer.pad(sents, **pad_options)

# Fixed seed so the shuffled row order is identical on every run. The
# embeddings are written to disk in loader order, and the downstream random
# forest (itself seeded) bootstraps rows by position, so an unseeded shuffle
# makes the reported metrics non-reproducible.
SHUFFLE_SEED = 42
# Number of text pairs per forward pass; shared by both loaders.
BATCH_SIZE = 4

collator = TextCollator(tokenizer=tokenizer, max_length=350)
train_dataset = Dataset.from_pandas(train).shuffle(seed=SHUFFLE_SEED)
test_dataset = Dataset.from_pandas(test).shuffle(seed=SHUFFLE_SEED)

# The datasets are already shuffled above, so the loaders keep that order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collator
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collator
)

## Build the embeddings for the training and test data

In [5]:
# Directory the embedding files are written to (same location as the input data).
out_directory = '/shared/3/datasets/PAN/pan20-av-training-small/'
# Gzip-compressed parquet file holding the training-split embeddings.
train_file = os.path.join(out_directory, 'pan_small_roberta_train_embeddings_zero.parquet.gzip')

def get_cls(outputs):
    """Return the first-position vector of the last hidden layer.

    Args:
        outputs: Mapping-like model output whose ``'hidden_states'`` entry is
            a sequence of 3-D tensors, one per layer.

    Returns:
        The last entry of ``outputs['hidden_states']`` sliced at index 0 of
        its second axis, i.e. one vector per row of the batch.
    """
    # Last layer first, then position 0 along the sequence axis.
    return outputs['hidden_states'][-1][:, 0, :]

def extract_and_store_data(data_loader, model, device, parquet_file):
    """Embed both texts of every pair and write labels + embeddings to parquet.

    Each item yielded by ``data_loader`` is a ``(batchA, batchB, labels)``
    triple. Both batches are moved to ``device``, run through ``model`` and
    reduced to one vector per example with ``get_cls``. Every output row is
    ``[label] + embedding_A + embedding_B``.

    The resulting frame has the columns ``same``, ``A0..A{d-1}`` and
    ``B0..B{d-1}``, where ``d`` is the embedding width observed in the data,
    and is written gzip-compressed to ``parquet_file``.

    Args:
        data_loader: Sized iterable of ``(batchA, batchB, labels)``. The two
            batches must support ``.to(device)`` and ``**`` unpacking.
        model: Callable accepting the batch fields plus
            ``output_hidden_states=True``. It is not moved to ``device`` here.
        device: Device the batches are moved to before the forward pass.
        parquet_file: Destination path of the parquet file.
    """
    output_data = []

    # Pure inference: gradients are never used, so disable autograd instead of
    # building (and holding on to) a graph for every forward pass.
    with torch.no_grad():
        for batchA, batchB, labels in tqdm(data_loader, total=len(data_loader)):
            batchA, batchB = batchA.to(device), batchB.to(device)

            # One embedding per example for each side of the pair.
            cls_A = get_cls(model(**batchA, output_hidden_states=True)).tolist()
            cls_B = get_cls(model(**batchB, output_hidden_states=True)).tolist()

            # Row layout: label first, then side A, then side B.
            output_data.extend(
                [label] + sA + sB for label, sA, sB in zip(labels, cls_A, cls_B)
            )

    if output_data:
        # Derive the width from the data rather than assuming 768, so models
        # with a different hidden size work as well.
        hidden_size = (len(output_data[0]) - 1) // 2
    else:
        # Nothing was embedded: fall back to the model's declared width, or
        # to the previously hard-coded 768, so an empty frame is still written.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 768)

    cols_a = [f'A{i}' for i in range(hidden_size)]
    cols_b = [f'B{i}' for i in range(hidden_size)]
    df_columns = ['same'] + cols_a + cols_b

    df = pd.DataFrame(output_data, columns=df_columns)
    df.to_parquet(parquet_file, compression='gzip')


def process_training_data(force_rebuild=True):
    """Return the training embedding frame, rebuilding it or loading it from disk.

    Args:
        force_rebuild: When true (the default, which keeps the previous
            always-recompute behaviour) the embeddings are regenerated even if
            ``train_file`` already exists. Pass ``False`` to reuse an existing
            file and skip the forward passes.

    Returns:
        The frame read back from ``train_file``.
    """
    # NOTE: the cache path does not encode which model produced it, so reuse
    # is only safe when the model has not changed since the file was written.
    if not force_rebuild and os.path.isfile(train_file):
        print("Loading from file")
        return pd.read_parquet(train_file)

    device = 'cuda'
    extract_and_store_data(train_dataloader, model, device, train_file)
    # Read back from disk so both paths return the same on-disk representation.
    return pd.read_parquet(train_file)

# Training embedding frame: a 'same' label column followed by the A*/B* feature columns.
train_style = process_training_data()

  0%|                                                                                                                                            | 0/10525 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5093 > 512). Running this sequence through the model will result in indexing errors
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10525/10525 [30:44<00:00,  5.71it/s]


**Repeat for the testing data**

In [6]:
# Directory the embedding files are written to (same location as the input data).
out_directory = '/shared/3/datasets/PAN/pan20-av-training-small/'
# Gzip-compressed parquet file holding the test-split embeddings.
test_file = os.path.join(out_directory, 'pan_small_roberta_test_embeddings_zero.parquet.gzip')

def process_testing_data(force_rebuild=True):
    """Return the test embedding frame, rebuilding it or loading it from disk.

    Args:
        force_rebuild: When true (the default, which keeps the previous
            always-recompute behaviour) the embeddings are regenerated even if
            ``test_file`` already exists. Pass ``False`` to reuse an existing
            file and skip the forward passes.

    Returns:
        The frame read back from ``test_file``.
    """
    # NOTE: the cache path does not encode which model produced it, so reuse
    # is only safe when the model has not changed since the file was written.
    if not force_rebuild and os.path.isfile(test_file):
        print("Loading from file")
        return pd.read_parquet(test_file)

    print("Building test file")
    device = 'cuda'
    extract_and_store_data(test_dataloader, model, device, test_file)
    # Read back from disk so both paths return the same on-disk representation.
    return pd.read_parquet(test_file)

# Test embedding frame: a 'same' label column followed by the A*/B* feature columns.
test_style = process_testing_data()

Building test file


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1316/1316 [04:03<00:00,  5.40it/s]


## Train a Random Forest model and run evaluation

In [7]:
# Target is the 'same' label; every remaining column is an embedding feature.
X_train, y_train = train_style.drop(columns='same'), train_style['same']
X_test, y_test = test_style.drop(columns='same'), test_style['same']

# Baseline forest with 100 trees; the fixed seed keeps the fit repeatable.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6742683390345876
              precision    recall  f1-score   support

       False       0.64      0.71      0.67      2498
        True       0.71      0.64      0.67      2764

    accuracy                           0.67      5262
   macro avg       0.68      0.68      0.67      5262
weighted avg       0.68      0.67      0.67      5262



In [8]:
# Target is the 'same' label; every remaining column is an embedding feature.
X_train, y_train = train_style.drop(columns='same'), train_style['same']
X_test, y_test = test_style.drop(columns='same'), test_style['same']

# Forest with 200 trees; the fixed seed keeps the fit repeatable.
clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7673888255416191
              precision    recall  f1-score   support

       False       0.79      0.70      0.74      2498
        True       0.75      0.83      0.79      2764

    accuracy                           0.77      5262
   macro avg       0.77      0.76      0.76      5262
weighted avg       0.77      0.77      0.77      5262



In [9]:
# Target is the 'same' label; every remaining column is an embedding feature.
X_train, y_train = train_style.drop(columns='same'), train_style['same']
X_test, y_test = test_style.drop(columns='same'), test_style['same']

# Forest with 500 trees; the fixed seed keeps the fit repeatable.
clf = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7803116685670848
              precision    recall  f1-score   support

       False       0.81      0.71      0.75      2498
        True       0.76      0.85      0.80      2764

    accuracy                           0.78      5262
   macro avg       0.78      0.78      0.78      5262
weighted avg       0.78      0.78      0.78      5262



In [10]:
# Target is the 'same' label; every remaining column is an embedding feature.
X_train, y_train = train_style.drop(columns='same'), train_style['same']
X_test, y_test = test_style.drop(columns='same'), test_style['same']

# Forest with 1000 trees; the fixed seed keeps the fit repeatable.
clf = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7848726719878373
              precision    recall  f1-score   support

       False       0.82      0.70      0.76      2498
        True       0.76      0.86      0.81      2764

    accuracy                           0.78      5262
   macro avg       0.79      0.78      0.78      5262
weighted avg       0.79      0.78      0.78      5262

