In [1]:
import pandas as pd
import numpy as np



In [2]:
from pynvml import *
from IPython.display import display, HTML

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()
print_gpu_utilization()

GPU memory occupied: 920 MB.


### loading the data
we will store the data in their raw formats using huggingface's datasets library to load the data as it makes porting to NCC easier 

In [3]:
from datasets import load_dataset
import os
blogs_raw = load_dataset('blog_authorship_corpus', cache_dir=os.getcwd()+ '/Datasets')
scientific_raw = load_dataset('scientific_papers', 'arxiv', trust_remote_code=True, cache_dir=os.getcwd() + '/Datasets')
journalistic_raw = load_dataset('newsroom', trust_remote_code=True, data_dir= 'Datasets/newsroom/release/')
narrative_raw = load_dataset('roneneldan/tinystories', cache_dir=os.getcwd()+ '/Datasets')


Repo card metadata block was not found. Setting CardData to empty.


### exploring the data
we will make a datasets list that we can itterate through and then the plan is to work with this to 

In [4]:
datasets = [('blogs', blogs_raw['train']['text']), ('sci papers', scientific_raw['train']['abstract']), ('journal papers', journalistic_raw['train']['text']), ('stories', narrative_raw['train']['text'])]
import numpy as np

# Generate random indices
num_samples = 5

# Select samples by random indices


for df in datasets:
    print(f'head of {df[0]}')
    #print(df[1])
    random_indices = np.random.randint(0, len(df[0]), num_samples)
    for sample in random_indices:
        print(df[1][sample])
        print('\n')

    
'''head of blogs
Dataset({
    features: ['text', 'date', 'gender', 'age', 'horoscope', 'job'],
    num_rows: 689793
})
head of sci papers
Dataset({
    features: ['article', 'abstract', 'section_names'],
    num_rows: 203037
})
head of journal papers
Dataset({
    features: ['text', 'summary', 'title', 'url', 'date', 'density_bin', 'coverage_bin', 'compression_bin', 'density', 'coverage', 'compression'],
    num_rows: 995041
})
head of stories
Dataset({
    features: ['text'],
    num_rows: 2119719
})
stories is the largest. with 2 million entries
'''

head of blogs
Yeah, so today was ok, late arrival. I'm not in the mood to write much, so..I probably will end up writing a ton. I always end up doing that when I say I don't want to write much. Nothing interesting happened in any of my classes, and I only talked with a couple of good people including, "You know who"...except, you don't know who, "You know who," is. Oh well, no one reads this thing anyway. I made it to the top of the rock climbing wall today. Kelly and I intend to conquer the whole wall. It'll be fun. Jazz was good fun as always...heh, and that's really it. Have a lovely evening everyone, and I am off to sleepyland. (I actually didn't write a lot. I'm shocked.)  *Ya di Amore*


Yeah, so today was ok, late arrival. I'm not in the mood to write much, so..I probably will end up writing a ton. I always end up doing that when I say I don't want to write much. Nothing interesting happened in any of my classes, and I only talked with a couple of good people including, "You kno

"head of blogs\nDataset({\n    features: ['text', 'date', 'gender', 'age', 'horoscope', 'job'],\n    num_rows: 689793\n})\nhead of sci papers\nDataset({\n    features: ['article', 'abstract', 'section_names'],\n    num_rows: 203037\n})\nhead of journal papers\nDataset({\n    features: ['text', 'summary', 'title', 'url', 'date', 'density_bin', 'coverage_bin', 'compression_bin', 'density', 'coverage', 'compression'],\n    num_rows: 995041\n})\nhead of stories\nDataset({\n    features: ['text'],\n    num_rows: 2119719\n})\nstories is the largest. with 2 million entries\n"

### looking at the respective data
if we look at the data samples above we can see that the blog posts are punctuated a lot more, which is probably due to the nature of it being more social media related data. we may need to remove this but let's see. this is in the form of ... and collocations.

the scientific data contains latex so we will need to remove this 
the scientific data also contains large sections of numbers. we will need to remove these as well as references n ,  g. ; bluem ,  h. ; todd ,  a. m.  m. the new ir and thz fel facility at the fritz haber institute in berlin .\nspie _ * 2015 * , _ 9512 _ , 95121l paarmann ,  a. ; razdolski ,  i. ; melnikov ,  a. ; gewinner ,  s. ; schllkopf ,  w. ; wolf ,  m. second harmonic generation spectroscopy in the reststrahl band of sic using an infrared free - electron laser .\nlett . _ * 2015 * , _ 107 _ , 081101 ,  a. ; razdolski ,  i. ; gewinner ,

potentially using the abstract may be a better idea.

journalistic data contains a lot of facts and numbers, this may have to be dealt with. as well as quotes as well as statistical sections which we need to remove.
also uses a lot of name data which isn't really needed and can be tagged as name data potentially

short stories are quite simplistic and use language that is childish in nature. we may want to change our data to reflect more serious story writing. 



In [5]:
#let's look at the ngrams and collocations of the data.
from nltk.tokenize import word_tokenize
import string
import nltk
from nltk.util import ngrams
from collections import Counter
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
translator = str.maketrans('','', string.punctuation)
for dataset in datasets:
    texts = dataset[1][:10000]  # Work with a subset for efficiency
    tokenized_texts = [word_tokenize(text.lower().translate(translator)) for text in texts]
    # Example for bigrams
    bigrams = [list(ngrams(tokens, 2)) for tokens in tokenized_texts]

    # Example for trigrams
    trigrams = [list(ngrams(tokens, 3)) for tokens in tokenized_texts]
    bigram_counts = Counter([bigram for text in bigrams for bigram in text])
    print(f'brigam most common for {dataset[0]}:\n{bigram_counts.most_common(10)}' )  # Print the 10 most common bigrams

    # Do the same for trigrams
    trigram_counts = Counter([trigram for text in trigrams for trigram in text])
    print(f'trigam most common for {dataset[0]}:\n{trigram_counts.most_common(10)}')  # Print the 10 most common trigrams
    
    # Combine all tokens into a single list
    all_tokens = [token for tokens in tokenized_texts for token in tokens]

    # Find collocations
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(all_tokens)
    finder.apply_freq_filter(5)  # Only consider bigrams that occur at least 5 times
    collocations = finder.nbest(bigram_measures.pmi, 10)  # Top 10 collocations by PMI
    print(f'collocations are {collocations}')


brigam most common for blogs:
[(('of', 'the'), 5801), (('in', 'the'), 5745), (('and', 'i'), 4880), (('i', 'was'), 3660), (('to', 'be'), 3587), (('it', 'was'), 3576), (('i', 'have'), 3528), (('to', 'the'), 3397), (('on', 'the'), 3263), (('that', 'i'), 3056)]
trigam most common for blogs:
[(('a', 'lot', 'of'), 746), (('i', 'have', 'to'), 704), (('i', 'dont', 'know'), 683), (('im', 'going', 'to'), 672), (('i', 'want', 'to'), 608), (('i', 'need', 'to'), 518), (('i', 'think', 'i'), 489), (('one', 'of', 'the'), 479), (('i', 'went', 'to'), 476), (('i', 'have', 'a'), 452)]
collocations are [('durwood', 'busse'), ('justine', 'bateman'), ('pav', 'tav'), ('1220pm', 'pols2606'), ('dorje', 'phamo'), ('bona', 'fide'), ('feliz', 'navidad'), ('paya', 'lebar'), ('dag', 'nabbit'), ('french2306', 'textes')]
brigam most common for sci papers:
[(('of', 'the'), 32974), (('in', 'the'), 16597), (('to', 'the'), 9232), (('for', 'the'), 7135), (('that', 'the'), 6411), (('on', 'the'), 6363), (('and', 'the'), 6148

### results and inference

#### Blogs
we can see that the blogs all use similar language that is quite casual and uses a lot of personal reference (i)

#### scientific
we can see that the papers we can see the language is quite formal and has no reference to self so it is non personal and formal. most of the collocations are based around references. we should try to remove these and see if more come along

#### journals

similar with the data being formal mosstly but special note of mentions of the US in the data. this means the data is mostly speaking about america which could mean the data is americanised. this could change the speach as americans could speak differently to english journals.

#### stories
once upon a time and forms of this are really common. this makes me worry hte text data is quite chidlish so we may need to change the stories data to a larger dataset. maybe look for a specific writer.



In [6]:
#preprocess function 
#remove the latex the is in the documents, 


In [7]:
from datasets import concatenate_datasets, Dataset
min_size = min([len(df) for _, df in datasets])
np.random.seed(42)

def add_source_label(example, label):
    example['source_label'] = label
    return example
new_dataset = {'text':[], 
                'label':[]}
for i in range(len(datasets)):
    random_indices = np.random.randint(0, len(datasets[i][1]), min_size)
    new_dataset['text'] +=  datasets[i][1][:min_size]
    new_dataset['label'] += list(np.full(min_size, i, dtype=int))
    
from datasets import DatasetDict

combined_dataset = Dataset.from_dict(new_dataset)
train_test = combined_dataset.train_test_split(test_size=0.2)  # 80% train, 20% test
train_val = train_test['train'].train_test_split(test_size=0.15)  # Split remaining train into 80% train, 20% validation

# Assemble everything into a single DatasetDict
splits = DatasetDict({
    'train': train_val['train'],
    'test': train_test['test'],
    'validation': train_val['test']
})





print(splits)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 552260
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 162430
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 97458
    })
})


In [8]:
print(splits['train'][0])

{'text': 'urlLink    A picture of the kitchen I work in. That\'s Dave the sous chef there on the right, and in the distance are Frakes and Nicole, the meat cooks. This is my home away from home.&nbsp; urlLink      urlLink    This is my deranged station partner, Jonathan, posing in what we refer to as our "office." It\'s actually one of the downstairs walk-ins, but we go in there to talk because it\'s nice and cold.&nbsp; urlLink     It\'s an off day for the Sox, so I\'m off topic.  Work was boring tonight, but I love what I do.  We got in fresh abalone today - it was the first time I\'ve gotten an opportunity to work with it, and it was pretty neat.  Abalone are gastropod mollusks, and they come in still alive so that when you touch the abductor muscle it moves and ripples underneath your fingers.  They\'re sort of a bitch to clean, and I kept popping open their shit veins, but it was still exciting to play with something new.  That\'s the really cool thing about working the fish stati

In [9]:
splits['train'].save_to_disk(os.getcwd() + '/Datasets/combined_data/train')
splits['test'].save_to_disk(os.getcwd() + '/Datasets/combined_data/test')
splits['validation'].save_to_disk(os.getcwd() + '/Datasets/combined_data/validation')



Saving the dataset (0/2 shards):   0%|          | 0/552260 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/162430 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/97458 [00:00<?, ? examples/s]

In [1]:

import re
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", use_fast=True)


def remove_latex_markup(text):
    # Remove inline math wrapped in $...$
    text = re.sub(r'\$.*?\$', '', text)
    
    # Remove display math wrapped in \[...\]
    text = re.sub(r'\\\[.*?\\\]', '', text)
    
    # Remove simple LaTeX commands like \command{arg}
    text = re.sub(r'\\[a-zA-Z]+\{.*?\}', '', text)
    
    # Remove custom @xmath and @xcite commands from the provided example
    text = re.sub(r'@\w+', '', text)
    
    # Remove remaining braces after previous replacements
    text = re.sub(r'[\{\}\[\]]', '', text)
    
    return text

def clean_text(text):
    """Apply text cleaning and normalization steps."""
    # Normalize excessive punctuation, remove numbers, and LaTeX
    text = re.sub(r"\.\.\.+", ".", text)
    text = re.sub(r"\d+(\.\d+)?", "", text)
    text = remove_latex_markup(text)
    # Further custom cleaning steps can be added here
    return text

def preprocess_and_tokenize(batch):
    """Clean and tokenize texts using the BERT tokenizer."""
    # Apply custom text cleaning
    batch["text"] = [clean_text(text) for text in batch["text"]]
    tokenized_inputs = tokenizer(batch["text"], padding=True, max_length=512, truncation=True, return_tensors="pt")
    batch.update(tokenized_inputs)
    return batch


from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


KeyboardInterrupt: 

In [None]:
encoded_train = splits["train"].map(preprocess_and_tokenize, batched=True)
encoded_eval = splits['validation'].map(preprocess_and_tokenize, batched=True)
encoded_test = splits["test"].map(preprocess_and_tokenize, batched=True)

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

id2label = {0: "BLOG", 1: "SCIENTIFIC", 2:"JOURNALISTIC", 3:"NARRATIVE"}
label2id = {"BLOG": 0, "SCIENTIFIC": 1, "JOURNALISTIC": 2, "NARRATIVE":3}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=4, id2label=id2label, label2id=label2id
)
print(type(model))

training_args = TrainingArguments(
    output_dir="DistilBert_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

splits["train"] = splits["train"].map(preprocess_and_tokenize, batched=True)
splits["validation"] = splits["validation"].map(preprocess_and_tokenize, batched=True)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()
!pip install ray[tune]

In [None]:
train_dataset = encoded_dataset["train"].shard(index=1, num_shards=10) 
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()