In [6]:
import torch 
import numpy as np
import pandas as pd

# from nltk.tokenize import sent_tokenize 

from pathlib import Path 
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
import re
from transformers import Trainer, TrainingArguments, AdamW


device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'sshleifer/tiny-gpt2'


In [7]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Set the seed
set_seed(1786)

In [8]:
torch.cuda.empty_cache()

In [9]:
df=pd.read_excel('Winter23_logs.xlsx').drop(0)
df['Outcome']=df['Outcome'].str.lower()
df1=df['Outcome']
df1=df1.dropna()
df1.to_csv("outcome.txt", header=False, index=False, sep='\n')
df1.head()

10                                                  yes
11                                            re-enable
14    whoever pressed the button hightailed it out o...
15                                   it's a class - yes
20       co-pilot lost track of time doing install work
Name: Outcome, dtype: object

In [None]:
text = Path(base_path, "outcome.txt"]).read_text(encoding='utf-8')
text = text.replace('-', '')
sentences = sent_tokenize(text)

In [112]:
tkenizer= AutoTokenizer.from_pretrained(model_name)
tkenizer.pad_token = tkenizer.eos_token
max_sentence_length = np.max([len(d) for d in lines])
tkenized = tkenizer(lines, padding = 'max_length', truncation = True, max_length = max_sentence_length)
data = tkenized['input_ids']
data_collator = DataCollatorWithPadding(tokenizer=data.tkenizer)

AttributeError: 'list' object has no attribute 'tkenizer'

In [101]:
class LanguageModelingDataset(Dataset):
    
    def __init__(self, ds_choice="outcome", split="train", truncation=-1):
        
        self.tokenizer= AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        with open("outcome.txt", 'r', encoding = ('utf-8')) as f:
            sentences = [self.remove_punctuation_and_dashes(line.strip()) for line in f.readlines()]
        
        
        # Count some items
        self.max_sentence_length = np.max([len(d) for d in sentences])
        self.data = self.tokenizer(sentences, padding = 'max_length', truncation = True, max_length = self.max_sentence_length) # List of 1-d pytorch tensor

    def remove_punctuation_and_dashes(self,line):
        return ''.join(char for char in line if char.isalnum() or char.isspace())

    def __len__(self):
        return len(self.data)

    def get_vocab_size(self):
        """
        We have to set this to the max vocab size (i.e., that decided by the BPE tokenizer), 
        but actually, only a small number of vocab is used, especially for the small text. 
        """
        return 50257

    def __getitem__(self, idx):
        """
        The output should be a tuple x and y, both as pytorch tensors.
        Please refer to the `run()` method in the mingpt/trainer.py script for 
        how the x and y are going to be used.
        """
        x = self.data[idx][:-1]
        y = self.data[idx][1:]
        return (x, y)

    def get_block_size(self):
        """
        block_size is the size at which lines are truncated to ensure they are equal-length.
        """
        return self.max_sentence_length
    

In [10]:
class LanguageModelingDataset(Dataset):
    def __init__(self,  tokenizer) -> None:
        super().__init__()

        
        with open("outcome.txt", 'r', encoding = ('utf-8')) as f:
            self.sentences = [self.remove_punctuation_and_dashes(line.strip()) for line in f.readlines()]
        self.tokenizer = tokenizer
        self.max_sentence_length = np.max([len(d) for d in self.sentences])
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def remove_punctuation_and_dashes(self,line):
        return ''.join(char for char in line if char.isalnum() or char.isspace())
    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        
        # Tokenize the text using your custom tokenizer
        tokenized = self.tokenizer(self.sentences, padding = 'max_length', truncation = True, max_length = self.max_sentence_length)

        return {'input_ids': tokenized['input_ids'],'attention_mask': tokenized['attention_mask']}

In [11]:
data_full =LanguageModelingDataset(tokenizer = AutoTokenizer.from_pretrained("gpt2"))

In [12]:
# Create the data collator
data_collator = DataCollatorWithPadding(tokenizer=data_full.tokenizer)

In [13]:
 #!pip install evaluate

In [14]:
import evaluate
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
train_data, val_data = train_test_split(data_full, test_size=0.2)

In [114]:
# Instantiate the Training Dataset
train_dataset = LanguageModelingDataset(ds_choice="outcome", split="train")  

# Instantiate a Validation Dataset (this is only really needed for the fine-tune task, not the LM task)
val_dataset = LanguageModelingDataset(ds_choice="outcome", split="validation")

TypeError: LanguageModelingDataset.__init__() got an unexpected keyword argument 'ds_choice'

In [57]:
def lm_collate_fn(batch, device):
    x = [item[0] for item in batch]  # List (len B) of varying lengths
    y = [item[1] for item in batch]  # List (len B) of the same lengths as x
    maxlen = max([len(s) for s in x])

    padded_x, padded_y = [], []
    for sx, sy in zip(x, y):
        padded_x.append(torch.cat([sx, torch.ones(maxlen - len(sx))]))
        padded_y.append(torch.cat([sy, torch.ones(maxlen - len(sy))]))
    return torch.stack(padded_x).long().to(device), torch.stack(padded_y).long().to(device)


In [104]:
# Print out an example of the data - this is processed more once it reaches lm_collate_fn (above)
x,y = train_dataset[5]
print(x, y)


TypeError: 'tokenizers.Encoding' object is not subscriptable

In [None]:
print("X: ",train_dataset.tokenizer.decode(x))
print("Y: ",train_dataset.tokenizer.decode(y))

In [55]:
class LanguageModelingDataset(Dataset):
    
    def __init__(self, ds_choice="outcome", split="train", truncation=-1):
        
        base_path = "./"
        fn = {"outcome": "outcome.txt"}
        self.ds_choice = ds_choice
        self.truncation = truncation  # int. If -1, then
        text = Path(base_path, fn[ds_choice]).read_text(encoding='utf-8')
        text = text.replace('-', '')
        sentences = sent_tokenize(text)

        # Train / test split
        train, val = train_test_split(sentences, test_size=0.2, shuffle=False)
        if split == "train":
            raw_data = train 
        else:
            raw_data = val 

        # Tokenize
        self.tokenizer = BPETokenizer()
        self.data = []  # List of 1-d pytorch tensor
        for sent in raw_data:
            tokenized = self.tokenizer(sent).view(-1)  # pytorch tensor
            if truncation >= 0:
                self.data.append(tokenized[:truncation])
            else:
                self.data.append(tokenized)

        # Count some items
        self.max_sentence_length = np.max([len(d) for d in self.data])

    def __len__(self):
        return len(self.data)

    def get_vocab_size(self):
        """
        We have to set this to the max vocab size (i.e., that decided by the BPE tokenizer), 
        but actually, only a small number of vocab is used, especially for the small text. 
        """
        return 50257

    def __getitem__(self, idx):
        """+
        
        
        The output should be a tuple x and y, both as pytorch tensors.
        Please refer to the `run()` method in the mingpt/trainer.py script for 
        how the x and y are going to be used.
        """
        x = self.data[idx][:-1]
        y = self.data[idx][1:]
        return (x, y)

    def get_block_size(self):
        """
        block_size is the size at which lines are truncated to ensure they are equal-length.
        """
        return self.max_sentence_length
    

In [59]:
x1,y1 = train_dataset[9]
print(x1, y1)
print("X: ",train_dataset.tokenizer.decode(x1))
print("Y: ",train_dataset.tokenizer.decode(y1))

tensor([1631,  510,  852,  220, 1877,   11,  475,  407, 2636, 2644,  198,  220,
         220,  220,  220,  220,  220,  220,  220,  220,  220,  220,  220,  220,
         220,  220,  220,  220,  220,  220,  220,  220,  645, 6589, 3159,  287,
        2119]) tensor([ 510,  852,  220, 1877,   11,  475,  407, 2636, 2644,  198,  220,  220,
         220,  220,  220,  220,  220,  220,  220,  220,  220,  220,  220,  220,
         220,  220,  220,  220,  220,  220,  220,  645, 6589, 3159,  287, 2119,
          13])
X:  ended up being  low, but not dead ...
                      no installed screen in room
Y:   up being  low, but not dead ...
                      no installed screen in room.


In [60]:
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model_config.n_classification_class = 2
model = GPT(model_config)

number of parameters: 2.56M


In [61]:
# Create a Trainer object and set the core hyper-parameters
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 3000  # For small corpus: 3000 iterations is plenty. For large corpus: 100000 iterations is needed
train_config.num_workers = 0
train_config.batch_size = 4    # For small corpus, batch size of 4 is fine.  For large corpus use 16
trainer = Trainer(train_config, model, train_dataset, val_dataset, collate_fn=lm_collate_fn)

running on device cuda


In [62]:
# This function is called at the end of every batch in training
# and is used to report the amount of time per 100 batches, and the loss at that point

def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

# Train!
trainer.run()

iter_dt 0.00ms; iter 0: train loss 10.87948


OutOfMemoryError: CUDA out of memory. Tried to allocate 656.00 MiB (GPU 0; 4.00 GiB total capacity; 1.13 GiB already allocated; 490.50 MiB free; 1.99 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF