In [37]:
!pip install evaluate



In [60]:
!pip install accelerate -U

Collecting accelerate
  Obtaining dependency information for accelerate from https://files.pythonhosted.org/packages/13/9e/ee987874058f2d93006961f6ff49e0bcb60ab9c26709ebe06bfa8707a4d8/accelerate-0.24.1-py3-none-any.whl.metadata
  Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
   ---------------------------------------- 0.0/261.4 kB ? eta -:--:--
   - -------------------------------------- 10.2/261.4 kB ? eta -:--:--
   --------- ------------------------------ 61.4/261.4 kB 1.1 MB/s eta 0:00:01
   ------------------------------------- -- 245.8/261.4 kB 2.5 MB/s eta 0:00:01
   ---------------------------------------- 261.4/261.4 kB 2.3 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [48]:
import torch 
import numpy as np
import pandas as pd

# from nltk.tokenize import sent_tokenize 

from pathlib import Path 
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer,TrainingArguments, Trainer, DataCollatorWithPadding
import re

# Pick the CUDA device when PyTorch reports one, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any later cell visible here.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): `model_name` is not used by the visible cells; the tokenizer and
# the model are both loaded from the literal "gpt2" instead. Confirm which
# checkpoint is intended.
model_name = 'sshleifer/tiny-gpt2'

In [49]:
# Presumably releases GPU memory cached by earlier runs in this kernel before
# the model is loaded - confirm against the PyTorch documentation.
torch.cuda.empty_cache()

In [50]:
# Load the cleaned call log; the path is relative to the notebook's working
# directory. The displayed frame carries an 'Unnamed: 0' column alongside the
# room, description and the two 0/1 outcome columns.
df=pd.read_csv('cleaned_winter_data.csv')
df.head()

Unnamed: 0,Received from:,Description of problem,Requires further investigation to resolve?,Sent staff to room?
0,OI 8170,jr. station enable request from kyle,0,0
1,BA 1180,wanted to know if his log in would work. it did.,0,0
2,UC 85,log in request,0,0
3,ES 4001,mike showing prof how intercom works,0,0
4,BL 313,prof testing powerpoint for next week,0,0


In [51]:
# Keep only what the classifier needs: the free-text description and the
# "Sent staff to room?" target. `df` itself is left untouched.
dataset = df.drop(
    columns=['Received from:', 'Requires further investigation to resolve?']
)
dataset.head()

Unnamed: 0,Description of problem,Sent staff to room?
0,jr. station enable request from kyle,0
1,wanted to know if his log in would work. it did.,0
2,log in request,0
3,mike showing prof how intercom works,0
4,prof testing powerpoint for next week,0


In [52]:
import string
# Characters stripped by remove_punctuation_and_quotes. string.punctuation
# already contains both the single and the double quote.
_PUNCTUATION = frozenset(string.punctuation)


def remove_punctuation_and_quotes(text):
    """Lower-case ``text`` and drop every ASCII punctuation character.

    Args:
        text: The string to clean. ``None`` and float NaN (how pandas
            represents an empty cell) are treated as an empty string instead
            of raising ``TypeError``.

    Returns:
        The cleaned string: each remaining character lower-cased, whitespace
        and non-punctuation characters preserved in order.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return ''.join(char.lower() for char in text if char not in _PUNCTUATION)
# Normalise the free-text column (lower-case, punctuation removed).
dataset['Description of problem'] = dataset['Description of problem'].apply(remove_punctuation_and_quotes)

# Convert the target to integer 0/1 labels. Mapping with the string keys
# {'1': 1, '0': 0} turns every value into NaN when read_csv has already parsed
# the column as integers, and does the same on a second run of this cell.
# pd.to_numeric accepts both the string and the numeric form, so the cell is
# safe to re-run.
staff_sent = pd.to_numeric(dataset['Sent staff to room?'], errors='coerce')
if not staff_sent.isin([0, 1]).all():
    raise ValueError("'Sent staff to room?' must contain only 0/1 values")
dataset['Sent staff to room?'] = staff_sent.astype('int64')

In [53]:
#This is the custom dataset
class ClassificationDataset(Dataset):
    """Map-style dataset pairing each problem description with its 0/1 label.

    Args:
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask``. As a side effect its ``pad_token`` is set
            to its ``eos_token``.
        data: Optional pandas DataFrame holding the ``TEXT_COLUMN`` and
            ``LABEL_COLUMN`` columns. When omitted, the notebook-level
            ``dataset`` frame is used, so existing calls keep working.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (as returned
    by the tokenizer, padded/truncated to ``max_sentence_length``) and
    ``labels`` (a scalar ``torch.long`` tensor).
    """

    TEXT_COLUMN = 'Description of problem'
    LABEL_COLUMN = 'Sent staff to room?'

    def __init__(self, tokenizer, data=None) -> None:
        super().__init__()

        # Only fall back to the global frame when no data was injected.
        self.dataset = dataset if data is None else data
        self.tokenizer = tokenizer

        # Length of the longest description in *characters* (not tokens); it is
        # used as the padded sequence length. `default=0` keeps an empty frame
        # from raising.
        longest = max((len(d) for d in self.dataset[self.TEXT_COLUMN]), default=0)
        # Never pad beyond what the tokenizer advertises as the model limit.
        limit = getattr(tokenizer, 'model_max_length', None)
        if isinstance(limit, int) and 0 < limit < longest:
            longest = limit
        self.max_sentence_length = int(longest)

        self.tokenizer.pad_token = self.tokenizer.eos_token

    def __len__(self):
        return len(self.dataset[self.TEXT_COLUMN])

    def __getitem__(self, index):
        # Positional access: label-based `series[index]` breaks as soon as the
        # frame's index is not exactly 0..n-1 (e.g. after filtering rows).
        text = self.dataset[self.TEXT_COLUMN].iloc[index]
        label = self.dataset[self.LABEL_COLUMN].iloc[index]

        tokenized = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_sentence_length,
        )

        return {
            'input_ids': tokenized['input_ids'],
            # int() fails loudly on a NaN label instead of silently producing a
            # float tensor.
            'labels': torch.tensor(int(label), dtype=torch.long),
            'attention_mask': tokenized['attention_mask'],
        }

In [54]:
# Wrap the notebook-level `dataset` frame in ClassificationDataset, tokenizing
# with the tokenizer loaded from the "gpt2" checkpoint.
data_full = ClassificationDataset(tokenizer = AutoTokenizer.from_pretrained("gpt2"))

In [55]:
# Create the data collator, reusing the tokenizer instance held by `data_full`
# (ClassificationDataset has already set its pad token to the EOS token).
data_collator = DataCollatorWithPadding(tokenizer=data_full.tokenizer)

In [56]:
import evaluate
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded accuracy metric.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each example is the position of its largest logit along the last axis;
    the result of ``metric.compute`` is returned unchanged.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [57]:
# Hold out 20% of the examples for validation. The fixed random_state makes the
# shuffle, and therefore the reported metrics, reproducible across
# Restart & Run All.
train_data, val_data = train_test_split(data_full, test_size=0.2, random_state=42)

In [58]:
# Load the pre-trained "gpt2" checkpoint with a sequence-classification head.
# The recorded output reports that `score.weight` is newly initialized, i.e.
# the head has to be trained before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained("gpt2")
# Use the EOS token id as the padding id, matching the tokenizer, whose pad
# token ClassificationDataset sets to its EOS token.
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
!pip install transformers[torch]



In [63]:
# Create the training arguments: 5 epochs, batches of 4 per device for both
# training and evaluation, checkpoints/logs under ./results, logging every 100
# steps and an evaluation pass at the end of every epoch.
# NOTE(review): warmup_steps=500 may exceed the total number of optimizer steps
# if the dataset is small - confirm against the dataset size.
# NOTE(review): the recorded output for this cell is an ImportError asking for
# accelerate>=0.20.1 even though an earlier cell installed 0.24.1; presumably
# the kernel was not restarted after the install - confirm.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    logging_steps=100,
    evaluation_strategy="epoch"
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
# Build the Trainer that will fine-tune the model: it ties together the model,
# the training arguments, the tokenizer/collator used for batching, the
# train/validation splits and the accuracy callback.
# NOTE(review): this cell only constructs the Trainer (and has no execution
# count); training itself requires a later `trainer.train()` call, which is not
# visible in this part of the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=data_full.tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)