In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

seed = 18

In [4]:
merged_resistance_df = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/essay_level/merged_themes_essay_level/merged_Resistance_essay_level_batch_1.csv", encoding='utf-8')

# Shuffle the merged dataset
merged_resistance_df = shuffle(merged_resistance_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_resistance_df, test_size=0.2, random_state=18, stratify=merged_resistance_df['label'])

training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [5]:
training_df

Unnamed: 0,essay,label,phrase
0,I am here to take this physics class so I can ...,1,['I believe I will succeed in making my goal o...
1,"I am sitting here, in this building, at this s...",0,['0']
2,I think this is kind of a hard question to ans...,0,['0']
3,"Why am I here? Honestly, I think I'm here by s...",0,['0']
4,"I am here because of course, its a requirement...",0,['0']
...,...,...,...
937,Why Am I Here? The reason I am here is because...,0,['0']
938,The most reason why that I am taking this cour...,0,['0']
939,Although I do enjoy physics and find it intere...,0,['0']
940,"My Dead Donald,As I write to you from the war ...",0,['0']


In [6]:
import torch
from transformers import BertTokenizer
from torch.utils.data import Dataset

class EssayDataset(Dataset):
    def __init__(self, essays, snippets, tokenizer, max_length=512):
        self.encodings = []
        self.labels = []

        for essay, snippet_list in zip(essays, snippets):
            encoding = tokenizer(essay, truncation=True, padding='max_length', max_length=max_length)
            labels = [0] * max_length
            
            if snippet_list != ['0']:  # Only process snippets if they are not empty
                for snippet in snippet_list:
                    snippet_encoding = tokenizer(snippet, truncation=True, padding='max_length', max_length=max_length)
                    for idx, (token, snippet_token) in enumerate(zip(encoding['input_ids'], snippet_encoding['input_ids'])):
                        if token == snippet_token and token != tokenizer.pad_token_id:
                            labels[idx] = 1  # Mark as relevant snippet
            
            self.encodings.append(encoding)
            self.labels.append(labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings[idx].items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [12]:
import ast

essays = merged_resistance_df["essay"].to_list()
snippets = merged_resistance_df['phrase'].to_list()
snippets = [ast.literal_eval(item) for item in snippets]

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create dataset
dataset = EssayDataset(essays, snippets, tokenizer)

In [13]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

# Load pre-trained BERT model for token classification
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Split dataset into training and evaluation sets
train_size = int(0.8 * len(dataset))
train_dataset, eval_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train the model
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/708 [00:00<?, ?it/s]

IndexError: list index out of range