# Assignment 2
Emilia Zielinska
CSI 5386 <br>
Dr. Diana Inkpen <br>
March 24, 2024

## Imports and Function Definitions

In [7]:
import cohere
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset    
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

In [2]:
def print_results(pred, path):
    print_data = test_data.drop(['text'], axis=1)
    print_data['label'] = pd.Series(pred)
    print_data.to_json(path, orient='records', lines=True)

In [3]:
# Import data
train_data = pd.read_json(path_or_buf='./train/subtaskA_train_monolingual.jsonl', lines=True)
test_data = pd.read_json(path_or_buf='./test/subtaskA_monolingual.jsonl', lines=True)
gold_data = pd.read_json(path_or_buf='./gold/subtaskA_monolingual.jsonl', lines=True)

In [4]:
# Create the y labels
y_train = train_data['label']
y_gold = gold_data['label']

## SVM Baseline Classifier

In [16]:
# Vectorize the text
vectorizer = TfidfVectorizer(stop_words='english', max_features=59552)
train_vec = vectorizer.fit_transform(train_data['text'])
gold_vec = vectorizer.fit_transform(gold_data['text'])

In [5]:
# Train baseline and get accuracy based on gold standard
baseline = svm.SVC()
baseline.fit(train_vec, y_train)

In [6]:
# Calculate accuracy
y_pred = baseline.predict(gold_vec)
print(accuracy_score(y_true=y_gold, y_pred=y_pred))
print(f1_score(y_true=y_gold, y_pred=y_pred, average='macro'))
print(f1_score(y_true=y_gold, y_pred=y_pred, average='micro'))

0.45777894491129784
0.38915683580868426
0.45777894491129784


In [28]:
# Print results for SVM
print_results(y_pred, 'svm.jsonl')

## BERT Base Uncased Finetuned Model

In [5]:
# Split the training dataset into training and eval datasets
train_dataset, eval_dataset = train_test_split(train_data, test_size=0.2)

# Convert from dataframe objects to dataset objects
train_dataset = Dataset.from_pandas(train_dataset.drop(['id', 'model', 'source'], axis=1))
eval_dataset = Dataset.from_pandas(eval_dataset.drop(['id', 'model', 'source'], axis=1))

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize the data and define the model, training args, and metrics
train_tok = train_dataset.map(tokenize_function, batched=True)
eval_tok = eval_dataset.map(tokenize_function, batched=True)
bert = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)
training_args = TrainingArguments(output_dir="test_trainer")
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# Fine-tune the BERT model
trainer = Trainer(
    model=bert,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss
500,0.3857
1000,0.2869
1500,0.2767
2000,0.3012
2500,0.2324
3000,0.2374
3500,0.2286
4000,0.1994
4500,0.2329
5000,0.2094


TrainOutput(global_step=35928, training_loss=0.13476263444212916, metrics={'train_runtime': 9862.8114, 'train_samples_per_second': 29.141, 'train_steps_per_second': 3.643, 'total_flos': 7.56220639762944e+16, 'train_loss': 0.13476263444212916, 'epoch': 3.0})

In [20]:
# Tokenize the test dataset
test_dataset = Dataset.from_pandas(gold_data.drop(['id', 'label'], axis=1))
test_tok = test_dataset.map(tokenize_function, batched=True)
# Get the predictions of the test dataset
bert_pred = trainer.predict(test_tok)

                                                                                                                                                                                  

In [29]:
# Put all predictions into a list
b_pred = []
for pred in bert_pred.predictions:
    if pred[0] > pred[1]:
        b_pred.append(0)
    else:
        b_pred.append(1)

In [30]:
# Print the metrics
print(accuracy_score(y_true=y_gold, y_pred=b_pred))
print(f1_score(y_true=y_gold, y_pred=b_pred, average='macro'))
print(f1_score(y_true=y_gold, y_pred=b_pred, average='micro'))

0.6977999533146592
0.6783040771963408
0.6977999533146592


In [35]:
# Print the results for BERT
print_results(b_pred, 'bert.jsonl')

## Cohere Fine-tuned Classification Model

In [19]:
# Make JSON file to give to Cohere dashboard
train_jsonl = train_data.drop(['id', 'model', 'source'], axis=1)
train_jsonl.label = train_jsonl.label.astype(str)
train_jsonl.to_json('train.jsonl', orient='records', lines=True)

In [6]:
texts = test_data['text'].tolist()
# Make a list of lists where each inner list is of max size 96 (API limitations)
inputs = []
curr_input = []
for text in texts:
    curr_input.append(text)
    if len(curr_input) == 96:
        inputs.append(curr_input)
        curr_input = []

In [7]:
# Get the Cohere predictions and put them into a list
co = cohere.Client('SECRET') # This is your trial API key
responses = []
for input in inputs:
    response = co.classify(
      model='77a9f33a-4bba-4b4f-9fee-074373d8b122-ft',
      inputs=input)
    responses.append(response)

In [9]:
# Print results for Cohere
cohere_pred = []
for response in responses:
    for pred in response:
        cohere_pred.append(pred.predictions[0])
cohere_pred = [int(x) for x in cohere_pred]
print_results(cohere_pred, 'cohere.jsonl')

In [17]:
print(accuracy_score(y_true=y_gold, y_pred=cohere_pred))
print(f1_score(y_true=y_gold, y_pred=cohere_pred, average='macro'))
print(f1_score(y_true=y_gold, y_pred=cohere_pred, average='micro'))

0.7626633986928104
0.756575262126163
0.7626633986928104
