# Key Point Analysis

In [1]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import ContrastiveLoss
from sentence_transformers.models import Transformer, Pooling

from torch.utils.data import DataLoader

import constants as c
import utilities as u

## 1. Data Preprocessing

In [2]:
for data_type in [c.TRAIN, c.TEST, c.DEV]:
    # labels, key point and arguments datasets are loaded
    labels_df = pd.read_csv(c.DATA_DIR + "labels_" + data_type + ".csv")
    kp_df = pd.read_csv(c.DATA_DIR + "key_points_" + data_type + ".csv")
    arg_df = pd.read_csv(c.DATA_DIR + "arguments_" + data_type + ".csv")

    # the datasets are merged together
    result_df = pd.merge(labels_df, arg_df)
    result_df = pd.merge(result_df, kp_df)

    # an additional "topic_key_point" column is created, as the concatenation of the topics and the key points themselves
    result_df[c.TOPIC_KP] = result_df.apply(lambda x: x[c.TOPIC] + " <SEP> " + x[c.KP], axis = 1)
    
    # the unnecessary information are discarded and the dataset is saved 
    result_df = result_df[[c.ARG, c.TOPIC_KP, c.LABEL]]
    result_df.to_csv(c.DATA_DIR + data_type + ".csv", index = None)

## 2. Training the Model

In [3]:
# use the RoBERTa pre-trained language model, fine-tuned for sentence embedding
word_embedding_model = Transformer("roberta-base", max_seq_length=c.DEF_MAX_SEQ_LENGTH)
# add the <SEP> token to the tokenizer
word_embedding_model.tokenizer.add_tokens(["<SEP>"], special_tokens = True)
# resize the embedding matrix to include the new token
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

# pooling aggregates the embeddings of the tokens in a fixed-size vector
pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension())

# the sentence transformer is the concatenation of the word embedding model and the pooling model
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# train samples: list of InputExample objects
train_samples = u.create_samples("train.csv", size=1600)

train_dataloader = DataLoader(train_samples, shuffle=False, batch_size=c.DEF_BATCH_SIZE)
train_loss = ContrastiveLoss(model)

model.fit(
    train_objectives = [(train_dataloader, train_loss)],
    epochs = c.DEF_EPOCHS,
    output_path = c.MODELS_DIR
)

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

## 3. Testing the model

In [9]:
# evaluate the model on the test set
test_samples = u.create_samples("test.csv", type=1)

# Evaluate the model on each test example
correct_predictions = 0
total_predictions = len(test_samples)

for sample in test_samples:
    # Get the embeddings for the argument and keypoint
    arg_embedding = model.encode(sample[c.ARG], convert_to_tensor=True)
    key_embedding = model.encode(sample[c.TOPIC_KP], convert_to_tensor=True)
    
    # Compute the cosine similarity between the embeddings
    similarity = 1 - np.cos(arg_embedding, key_embedding)
    
    # Predict the label based on the similarity
    predicted_label = int(similarity > 0.5)
    
    # Compare the predicted label to the true label and count the number of correct predictions
    if predicted_label == sample[c.LABEL]:
        correct_predictions += 1
        
# Compute the accuracy
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy:.2f}")

TypeError: return arrays must be of ArrayType