In [3]:
import numpy as np
import pandas as pd
import torch
from transformers import pipeline, GPT2Tokenizer, GPT2Config, GPT2ForSequenceClassification

In [4]:
# --- Load the data and build the label vocabulary ---------------------------
train_data_path = 'train.csv'
test_data_path = 'test.csv'

test_data = pd.read_csv(test_data_path)

# Built as a single chain from the raw CSV so that re-running the cell always
# yields the same frame (no in-place mutation of an earlier result).
# NOTE(review): rows without a 'Core Relations' value are dropped, so no
# 'none' placeholder class is ever created; confirm this matches how the
# fine-tuned model's label set was built.
train_data = (
    pd.read_csv(train_data_path)
    # Slot tags are not used for relation classification. DataFrame.drop
    # returns a new frame, so its result has to be kept for the drop to apply.
    .drop(columns=['IOB Slot tags'])
    .dropna(subset=['Core Relations'])
    .astype({'Core Relations': str})
    .reset_index(drop=True)
)

# A row may carry several whitespace-separated relations. The sorted list of
# distinct relation names is the vocabulary; a name's position in this list is
# used as its class index.
unique_core_relations = sorted({
    relation
    for relations in train_data['Core Relations']
    for relation in relations.split()
})

In [5]:
# --- Predict core relations for the test utterances -------------------------
test_texts = test_data['utterances'].tolist()

# The end-of-sequence token is reused as the padding token for the tokenizer
# and, below, for the model config.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

config = GPT2Config.from_pretrained('devashat/244-finetuned', num_labels=18)
model = GPT2ForSequenceClassification.from_pretrained('devashat/244-finetuned', config=config)
model.config.pad_token_id = tokenizer.eos_token_id

# Class index i is decoded below as unique_core_relations[i]. If the vocabulary
# size differs from the classifier head, the decoding would silently mislabel
# predictions (or raise IndexError), so fail loudly here instead.
if len(unique_core_relations) != model.config.num_labels:
    raise ValueError(
        f"Label vocabulary has {len(unique_core_relations)} entries but the model "
        f"predicts {model.config.num_labels} classes; the index-to-label mapping "
        "would be wrong."
    )

# Use the first GPU when one is available, otherwise run on CPU (-1).
device = 0 if torch.cuda.is_available() else -1

# top_k=None returns the score of every class; it replaces the deprecated
# return_all_scores=True.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device, top_k=None)

predictions = classifier(test_texts)

# NOTE(review): the pipeline applies a sigmoid only when the checkpoint config
# declares problem_type="multi_label_classification"; otherwise it applies a
# softmax and at most one class can exceed 0.5. Confirm against the checkpoint.
threshold = 0.5

predicted_labels = []
for prediction in predictions:
    # Convert LABEL_X to its class index X, keeping only classes above the
    # threshold. With top_k=None the entries arrive sorted by score, so the
    # indices are sorted to emit labels in class-index order.
    label_ids = sorted(
        int(pred['label'].split('_')[-1])
        for pred in prediction
        if pred['score'] > threshold
    )
    predicted_labels.append([unique_core_relations[label_id] for label_id in label_ids])

# NOTE(review): training labels are split on whitespace, while predictions are
# joined with ", "; confirm this is the expected output format.
predicted_labels_joined = [", ".join(labels) for labels in predicted_labels]

predictions_df = pd.DataFrame({
    "utterances": test_texts,
    "Core Relations": predicted_labels_joined
})

predictions_csv_path = 'predictions.csv'
predictions_df.to_csv(predictions_csv_path, index=False)

predictions_csv_path




'predictions.csv'