In [1]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from sklearn.metrics import f1_score

In [2]:
import torch
import transformers
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config, Trainer, TrainingArguments

In [3]:
!pip install accelerate -U



In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [5]:
config = GPT2Config.from_pretrained('devashat/244-final', num_labels=18)
model = GPT2ForSequenceClassification.from_pretrained('devashat/244-final', config=config)
model.config.pad_token_id = tokenizer.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at devashat/244-final and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import pandas as pd

train_data_path = 'train.csv'
train_data = pd.read_csv(train_data_path)

test_data_path = 'test.csv'
test_data = pd.read_csv(test_data_path)

In [7]:
train_data.drop('IOB Slot tags', axis=1)

Unnamed: 0,utterances,Core Relations
0,who plays luke on star wars new hope,movie.starring.actor movie.starring.character
1,show credits for the godfather,movie.starring.actor
2,who was the main actor in the exorcist,movie.starring.actor
3,who played dory on finding nemo,movie.starring.actor movie.starring.character
4,who was the female lead in resident evil,movie.starring.actor actor.gender
...,...,...
2248,revenue for titanic,movie.gross_revenue
2249,total titanic revenues,movie.gross_revenue
2250,what was the revenue for toy story 3,movie.gross_revenue
2251,dark knight revenue,movie.gross_revenue


In [8]:
train_data = train_data.dropna(subset=['Core Relations'])
train_data['Core Relations'].fillna('none', inplace=True)
train_data['Core Relations'] = train_data['Core Relations'].astype(str)
train_data.reset_index(drop=True, inplace=True)

In [9]:
unique_core_relations = set()
for relations in train_data['Core Relations']:
    unique_core_relations.update(relations.split())

unique_core_relations = sorted(list(unique_core_relations))

unique_core_relations


['actor.gender',
 'gr.amount',
 'movie.country',
 'movie.directed_by',
 'movie.estimated_budget',
 'movie.genre',
 'movie.gross_revenue',
 'movie.initial_release_date',
 'movie.language',
 'movie.locations',
 'movie.music',
 'movie.produced_by',
 'movie.production_companies',
 'movie.rating',
 'movie.starring.actor',
 'movie.starring.character',
 'movie.subjects',
 'person.date_of_birth']

In [10]:
from sklearn.model_selection import train_test_split

train_split, validation_split = train_test_split(train_data, test_size=0.1, train_size=0.9)

In [11]:
def one_hot_encoding(data):

    one_hot_vectors = []

    for _, row in data.iterrows():
        one_hot_vector = dict.fromkeys(unique_core_relations, 0)
        for relation in row['Core Relations'].split():
            if relation in one_hot_vector:
                one_hot_vector[relation] = 1
        one_hot_vectors.append(one_hot_vector)

    return one_hot_vectors


In [12]:
vectors = one_hot_encoding(train_data)

one_hot_encoded_df = pd.DataFrame(vectors)

one_hot_encoded_df.head()

Unnamed: 0,actor.gender,gr.amount,movie.country,movie.directed_by,movie.estimated_budget,movie.genre,movie.gross_revenue,movie.initial_release_date,movie.language,movie.locations,movie.music,movie.produced_by,movie.production_companies,movie.rating,movie.starring.actor,movie.starring.character,movie.subjects,person.date_of_birth
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [13]:
import numpy as np
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        # encodings should be a dictionary with keys like 'input_ids', 'attention_mask', etc.
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Since encodings is a dictionary, we correctly use .items() here
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)



In [14]:
# Preprocess the validation and test datasets
train_texts = train_split['utterances'].tolist()
validation_texts = validation_split['utterances'].tolist()
# Replace with actual test data texts
# test_texts = test_data['utterances'].tolist()

In [15]:
# Tokenize all texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)

# You've tokenized train_texts here correctly
# encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)

# Assuming you've correctly prepared your labels as numpy arrays
#train_labels = np.array(one_hot_encoding(train_split))
#validation_labels = np.array(one_hot_encoding(validation_split))

# Now, create the datasets with the correct encodings and labels
#train_dataset = CustomDataset(encodings, train_labels)

# For validation data, ensure you also tokenize it
validation_encodings = tokenizer(validation_texts, truncation=True, padding=True, max_length=128)
#validation_dataset = CustomDataset(validation_encodings, validation_labels)



In [16]:
# Convert the list of dictionaries (one-hot encoded vectors) into a 2D list or numpy array
def convert_to_matrix(vectors):
    return np.array([list(vec.values()) for vec in vectors])

# Convert your one-hot encoded labels into a format suitable for tensor conversion
train_labels_matrix = convert_to_matrix(one_hot_encoding(train_split))
validation_labels_matrix = convert_to_matrix(one_hot_encoding(validation_split))

# Then, use these matrices when creating your datasets
train_dataset = CustomDataset(train_encodings, train_labels_matrix)
#validation_encodings = tokenizer(validation_texts, truncation=True, padding=True, max_length=128)
validation_dataset = CustomDataset(validation_encodings, validation_labels_matrix)


In [17]:
# train_labels = np.array(one_hot_encoding(train_split))
# validation_labels = np.array(one_hot_encoding(validation_split))
# # test_labels = np.array(one_hot_encoding(test_data))

# train_dataset = CustomDataset(train_texts, train_labels)
# validation_dataset = CustomDataset(validation_texts, validation_labels)
# # test_dataset = CustomDataset(test_texts, test_labels)

In [18]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def compute_metrics(pred):
    # Get the predictions and labels
    logits = pred.predictions
    true_labels = pred.label_ids

    # Apply a sigmoid to the logits and round to get binary predictions
    # This step converts logits to probabilities, then to binary values (0 or 1)
    preds = np.where(torch.sigmoid(torch.tensor(logits)).numpy() > 0.5, 1, 0)

    # Compute the F1 score, considering each label independently
    f1 = f1_score(true_labels, preds, average='weighted')
    accuracy = accuracy_score(true_labels, preds)

    return {
        'f1': f1,
        'accuracy': accuracy,
    }


In [19]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=1e-3,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0978,0.097668,0.653651,0.579487
2,0.0762,0.124866,0.66313,0.579487
3,0.0315,0.051832,0.869969,0.841026


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=657, training_loss=0.10640439386600048, metrics={'train_runtime': 45.034, 'train_samples_per_second': 116.312, 'train_steps_per_second': 14.589, 'total_flos': 53471488573440.0, 'train_loss': 0.10640439386600048, 'epoch': 3.0})

In [22]:
finetuned_model = '244-finetuned'
trainer.save_model(finetuned_model)

In [23]:
!pip install huggingface_hub



In [24]:
from huggingface_hub import notebook_login

notebook_login()
#KEY: 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
model.push_to_hub(finetuned_model, use_temp_dir=False)
tokenizer.push_to_hub(finetuned_model, use_temp_dir=False)

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/devashat/244-finetuned/commit/46e8b86dc0526cc35abb3d76f63f031d782f3040', commit_message='Upload tokenizer', commit_description='', oid='46e8b86dc0526cc35abb3d76f63f031d782f3040', pr_url=None, pr_revision=None, pr_num=None)

In [26]:
import pandas as pd
import numpy as np
from transformers import pipeline

# Tokenize the test data
test_texts = test_data['utterances'].tolist()
#test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

# Move to the same device as the model
#test_encodings = {key: val.to(model.device) for key, val in test_encodings.items()}


# Create a pipeline for multi-label classification
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0, return_all_scores=True)

# Predict
predictions = classifier(test_texts)

# Define a threshold for determining label presence
threshold = 0.5

predicted_labels = []
for prediction in predictions:
    # Convert LABEL_X to actual label using unique_core_relations
    # Extract the label index from each prediction
    labels = [unique_core_relations[int(pred['label'].split('_')[-1])] for pred in prediction if pred['score'] > threshold]
    predicted_labels.append(labels)

# Join multiple labels by a separator if there are multiple
predicted_labels_joined = ["; ".join(labels) for labels in predicted_labels]

# Create a DataFrame to save to CSV
predictions_df = pd.DataFrame({
    "utterances": test_texts,
    "Core Relations": predicted_labels_joined
})

# Define the path where you want to save the predictions CSV
predictions_csv_path = 'predictions.csv'
predictions_df.to_csv(predictions_csv_path, index=False)

predictions_csv_path
