In [4]:
import json
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import torch
from setfit import SetFitModel, SetFitTrainer
from datasets import Dataset
from sentence_transformers.losses import CosineSimilarityLoss
import numpy as np

In [2]:
# Category name -> integer class id used as the classification target.
numerize_labels = {'Politics':0, 'Health':1, 'Finance':2, 'Travel':3, 'Food':4, 'Education':5,
       'Environment':6, 'Fashion':7, 'Science':8, 'Sports':9, 'Technology':10, 'Entertainment':11}

# Load the labelled examples: a JSON object mapping each category name to a
# list of sentences. JSON text is UTF-8, so pin the encoding instead of
# relying on the platform default (which is not UTF-8 on every OS).
with open('./data/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten to one record per sentence, in file order, with a running index.
# An unknown category name raises KeyError here rather than being skipped.
labelled_sentences = (
    (numerize_labels[label], sentence)
    for label, sentences in data.items()
    for sentence in sentences
)
formatted_data = [
    {'label': label_id, 'sentence': sentence, 'idx': idx}
    for idx, (label_id, sentence) in enumerate(labelled_sentences)
]

# Create a HuggingFace Dataset
dataset = Dataset.from_list(formatted_data)

In [3]:
# Start from a pretrained sentence-transformers checkpoint.
setfit_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

# Trainer options gathered in one place so they are easy to scan and tweak.
# NOTE(review): eval_dataset is the same object as train_dataset, so the
# accuracy returned by evaluate() is measured on the training examples.
setfit_trainer_options = {
    "model": setfit_model,
    "train_dataset": dataset,
    "eval_dataset": dataset,
    "loss_class": CosineSimilarityLoss,
    "metric": "accuracy",
    "batch_size": 16,
    "num_iterations": 20,
    "num_epochs": 1,
    # The dataset stores the input under "sentence"; map it to the "text"
    # column name given to the trainer.
    "column_mapping": {"sentence": "text", "label": "label"},
}
trainer = SetFitTrainer(**setfit_trainer_options)

trainer.train()
trainer.evaluate()

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 1440
  Batch size = 16
  Num epochs = 1
  Total optimization steps = 90


  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

{'embedding_loss': 0.1765, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.01}
{'embedding_loss': 0.0137, 'learning_rate': 9.876543209876543e-06, 'epoch': 0.56}
{'train_runtime': 525.9777, 'train_samples_per_second': 2.738, 'train_steps_per_second': 0.171, 'epoch': 1.0}


***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 1.0}

In [14]:
# Minimum top-class probability for a prediction to be kept as a pseudo-label.
CONFIDENCE_THRESHOLD = 0.8

with open('./data/test_shuffle.txt', 'r', encoding='utf-8') as f:
    test = f.readlines()

# Remove newline characters
test = [line.strip() for line in test]
reverse_numerize_labels = {v: k for k, v in numerize_labels.items()}

# Blank lines carry no text to classify, so they are never pseudo-labelled.
candidate_sentences = [sentence for sentence in test if sentence]

# Score every candidate in one batched call instead of one model call per
# sentence, then ask for labels only for the confident subset.
good_preds = []
if candidate_sentences:
    probabilities = np.asarray(setfit_model.predict_proba(candidate_sentences))
    confident_sentences = [
        sentence
        for sentence, sentence_probs in zip(candidate_sentences, probabilities)
        if sentence_probs.max() > CONFIDENCE_THRESHOLD
    ]
    if confident_sentences:
        predicted_ids = setfit_model.predict(confident_sentences)
        good_preds = [
            (sentence, reverse_numerize_labels[int(label_id)])
            for sentence, label_id in zip(confident_sentences, predicted_ids)
        ]

In [None]:
DATA_PATH = "./data/train.json"

# Reload the labelled data so this cell always starts from the on-disk state,
# no matter how many times it is re-run.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# Copy every per-label list: dict.copy() is shallow, so appending to the
# copy's lists would also modify the lists held by `data`.
data_augmented = {label: list(sentences) for label, sentences in data.items()}
for sentence, label in good_preds:
    data_augmented[label].append(sentence)

# Build the (Text, Label) frame from the augmented data so the pseudo-labelled
# sentences are actually part of what gets trained on. One record per sentence
# also copes with labels that have different numbers of sentences.
df = pd.DataFrame(
    [
        (sentence, label)
        for label, sentences in data_augmented.items()
        for sentence in sentences
    ],
    columns=['Text', 'Label'],
)

numerize_labels = {'Politics':0, 'Health':1, 'Finance':2, 'Travel':3, 'Food':4, 'Education':5,
       'Environment':6, 'Fashion':7, 'Science':8, 'Sports':9, 'Technology':10, 'Entertainment':11}

# Replace category names by their integer class ids.
df['Label'] = df['Label'].map(numerize_labels)
df.head()
    

In [None]:
# Step 1: Split the dataset into training and validation sets.
# A fixed-seed, per-label sample keeps every class represented in validation
# and makes the split reproducible; the remaining rows form the training set.
VAL_FRACTION = 0.2
SPLIT_SEED = 42
val_df = df.groupby('Label').sample(frac=VAL_FRACTION, random_state=SPLIT_SEED)
train_df = df.drop(index=val_df.index)
assert len(train_df) > 0 and len(val_df) > 0, "train/validation split produced an empty set"

# Step 2: Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(numerize_labels))
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 3: Tokenize the text data using the BERT tokenizer
train_encodings = tokenizer(train_df['Text'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_df['Text'].tolist(), truncation=True, padding=True)

# Step 4: Convert the tokenized data into input features compatible with BERT
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_df['Label'].tolist(),
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_df['Label'].tolist(),
})

# Step 5: Configure the training run
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=300,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Step 6: Train the model
trainer.train()

# Step 7: Evaluate the model
trainer.evaluate()


In [None]:
file_path = "./data/test_shuffle.txt"

with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# One sentence per line; the row number is the submission ID. Only drop the
# final entry when it is the empty string left by a trailing newline, so a
# file without a trailing newline does not lose its last sentence.
sentences = text.split("\n")
if sentences and sentences[-1] == "":
    sentences.pop()

# Class id -> category name, looked up by value instead of dict position.
id_to_label = {label_id: name for name, label_id in numerize_labels.items()}

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
records = []
with torch.no_grad():
    for i, sentence in enumerate(sentences):
        inputs = tokenizer.encode_plus(sentence, truncation=True, padding=True, return_tensors="pt")
        # Put the inputs on the same device as the model (it may not be the CPU).
        inputs = inputs.to(model.device)
        outputs = model(**inputs)
        predicted_label = torch.argmax(outputs.logits, dim=-1).item()
        records.append({'ID': i, 'Label': id_to_label[predicted_label]})

# Build the frame once rather than concatenating a one-row frame per sentence.
result = pd.DataFrame(records, columns=['ID', 'Label'])
result.to_csv("result3.csv", index=False)