In [1]:
import csv
import json

import nlpaug.augmenter.word.back_translation as nat
import nlpaug.augmenter.word.synonym as nas
import torch
from datasets import load_dataset, Dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset

2024-04-10 20:36:54.172106: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Map each topic name to the integer class id stored in the dataset.
numerize_labels = {'Politics':0, 'Health':1, 'Finance':2, 'Travel':3, 'Food':4, 'Education':5,
       'Environment':6, 'Fashion':7, 'Science':8, 'Sports':9, 'Technology':10, 'Entertainment':11}

# Load the training JSON. The encoding is pinned so decoding does not depend on
# the platform's locale default.
with open('./data/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the {label: [sentence, ...]} mapping into one record per sentence.
# `idx` numbers the records in iteration order, starting at 0.
# A label missing from `numerize_labels` raises KeyError, as before.
labelled_sentences = (
    (label, sentence)
    for label, sentences in data.items()
    for sentence in sentences
)
formatted_data = [
    {'label': numerize_labels[label], 'sentence': sentence, 'idx': idx}
    for idx, (label, sentence) in enumerate(labelled_sentences)
]

# Create a HuggingFace Dataset from the flat records.
dataset = Dataset.from_list(formatted_data)

In [3]:
# Echo the Dataset repr (feature names and row count) as a quick sanity check.
dataset

Dataset({
    features: ['label', 'sentence', 'idx'],
    num_rows: 36
})

In [5]:
# Two augmenters: back-translation and WordNet synonym substitution.
bt_aug = nat.BackTranslationAug()
syn_aug = nas.SynonymAug(aug_src='wordnet')

# For every source sentence we emit three records, in this order:
# the original text, its back-translation, and its synonym-substituted variant.
# Each augmenter result is indexed with [0] to take the first returned string.
variant_makers = (
    lambda text: text,
    lambda text: bt_aug.augment(text)[0],
    lambda text: syn_aug.augment(text)[0],
)

# Records share the source sentence's label; `idx` is the record's position in
# the output list, so it runs 0, 1, 2, ... across all variants.
augmented_data = []
for label, sentence in zip(dataset['label'], dataset['sentence']):
    for make_variant in variant_makers:
        augmented_data.append({
            'label': label,
            'sentence': make_variant(sentence),
            'idx': len(augmented_data),
        })

In [6]:
# Wrap the list of augmented records in a HuggingFace Dataset.
augmented_dataset = Dataset.from_list(augmented_data)

In [14]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of raising on `.to('cuda')`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the pretrained sentence-transformer body as a SetFit model on that device.
model = SetFitModel.from_pretrained('sentence-transformers/paraphrase-mpnet-base-v2').to(device)

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [15]:
# Configure the SetFit trainer on the augmented sentences.
# NOTE(review): eval_dataset is the very same dataset used for training, so any
# metric computed from this trainer reflects fit to the training data rather
# than generalization. Consider holding out un-augmented examples for evaluation.
trainer = SetFitTrainer(
    model=model,
    # Data: the dataset's "sentence" column is exposed to the trainer as "text".
    train_dataset=augmented_dataset,
    eval_dataset=augmented_dataset,
    column_mapping={"sentence": "text", "label": "label"},
    # Objective and reported metric.
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    # Training schedule.
    batch_size=16,
    num_iterations=20,
    num_epochs=1,
)

  trainer = SetFitTrainer(
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/108 [00:00<?, ? examples/s]

In [16]:
# Launch training with the settings configured on `trainer`.
trainer.train()

***** Running training *****
  Num unique pairs = 4320
  Batch size = 16
  Num epochs = 1
  Total optimization steps = 270


Step,Training Loss


In [17]:
# Score the model on the trainer's eval_dataset and print the resulting metrics.
# NOTE(review): the trainer was built with eval_dataset=augmented_dataset, i.e.
# the training data, so this figure is training accuracy, not held-out accuracy.
metrics = trainer.evaluate()
print(metrics)

***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 1.0}


In [18]:
# Read the unlabeled test sentences, one per line.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('./data/test_shuffle.txt', 'r', encoding='utf-8') as f:
    # Strip surrounding whitespace (including the trailing newline) from each line.
    # Blank lines are kept as empty strings so list positions still match line numbers.
    test = [line.strip() for line in f]

In [19]:
# Run the model on the whole list of test sentences in one call.
# The result is converted to plain ints and mapped back to label names further down.
predicted_labels=model(test)

In [20]:
# Convert each predicted value to a plain Python int so it can be used as a dict key.
predicted_labels = list(map(int, predicted_labels))

In [21]:
# Invert the label mapping: integer class id -> topic name.
reverse_numerize_labels = {class_id: name for name, class_id in numerize_labels.items()}

# Pair every prediction with its position in the test list, which serves as the row ID.
predictions = [
    (row_id, reverse_numerize_labels[class_id])
    for row_id, class_id in enumerate(predicted_labels)
]

In [22]:
# Persist the (ID, Label) pairs as a CSV file, header row first.
# newline='' lets the csv module control line endings itself.
header = ('ID', 'Label')
with open('predictions.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows([header, *predictions])