In [15]:
from datasets import load_dataset, Dataset, concatenate_datasets
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset
import json
import csv
import nlpaug.augmenter.word.back_translation as nat
import nlpaug.augmenter.word.synonym as nas
import numpy as np

In [2]:
# Map each topic name to the integer class id used as the training label.
numerize_labels = {'Politics':0, 'Health':1, 'Finance':2, 'Travel':3, 'Food':4, 'Education':5,
       'Environment':6, 'Fashion':7, 'Science':8, 'Sports':9, 'Technology':10, 'Entertainment':11}

# Load the labelled examples. The flattening below expects the JSON to be a
# mapping of label name -> list of sentences.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('./data/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten to one record per sentence with a running integer index.
# A label name that is missing from numerize_labels raises KeyError here.
formatted_data = [
    {'label': numerize_labels[label], 'sentence': sentence, 'idx': idx}
    for idx, (label, sentence) in enumerate(
        (label, sentence)
        for label, sentences in data.items()
        for sentence in sentences
    )
]

# Create a HuggingFace Dataset with columns 'label', 'sentence' and 'idx'.
dataset = Dataset.from_list(formatted_data)

In [3]:
# Show the dataset summary (feature names and row count) as the cell output.
dataset

Dataset({
    features: ['label', 'sentence', 'idx'],
    num_rows: 36
})

In [4]:
# Load the pretrained sentence-transformer checkpoint as a SetFit model.
# The load log recorded for this cell reports that no model_head.pkl was found,
# so the classification head starts from random weights and must be trained.
model = SetFitModel.from_pretrained('sentence-transformers/paraphrase-mpnet-base-v2')

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [5]:
# Trainer for the first fine-tuning round on the labelled seed dataset.
trainer = SetFitTrainer(
    model=model,
    train_dataset=dataset,
    # NOTE(review): evaluation reuses the training rows, so the reported
    # accuracy is not a held-out estimate.
    eval_dataset=dataset,
    # The dataset's "sentence" column is mapped to "text"; "label" keeps its name.
    column_mapping={"sentence": "text", "label": "label"},
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=16,
    num_iterations=20,
    num_epochs=1,
)

  trainer = SetFitTrainer(
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [6]:
# Run the first fine-tuning round on the labelled seed dataset.
trainer.train()

***** Running training *****
  Num unique pairs = 1440
  Batch size = 16
  Num epochs = 1
  Total optimization steps = 90


Step,Training Loss


In [7]:
# Compute the trainer's configured metric on its eval_dataset and print it.
# NOTE(review): the trainer was built with eval_dataset=dataset (the training
# rows), so this score measures fit to the training data, not generalisation.
metrics = trainer.evaluate()
print(metrics)

***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 1.0}


In [8]:
# Read the unlabelled test sentences, one per line, with surrounding whitespace
# (including the trailing newline) removed. Blank lines are kept so that each
# sentence's position still matches its line number in the file.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('./data/test_shuffle.txt', 'r', encoding='utf-8') as f:
    test = [line.strip() for line in f]

In [9]:
# Pseudo-labelling: keep the test sentences whose highest class probability
# exceeds the threshold, paired with the class id the model predicts for them.
CONFIDENCE_THRESHOLD = 0.8

good_preds = []
if test:  # skip the model calls entirely when there is nothing to score
    # Score the whole list in two batched calls instead of calling the model
    # twice for every individual sentence.
    all_probs = model.predict_proba(test)
    all_labels = model.predict(test)
    for sentence, probs, label in zip(test, all_probs, all_labels):
        if float(probs.max()) > CONFIDENCE_THRESHOLD:
            # Store a plain int rather than a tensor: tensors hash by identity,
            # so using them as dict keys or dataset values misbehaves downstream.
            good_preds.append((sentence, int(label)))
print(len(good_preds))

141


In [16]:
# Indices of the pseudo-labelled rows continue after the original dataset's rows.
starting_idx = len(dataset)

# One record per confident prediction. The label is coerced to a plain int so
# the new rows have the same label type as the original dataset.
new_data = [
    {'label': int(label), 'sentence': sentence, 'idx': starting_idx + offset}
    for offset, (sentence, label) in enumerate(good_preds)
]

if new_data:
    # Reuse the original features so both datasets share one schema when concatenated.
    additional_dataset = Dataset.from_list(new_data, features=dataset.features)
    # Concatenate the original dataset with the additional dataset
    updated_dataset = concatenate_datasets([dataset, additional_dataset])
else:
    # No prediction passed the confidence threshold: nothing to append.
    additional_dataset = None
    updated_dataset = dataset

print(updated_dataset)

Dataset({
    features: ['label', 'sentence', 'idx'],
    num_rows: 177
})


In [17]:
# Trainer for the second round: the same model object is trained again, this
# time on the seed rows plus the pseudo-labelled rows.
updated_trainer = SetFitTrainer(
    model=model,
    train_dataset=updated_dataset,
    # NOTE(review): evaluation reuses the training rows, so any reported
    # accuracy is not a held-out estimate.
    eval_dataset=updated_dataset,
    # The dataset's "sentence" column is mapped to "text"; "label" keeps its name.
    column_mapping={"sentence": "text", "label": "label"},
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=16,
    num_iterations=20,
    num_epochs=1,
)

  updated_trainer = SetFitTrainer(
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/177 [00:00<?, ? examples/s]

In [18]:
# Run the second fine-tuning round on the dataset extended with pseudo-labels.
updated_trainer.train()

***** Running training *****
  Num unique pairs = 7080
  Batch size = 16
  Num epochs = 1
  Total optimization steps = 443


Step,Training Loss


In [19]:
# Predict a class for every test sentence with the retrained model.
predicted_labels=updated_trainer.model(test)

In [20]:
# Convert each predicted label to a plain Python int.
predicted_labels = list(map(int, predicted_labels))

In [21]:
# Invert the name -> id mapping so class ids can be turned back into label names.
reverse_numerize_labels = dict(
    (class_id, name) for name, class_id in numerize_labels.items()
)

# Pair each test row's position with the name of its predicted label.
predictions = [
    (row_id, reverse_numerize_labels[class_id])
    for row_id, class_id in enumerate(predicted_labels)
]

In [22]:
# Write the predictions as CSV: a header row followed by one (ID, Label) row
# per test sentence.
header = ['ID', 'Label']
with open('predictions.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Header and data rows go out in a single call, header first.
    writer.writerows([header, *predictions])

In [25]:
# Print every confident pseudo-labelled sentence followed by its label name.
for sentence, label in good_preds:
    label_name = reverse_numerize_labels[int(label)]
    print(f"{sentence} {label_name}")

The impact of overpopulation on the environment is a topic of ongoing research. Environment
The benefits of using digital fashion in fashion are many, including reduced waste and improved creativity. Fashion
The political climate is becoming increasingly tense. Politics
The importance of size inclusivity in fashion cannot be overstated. Fashion
The opposition party is calling for a referendum on the issue. Politics
The impact of fashion on cultural diversity is a topic of ongoing research. Fashion
The American Diabetes Association recommends regular exercise and a healthy diet to manage blood sugar levels. Health
The impact of market volatility on investing is a topic of concern. Finance
The importance of vegan fashion in promoting animal welfare cannot be overemphasized. Fashion
The benefits of regular exercise for bone health are well documented. Health
The impact of fashion on cultural identity is a topic of ongoing research. Fashion
The impact of acid rain on ecosystems is a topic 

In [26]:
def count_sentences_per_label(good_preds):
    """Tally how many (sentence, label) pairs fall under each label.

    Args:
        good_preds: Iterable of (sentence, label) pairs. Each label is
            converted with int() before it is used as a key.

    Returns:
        A dict mapping each integer label to the number of pairs carrying it,
        with keys in the order the labels were first seen.
    """
    tally = {}
    for _sentence, raw_label in good_preds:
        key = int(raw_label)
        # Start from 0 for a label seen for the first time, then add one.
        tally[key] = tally.get(key, 0) + 1
    return tally

# Example usage:
# Tally the confident pseudo-labels per class id and print the resulting dict.
# NOTE(review): the saved output of this cell shows `tensor(...)` keys that are
# each counted once, which does not match the int() keys the function builds —
# presumably a stale output from an earlier version; re-run to confirm.
label_counts = count_sentences_per_label(good_preds)
print(label_counts)

{tensor(6): 1, tensor(7): 1, tensor(0): 1, tensor(7): 1, tensor(0): 1, tensor(7): 1, tensor(1): 1, tensor(2): 1, tensor(7): 1, tensor(1): 1, tensor(7): 1, tensor(6): 1, tensor(6): 1, tensor(5): 1, tensor(2): 1, tensor(7): 1, tensor(6): 1, tensor(7): 1, tensor(6): 1, tensor(7): 1, tensor(7): 1, tensor(6): 1, tensor(7): 1, tensor(6): 1, tensor(7): 1, tensor(7): 1, tensor(6): 1, tensor(7): 1, tensor(2): 1, tensor(1): 1, tensor(6): 1, tensor(0): 1, tensor(9): 1, tensor(7): 1, tensor(1): 1, tensor(2): 1, tensor(0): 1, tensor(7): 1, tensor(9): 1, tensor(7): 1, tensor(0): 1, tensor(7): 1, tensor(9): 1, tensor(6): 1, tensor(1): 1, tensor(6): 1, tensor(1): 1, tensor(8): 1, tensor(1): 1, tensor(7): 1, tensor(2): 1, tensor(0): 1, tensor(7): 1, tensor(7): 1, tensor(6): 1, tensor(2): 1, tensor(7): 1, tensor(7): 1, tensor(2): 1, tensor(3): 1, tensor(1): 1, tensor(7): 1, tensor(2): 1, tensor(1): 1, tensor(7): 1, tensor(5): 1, tensor(6): 1, tensor(6): 1, tensor(7): 1, tensor(7): 1, tensor(9): 1, tenso