In [1]:
from datasets import load_dataset, Dataset, concatenate_datasets
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset
import json
import csv
import nlpaug.augmenter.word.back_translation as nat
import nlpaug.augmenter.word.synonym as nas
import numpy as np

2024-04-11 15:08:39.709825: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Map each topic name to the integer class id used for training.
numerize_labels = {
    'Politics': 0, 'Health': 1, 'Finance': 2, 'Travel': 3, 'Food': 4, 'Education': 5,
    'Environment': 6, 'Fashion': 7, 'Science': 8, 'Sports': 9, 'Technology': 10, 'Entertainment': 11,
}

# Load the labelled seed sentences. The code below iterates the JSON as
# {label name: [sentence, ...]}.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open('./data/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten into one record per sentence, numbered consecutively from 0.
# An unknown label name raises KeyError here rather than being skipped.
labelled_pairs = (
    (numerize_labels[label_name], sentence)
    for label_name, sentences in data.items()
    for sentence in sentences
)
formatted_data = [
    {'label': label_id, 'sentence': sentence, 'idx': idx}
    for idx, (label_id, sentence) in enumerate(labelled_pairs)
]

# Create a HuggingFace Dataset
dataset = Dataset.from_list(formatted_data)

In [4]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['label', 'sentence', 'idx'],
    num_rows: 36
})

In [5]:
# Load the pretrained sentence-transformer checkpoint wrapped as a SetFit model.
model = SetFitModel.from_pretrained('sentence-transformers/paraphrase-mpnet-base-v2')

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [6]:
# Few-shot trainer over the labelled seed sentences.
# NOTE(review): eval_dataset is the same object as train_dataset, so any
# metric this trainer reports is measured on the data it was trained on.
trainer = SetFitTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=16,
    num_iterations=20,
    num_epochs=1,
    # The dataset column "sentence" is exposed to the trainer as "text".
    column_mapping={"sentence": "text", "label": "label"},
)

  trainer = SetFitTrainer(
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [7]:
# Fine-tune `model` on the labelled seed dataset configured in `trainer`.
trainer.train()

***** Running training *****
  Num unique pairs = 1440
  Batch size = 16
  Num epochs = 1
  Total optimization steps = 90


Step,Training Loss


In [8]:
# NOTE(review): `trainer` was built with eval_dataset=dataset, the same data it
# was trained on, so this reports training-set accuracy, not held-out accuracy.
metrics = trainer.evaluate()
print(metrics)

***** Running evaluation *****


{'accuracy': 1.0}


In [9]:
# Read the unlabeled test sentences, one per line, trimming surrounding
# whitespace (including the trailing newline) from each.
with open('./data/test_shuffle.txt', 'r') as test_file:
    test = [raw_line.strip() for raw_line in test_file]

In [13]:
# Pseudo-label the test sentences, keeping only those whose highest class
# probability exceeds the threshold.
CONFIDENCE_THRESHOLD = 0.23

good_preds = []
for text in test:
    probabilities = model.predict_proba(text)
    top_probability = probabilities[np.argmax(probabilities)]
    if top_probability > CONFIDENCE_THRESHOLD:
        # Store the sentence together with the label the model assigns to it.
        good_preds.append((text, model(text)))

print(len(good_preds))

1101


In [14]:
# Inverse of numerize_labels: integer class id -> label name.
reverse_numerize_labels = {label_id: name for name, label_id in numerize_labels.items()}

In [15]:
def count_sentences_per_label(good_preds):
    """Tally how many (sentence, label) pairs fall under each label name.

    Args:
        good_preds: Iterable of ``(sentence, label)`` pairs. Each ``label`` is
            converted with ``int()`` and looked up in the notebook-level
            ``reverse_numerize_labels`` mapping.

    Returns:
        dict mapping label name to the number of pairs carrying that label,
        with keys in order of first appearance.

    Raises:
        KeyError: If a label id is missing from ``reverse_numerize_labels``.
    """
    tally = {}
    for _, raw_label in good_preds:
        name = reverse_numerize_labels[int(raw_label)]
        # Start at zero the first time a label name is seen.
        tally[name] = tally.get(name, 0) + 1
    return tally

# Show how the confident pseudo-labels are distributed across label names.
label_counts = count_sentences_per_label(good_preds)
print(label_counts)

{'Finance': 74, 'Environment': 144, 'Science': 65, 'Politics': 110, 'Fashion': 108, 'Education': 104, 'Health': 167, 'Entertainment': 75, 'Technology': 51, 'Food': 38, 'Travel': 84, 'Sports': 81}


In [16]:
def extract_sentences_per_label(good_preds, max_per_label=40):
    """Keep at most ``max_per_label`` sentences for each predicted label.

    Args:
        good_preds: Iterable of ``(sentence, label)`` pairs. Each ``label`` is
            converted with ``int()``.
        max_per_label: Maximum number of sentences retained per label, taken
            in input order. Defaults to 40, the cap that was previously
            hard-coded. ``None`` keeps every sentence.

    Returns:
        A flat list of ``(sentence, label)`` tuples with integer labels.
        Labels appear in order of first occurrence in ``good_preds``, and
        sentences within a label keep their input order.
    """
    # Group sentences by integer label; dicts preserve first-seen label order.
    grouped = {}
    for sentence, label in good_preds:
        grouped.setdefault(int(label), []).append(sentence)

    # Flatten back to (sentence, label) pairs, capping each label's share.
    capped = []
    for label, sentences in grouped.items():
        capped.extend((sentence, label) for sentence in sentences[:max_per_label])

    return capped

# NOTE: despite its name, `sentences_per_label` is a flat list of
# (sentence, label) tuples, capped per label by extract_sentences_per_label.
sentences_per_label = extract_sentences_per_label(good_preds)

In [17]:
# Continue the `idx` numbering where the existing dataset ends.
starting_idx = len(dataset)

# One record per selected (sentence, label) pair, with the same keys and key
# order as the records used to build `dataset`.
new_data = [
    {'label': label, 'sentence': sentence, 'idx': idx}
    for idx, (sentence, label) in enumerate(sentences_per_label, start=starting_idx)
]

# Use the same constructor as the seed dataset. Unlike indexing new_data[0],
# this does not raise IndexError when no sentences were selected.
additional_dataset = Dataset.from_list(new_data)

# Concatenate the original dataset with the additional dataset
updated_dataset = concatenate_datasets([dataset, additional_dataset])

print(updated_dataset)

Dataset({
    features: ['label', 'sentence', 'idx'],
    num_rows: 514
})


In [18]:
# Second-round trainer over the seed data plus the pseudo-labelled sentences.
# NOTE(review): `model` is the same object passed to the first trainer, so this
# round continues from its current weights. eval_dataset is again the same
# dataset as train_dataset.
updated_trainer = SetFitTrainer(
    model=model,
    train_dataset=updated_dataset,
    eval_dataset=updated_dataset,
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=16,
    num_iterations=20,
    num_epochs=1,
    # The dataset column "sentence" is exposed to the trainer as "text".
    column_mapping={"sentence": "text", "label": "label"},
)

  updated_trainer = SetFitTrainer(
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/514 [00:00<?, ? examples/s]

In [19]:
# Train on the seed data plus the pseudo-labelled sentences.
updated_trainer.train()

***** Running training *****
  Num unique pairs = 20560
  Batch size = 16
  Num epochs = 1
  Total optimization steps = 1285


Step,Training Loss


In [20]:
# Predict a label for every test sentence with the trainer's model.
predicted_labels=updated_trainer.model(test)

In [21]:
# Convert each predicted label to a plain Python int so it can be used as a
# dictionary key.
predicted_labels = list(map(int, predicted_labels))

In [22]:
# Pair each prediction's position in the test list with its label name.
reverse_numerize_labels = {label_id: name for name, label_id in numerize_labels.items()}
predictions = [
    (row_id, reverse_numerize_labels[label_id])
    for row_id, label_id in enumerate(predicted_labels)
]

In [23]:
# Persist the predictions as a two-column CSV file.
with open('predictions.csv', 'w', newline='') as csv_out:
    prediction_writer = csv.writer(csv_out)
    prediction_writer.writerow(['ID', 'Label'])  # header row
    prediction_writer.writerows(predictions)     # one (index, label name) row per prediction

In [26]:
def compare_csv_files(file1, file2, sentences=None):
    """Print every row on which two prediction CSV files disagree.

    Both files are read with ``csv.DictReader`` and must provide ``ID`` and
    ``Label`` columns. Rows are compared pairwise in file order.

    Args:
        file1: Path of the first prediction CSV.
        file2: Path of the second prediction CSV.
        sentences: Optional sequence indexed by the integer ``ID``, used to
            print the text whose labels differ. Defaults to the notebook-level
            ``test`` list.

    Returns:
        None. Differences are reported with ``print``.

    Raises:
        ValueError: If an ``ID`` with mismatching labels is not an integer.
        IndexError: If such an ``ID`` is out of range for ``sentences``.
    """
    # Local import keeps this cell self-contained.
    from itertools import zip_longest

    if sentences is None:
        sentences = test  # notebook-level list of test sentences

    # The csv module expects files opened with newline='' so that newlines
    # embedded in quoted fields are parsed correctly.
    with open(file1, 'r', newline='') as csv_file1, open(file2, 'r', newline='') as csv_file2:
        csv_reader1 = csv.DictReader(csv_file1)
        csv_reader2 = csv.DictReader(csv_file2)

        # zip_longest (instead of zip) so that a length mismatch is reported
        # rather than silently ignoring the longer file's extra rows.
        for row1, row2 in zip_longest(csv_reader1, csv_reader2):
            if row1 is None or row2 is None:
                print("Files have a different number of rows; remaining rows were not compared.")
                break
            if row1['ID'] != row2['ID']:
                print("Ids do not match:")
                print(f"File 1: {row1['ID']} - {row1['Label']}")
                print(f"File 2: {row2['ID']} - {row2['Label']}")
            elif row1['Label'] != row2['Label']:
                print("Labels do not match for:", sentences[int(row1['ID'])])
                print(f"File 1 Label: {row1['Label']}")
                print(f"File 2 Label: {row2['Label']}")

# Diff two saved prediction files.
# NOTE(review): presumably pred30.csv / pred40.csv come from runs with 30 vs 40
# pseudo-labelled sentences per label — confirm; no cell shown here writes them.
file1_path = 'pred30.csv'
file2_path = 'pred40.csv'

compare_csv_files(file1_path, file2_path)


Labels do not match for: The startup accelerator provides funding and mentorship to help early-stage companies grow.
File 1 Label: Technology
File 2 Label: Finance
Labels do not match for: The robotics competition inspires students to pursue careers in STEM fields.
File 1 Label: Science
File 2 Label: Education
Labels do not match for: The computer scientist conducts research to advance the field of artificial intelligence.
File 1 Label: Technology
File 2 Label: Science
Labels do not match for: The sports memorabilia collector owns rare items signed by legendary athletes.
File 1 Label: Sports
File 2 Label: Fashion
Labels do not match for: The best way to find the best local markets is to ask the locals or do your research ahead of time.
File 1 Label: Travel
File 2 Label: Food
Labels do not match for: The sports nutritionist develops customized meal plans for athletes.
File 1 Label: Sports
File 2 Label: Health
Labels do not match for: The art gallery hosted an opening reception for their