In [5]:
import json
import pandas as pd


In [3]:
DATA_PATH = "./data/train.json"

# Read with an explicit encoding so the result does not depend on the
# platform's default locale.
with open(DATA_PATH, "r", encoding="utf-8") as train_file:
    data = json.load(train_file)

# Top-level JSON keys become the row index (used as the label below); the
# values under each key are spread across columns.
# NOTE(review): presumably each key maps to a list of sentences - confirm
# against the data file.
examples = pd.DataFrame.from_dict(data, orient='index').stack()

# Keys with fewer values than the longest one are padded with missing cells.
# Whether stack() removes them depends on the pandas version/implementation,
# so drop them explicitly.
examples = examples.dropna().reset_index(level=1, drop=True).reset_index()
examples.columns = ['Label', 'Text']

numerize_labels = {'Politics':0, 'Health':1, 'Finance':2, 'Travel':3, 'Food':4, 'Education':5,
       'Environment':6, 'Fashion':7, 'Science':8, 'Sports':9, 'Technology':10, 'Entertainment':11}

# Fail early on labels the mapping does not know about; Series.map would
# otherwise turn them into NaN without any warning.
unknown_labels = sorted(set(examples['Label']) - set(numerize_labels))
if unknown_labels:
    raise ValueError(f"Labels missing from numerize_labels: {unknown_labels}")

# Final frame: text first, then the integer class id. assign() builds a new
# frame instead of writing into a column-sliced one.
df = examples[['Text', 'Label']].assign(Label=examples['Label'].map(numerize_labels))
df.head()

Unnamed: 0,Text,Label
0,The mayor announced a new initiative to improv...,0
1,The senator is facing criticism for her stance...,0
2,The upcoming election has sparked intense deba...,0
3,Regular exercise and a balanced diet are key t...,1
4,The World Health Organization has issued new g...,1


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import torch

In [100]:
# Step 1: Split the dataset into training and validation sets.
# Counting rows from 1, every third row (3, 6, 9, ...) goes to validation and
# all other rows go to training. Rows are selected by position in one pass
# instead of growing two frames with pd.concat inside a loop, which is
# quadratic and turns every column into object dtype.
val_positions = list(range(2, len(df), 3))
train_positions = [position for position in range(len(df)) if position % 3 != 2]

train_dataset = df.iloc[train_positions][['Text', 'Label']].reset_index(drop=True)
val_dataset = df.iloc[val_positions][['Text', 'Label']].reset_index(drop=True)
        
# Step 2: Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(numerize_labels))
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 3: Tokenize the text data using the BERT tokenizer
train_encodings = tokenizer(list(train_dataset['Text']), truncation=True, padding=True)
val_encodings = tokenizer(list(val_dataset['Text']), truncation=True, padding=True)


# Step 4: Wrap the tokenized data in dataset objects for the Trainer.
# The notebook never imports a `Dataset` class, so the previous
# `Dataset.from_dict(...)` call only worked with leftover kernel state.
# This wrapper relies solely on the already-imported torch.
class EncodedTextDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output and integer class ids.

    Each item is a dict with 'input_ids', 'attention_mask' and 'labels'
    tensors for one example.
    """

    def __init__(self, encodings, labels):
        # Keep only the two encoding fields the previous code passed on.
        self.input_ids = encodings['input_ids']
        self.attention_mask = encodings['attention_mask']
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_mask[idx]),
            'labels': torch.tensor(self.labels[idx]),
        }


# int() normalises the labels to plain Python ints whatever dtype the
# 'Label' column happens to have.
train_dataset = EncodedTextDataset(train_encodings, [int(label) for label in train_dataset['Label']])
val_dataset = EncodedTextDataset(val_encodings, [int(label) for label in val_dataset['Label']])

# Step 5: Configure the training run and build the Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Schedule and regularisation
    # NOTE(review): the recorded run shows 600 optimizer steps in total, so a
    # 500-step warmup covers most of training - confirm this is intended.
    num_train_epochs=300,
    warmup_steps=500,
    weight_decay=0.01,
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)

# Step 6: Fine-tune the model on the training split
trainer.train()

# Step 7: Compute metrics on the validation split (displayed as the cell output)
trainer.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/600 [00:00<?, ?it/s]

{'loss': 2.5962, 'grad_norm': 9.0806245803833, 'learning_rate': 1.0000000000000002e-06, 'epoch': 5.0}
{'loss': 2.5165, 'grad_norm': 6.362722873687744, 'learning_rate': 2.0000000000000003e-06, 'epoch': 10.0}
{'loss': 2.4888, 'grad_norm': 6.0819902420043945, 'learning_rate': 3e-06, 'epoch': 15.0}
{'loss': 2.4094, 'grad_norm': 6.955924034118652, 'learning_rate': 4.000000000000001e-06, 'epoch': 20.0}
{'loss': 2.2769, 'grad_norm': 17.234756469726562, 'learning_rate': 5e-06, 'epoch': 25.0}
{'loss': 2.1932, 'grad_norm': 8.1810884475708, 'learning_rate': 6e-06, 'epoch': 30.0}
{'loss': 2.1177, 'grad_norm': 12.641152381896973, 'learning_rate': 7.000000000000001e-06, 'epoch': 35.0}
{'loss': 2.021, 'grad_norm': 7.106116771697998, 'learning_rate': 8.000000000000001e-06, 'epoch': 40.0}
{'loss': 1.9045, 'grad_norm': 7.161906719207764, 'learning_rate': 9e-06, 'epoch': 45.0}
{'loss': 1.7754, 'grad_norm': 7.734902381896973, 'learning_rate': 1e-05, 'epoch': 50.0}
{'loss': 1.6299, 'grad_norm': 8.316179275

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5934091806411743,
 'eval_runtime': 0.4953,
 'eval_samples_per_second': 24.229,
 'eval_steps_per_second': 2.019,
 'epoch': 300.0}

In [101]:
# Save the fine-tuned weights together with the tokenizer files, so that the
# "./model" directory can be reloaded on its own for inference.
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")


In [103]:
file_path = "./data/test_shuffle.txt"

# Explicit encoding so the read does not depend on the platform default.
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# One sentence per line. A trailing newline produces one empty final element;
# drop it only when it is actually there, so a file without a trailing
# newline keeps its last sentence.
sentences = text.split("\n")
if sentences[-1] == "":
    sentences.pop()

# Reverse lookup from class id to label name, built once.
id_to_label = {label_id: label_name for label_name, label_id in numerize_labels.items()}

# Inference only: switch off dropout and gradient tracking, and run the inputs
# on whatever device the model currently lives on.
model.eval()
records = []
with torch.no_grad():
    for i, sentence in enumerate(sentences):
        inputs = tokenizer(sentence, truncation=True, return_tensors="pt").to(model.device)
        outputs = model(**inputs)
        # Logits hold one row per input sentence; take the arg-max over the class axis.
        predicted_label = outputs.logits.argmax(dim=-1).item()
        records.append({'ID': i, 'Label': id_to_label[predicted_label]})

# Build the frame once from the collected rows instead of concatenating per row.
result = pd.DataFrame(records, columns=['ID', 'Label'])
result.to_csv("result3.csv", index=False)