In [None]:
import torch
from tqdm.notebook import tqdm
import os
from transformers import BertTokenizer, DistilBertTokenizer
from torch.utils.data import TensorDataset
import numpy as np
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive
# Mount Google Drive and switch into the project folder so that the relative
# paths used later (the JSON dataset, the saved checkpoints) resolve there.
# NOTE(review): relies on google.colab and a hard-coded Drive path, so this
# only works inside a Colab runtime with that folder present.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Project_News_Classification')

Mounted at /content/drive


In [None]:
# Dataset file, resolved relative to the current working directory
json_path = 'nytimes.json'
# Parse the JSON file into a DataFrame and preview the first rows
df = pd.read_json(json_path)
df.head()

Unnamed: 0,section,headline,article_url,article,abstract,article_id,image,caption,image_id
0,Health,F.D.A. Plans to Ban Most E-Cigarette Flavors b...,https://www.nytimes.com/2019/12/31/health/e-ci...,The Trump administration is expected to announ...,The tobacco and vaping industries and conserva...,42d25485-0e48-50bf-8d16-948833b2a55d,https://static01.nyt.com/images/2019/11/06/sci...,A new study by the National Institute on Drug ...,42d25485-0e48-50bf-8d16-948833b2a55d
1,Science,Meteor Showers in 2020 That Will Light Up Nigh...,https://www.nytimes.com/2020/01/01/science/met...,All year long as Earth revolves around the sun...,"All year long, Earth passes through streams of...",04bc90f0-b20b-511c-b5bb-3ce13194163f,https://static01.nyt.com/images/2020/01/01/sci...,"Perseid meteors named as ""Orinoid"" streak acro...",04bc90f0-b20b-511c-b5bb-3ce13194163f
2,Science,"Rocket Launches, Trips to Mars and More 2020 S...",https://www.nytimes.com/2020/01/01/science/spa...,"If you follow space news and astronomy, the pa...",A year full of highs and lows in space just en...,bd8647b3-8ec6-50aa-95cf-2b81ed12d2dd,https://static01.nyt.com/images/2020/01/01/sci...,Spectators viewing the launch of a Soyuz rocke...,bd8647b3-8ec6-50aa-95cf-2b81ed12d2dd
3,Television,What's on TV Wednesday: A Linda Ronstadt Doc a...,https://www.nytimes.com/2020/01/01/arts/televi...,LINDA RONSTADT: THE SOUND OF MY VOICE (2019) 9...,"""Linda Ronstadt: The Sound of My Voice"" airs o...",e6c25b53-0416-5795-b0cf-e1243924dc79,https://static01.nyt.com/images/2020/01/01/art...,"Linda Ronstadt in ""Linda Ronstadt: The Sound o...",e6c25b53-0416-5795-b0cf-e1243924dc79
4,Travel,New Cruise Ships to Set Sail for Antarctica,https://www.nytimes.com/2020/01/01/travel/anta...,As the number of travelers cruising Antarctica...,Interested in the southernmost continent? Here...,98c3d182-95ce-5244-9b9e-008a3dee7354,https://static01.nyt.com/images/2020/01/05/tra...,"Antarctica21&rsquo;s expedition ship, Ocean No...",98c3d182-95ce-5244-9b9e-008a3dee7354


In [None]:
# Keep only the target ('section') and the text ('article'); everything else
# is metadata that the classifier does not use.
df = df.drop(columns=[
    'headline', 'abstract', 'article_url', 'article_id',
    'image', 'caption', 'image_id',
])
df.head()

Unnamed: 0,section,article
0,Health,The Trump administration is expected to announ...
1,Science,All year long as Earth revolves around the sun...
2,Science,"If you follow space news and astronomy, the pa..."
3,Television,LINDA RONSTADT: THE SOUND OF MY VOICE (2019) 9...
4,Travel,As the number of travelers cruising Antarctica...


In [None]:
# Class balance check: number of articles per section
df['section'].value_counts()

section
Health             3001
Science            3001
Television         3001
Travel             3001
Movies             3001
Dance              3001
Real Estate        3001
Sports             3001
Technology         3001
Theater            3001
Opinion            3001
Music              3001
Books              3001
Art & Design       3001
Media              3001
Food               3001
Fashion & Style    3001
Style              2681
Automobiles        1825
Economy            1761
Your Money         1263
Global Business    1182
Education           825
Well                681
Name: count, dtype: int64

In [None]:
# Give every section name an integer class id (in the order `unique()` returns
# them) and keep the reverse lookup for turning predicted ids back into names.
possible_labels = df.section.unique()
label_dict = {section: class_id for class_id, section in enumerate(possible_labels)}
id2label_dict = {class_id: section for section, class_id in label_dict.items()}
print(label_dict)

{'Health': 0, 'Science': 1, 'Television': 2, 'Travel': 3, 'Movies': 4, 'Dance': 5, 'Real Estate': 6, 'Economy': 7, 'Sports': 8, 'Theater': 9, 'Opinion': 10, 'Music': 11, 'Books': 12, 'Art & Design': 13, 'Style': 14, 'Media': 15, 'Food': 16, 'Well': 17, 'Fashion & Style': 18, 'Technology': 19, 'Your Money': 20, 'Education': 21, 'Automobiles': 22, 'Global Business': 23}


In [None]:
# Show the reverse mapping (class id -> section name)
print(id2label_dict)

{0: 'Health', 1: 'Science', 2: 'Television', 3: 'Travel', 4: 'Movies', 5: 'Dance', 6: 'Real Estate', 7: 'Economy', 8: 'Sports', 9: 'Theater', 10: 'Opinion', 11: 'Music', 12: 'Books', 13: 'Art & Design', 14: 'Style', 15: 'Media', 16: 'Food', 17: 'Well', 18: 'Fashion & Style', 19: 'Technology', 20: 'Your Money', 21: 'Education', 22: 'Automobiles', 23: 'Global Business'}


In [None]:
# Encode each section name as its integer class id.
# `map` is a direct per-row dictionary lookup and yields an integer column,
# whereas `replace` with a str->int dict depends on pandas silently downcasting
# the object column afterwards (deprecated in recent pandas releases).
# Every section is a key of label_dict (it was built from df.section.unique()),
# so no row is left unmapped.
df['label'] = df.section.map(label_dict)

In [None]:
# Split row indices (not the texts themselves) 80/20, stratified on the class
# id so each section keeps the same proportion in both splits.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.2,
    random_state=2024,
    stratify=df.label.values,
)

In [None]:
# Tag every row with the split it belongs to; 'not_set' is the placeholder
# that the two assignments below overwrite for the selected row indices.
df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [None]:
# Sanity check: rows per (section, class id, split) combination
df.groupby(['section', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,article
section,label,data_type,Unnamed: 3_level_1
Art & Design,13,train,2401
Art & Design,13,val,600
Automobiles,22,train,1460
Automobiles,22,val,365
Books,12,train,2400
Books,12,val,601
Dance,5,train,2401
Dance,5,val,600
Economy,7,train,1409
Economy,7,val,352


In [None]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint (the same checkpoint
# name the model is loaded from further down).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
def _encode_articles(texts):
    """Tokenize a list of article strings into padded PyTorch tensors.

    Articles longer than 512 tokens are truncated and shorter ones are padded
    to the longest sequence in `texts`. The returned encoding provides
    'input_ids' and 'attention_mask'.
    """
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        truncation=True,
        padding=True,
        return_tensors='pt',
        max_length=512,
    )


# Filter each split once so texts and labels are taken from the same rows.
train_rows = df[df.data_type == 'train']
val_rows = df[df.data_type == 'val']

encoded_data_train = _encode_articles(train_rows.article.tolist())
encoded_data_val = _encode_articles(val_rows.article.tolist())

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

In [None]:
# Each dataset item is the triple (input_ids, attention_mask, label);
# the training and evaluation loops rely on that positional order.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
# Display the number of samples in each split
len(dataset_train), len(dataset_val)

(48988, 12247)

In [None]:
# Pretrained DistilBERT with a classification head sized to the number of
# sections; attentions and hidden states are not returned.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

# Training batches are sampled in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size,
)

# Validation batches are read sequentially.
dataloader_validation = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size,
)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated and no
# longer importable in newer transformers releases.
# weight_decay=0.0 reproduces the default of the transformers implementation
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)



In [None]:
epochs = 5

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps, with no warmup phase.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the support-weighted F1 score of the arg-max predictions.

    `preds` holds one row of class scores per sample (the predicted class is
    the arg-max along axis 1); `labels` holds the true class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Return the overall accuracy of the arg-max predictions.

    Despite its name, this function does not break accuracy down by class: it
    returns the single fraction of samples whose arg-max prediction equals the
    true label.

    Args:
        preds: 2-D array of class scores, one row per sample.
        labels: array of true class ids, one per sample.

    Returns:
        Number of correct predictions divided by the number of samples.
    """
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    correct = np.sum(preds_flat == labels_flat)
    total_samples = len(labels_flat)
    return correct / total_samples

In [None]:
import random

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with one value.
# NOTE(review): the model was already instantiated in an earlier cell, and its
# classifier weights are reported there as newly initialised, so this seed does
# not cover that initialisation. Move the seeding above the model creation if
# run-to-run reproducibility is required.
seed_val = 2024
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU,
# and move the model's parameters to that device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [None]:
def evaluate(dataloader_val):
    """Run the global `model` over a dataloader without gradient tracking.

    The model is switched to eval mode and is left in that mode on return; the
    training loop switches it back with `model.train()`.

    Args:
        dataloader_val: iterable of (input_ids, attention_mask, labels)
            tensor batches.

    Returns:
        A tuple `(loss_val_avg, predictions, true_vals)`: the loss averaged
        over batches, the logits of all samples stacked along axis 0, and the
        matching true labels, the latter two as NumPy arrays.
    """
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        # Leftover debug prints of every batch tensor were removed here: they
        # emitted several lines per batch and buried the notebook output.
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        # With labels supplied, the output starts with (loss, logits).
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        predictions.append(logits.detach().cpu().numpy())
        true_vals.append(inputs['labels'].cpu().numpy())

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
# Fine-tuning loop: one optimisation pass over the training set per epoch,
# a checkpoint after each epoch, then training and validation metrics.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0
    predictions, true_vals = [], []

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        # `batch` is the (input_ids, attention_mask, labels) triple.
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        batch_loss = loss.item()
        loss_train_total += batch_loss

        # Keep logits and labels on the CPU for the end-of-epoch metrics.
        predictions.append(logits.detach().cpu().numpy())
        true_vals.append(inputs['labels'].cpu().numpy())

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. It used to be divided by len(batch), but
        # `batch` is the 3-tensor tuple, so that divided by 3 regardless of
        # how many samples the batch held.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    torch.save(model.state_dict(), f'finetuned_distillBERT_epoch_{epoch}.model')
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    tqdm.write(f'\nEpoch {epoch}')
    train_accuracy = accuracy_per_class(predictions, true_vals)
    train_f1 = f1_score_func(predictions, true_vals)
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Training F1 Score (Weighted): {train_f1}')
    tqdm.write(f'Training Accuracy (Weighted): {train_accuracy}')

    # `predictions` / `true_vals` are rebound to the validation results here.
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    val_accuracy = accuracy_per_class(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Validation Accuracy (Weighted): {val_accuracy}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/16330 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.8149544784651097
Training F1 Score (Weighted): 0.7991080765648321
Training Accuracy (Weighted): 0.806074957132359
Validation loss: 0.6700232545356084
F1 Score (Weighted): 0.8486209239724819
Validation Accuracy (Weighted): 0.8539234098146485


Epoch 2:   0%|          | 0/16330 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.5008563618971835
Training F1 Score (Weighted): 0.8865494117979482
Training Accuracy (Weighted): 0.8881971094962031
Validation loss: 0.598862219476113
F1 Score (Weighted): 0.8754451394356171
Validation Accuracy (Weighted): 0.8765411937617376


Epoch 3:   0%|          | 0/16330 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.3428981783825658
Training F1 Score (Weighted): 0.9260750214825363
Training Accuracy (Weighted): 0.9268596390952887
Validation loss: 0.6754292044720793
F1 Score (Weighted): 0.8775219315750202
Validation Accuracy (Weighted): 0.8798072997468768


Epoch 4:   0%|          | 0/16330 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.2270286235429227
Training F1 Score (Weighted): 0.9530353410079625
Training Accuracy (Weighted): 0.9534171633869519
Validation loss: 0.7277293635248893
F1 Score (Weighted): 0.8794374301406924
Validation Accuracy (Weighted): 0.8805421735935332


Epoch 5:   0%|          | 0/16330 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.1542352870738841
Training F1 Score (Weighted): 0.9688222499851707
Training Accuracy (Weighted): 0.9689924063035845
Validation loss: 0.752952288009803
F1 Score (Weighted): 0.88204699517357
Validation Accuracy (Weighted): 0.8825834898342452


In [None]:
# Rebuild the model with the same head size so a saved state dict can be
# loaded into it; the final expression also displays the module structure.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Restore the weights saved by the training loop after epoch 5. The file is
# deserialised onto the CPU before being loaded into the model.
model.load_state_dict(torch.load('finetuned_distillBERT_epoch_5.model', map_location=torch.device('cpu')))

<All keys matched successfully>

In [None]:
# Score the validation set with the restored weights; the average loss is
# discarded and only the logits and true labels are kept.
_, predictions, true_vals = evaluate(dataloader_validation)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [  101,  5291,  2962,  ...,     0,     0,     0],
        [  101,  2348,  3994,  ...,     0,     0,     0]], device='cuda:0')
512
tensor([[ 101, 1037, 2537,  ..., 2062, 2084,  102],
        [ 101, 2188, 2005,  ...,    0,    0,    0],
        [ 101, 2073, 1005,  ..., 1010, 2029,  102]], device='cuda:0')
512
tensor([[  101,  3520,  9033,  ...,     0,     0,     0],
        [  101,  2009,  1005,  ...,     0,     0,     0],
        [  101,  4901, 14103,  ...,     0,     0,     0]], device='cuda:0')
512
tensor([[  101,  1045,  1005,  ...,     0,     0,     0],
        [  101,  3730,  1011,  ...,     0,     0,     0],
        [  101,  1037,  2329,  ...,  1996, 14439,   102]], device='cuda:0')
512
tensor([[  101, 19066,  1011,  ...,     0,     0,     0],
        [  101,  2009,  1005,  ...,     0,     0,     0],
        [  101,  2167,  5922,  ...,  2021,  2720,   102]], device='cuda:0')
512
tensor([[  101, 11058,  2038,  

In [None]:
def accuracy_per_class2(preds, labels):
    """Print the hit count of every class and return the overall accuracy.

    For each class id present in `labels`, prints the section name and
    "<correct>/<samples of that class>". The predicted class of a sample is
    the arg-max of its row in `preds`.
    """
    id_to_section = {class_id: section for section, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()
    overall_accuracy = np.sum(predicted == actual) / len(actual)

    for class_id in np.unique(actual):
        # Restrict to the samples whose true label is this class.
        in_class = actual == class_id
        class_preds = predicted[in_class]
        class_truth = actual[in_class]
        print(f'Class: {id_to_section[class_id]}')
        print(f'Accuracy: {len(class_preds[class_preds==class_id])}/{len(class_truth)}\n')
    return overall_accuracy

# Print the per-class hit counts for the validation predictions; the returned
# overall accuracy is displayed as the cell result.
accuracy_per_class2(predictions, true_vals)

Class: Health
Accuracy: 522/601

Class: Science
Accuracy: 544/600

Class: Television
Accuracy: 556/600

Class: Travel
Accuracy: 540/600

Class: Movies
Accuracy: 554/600

Class: Dance
Accuracy: 586/600

Class: Real Estate
Accuracy: 556/600

Class: Economy
Accuracy: 290/352

Class: Sports
Accuracy: 585/600

Class: Theater
Accuracy: 559/600

Class: Opinion
Accuracy: 508/600

Class: Music
Accuracy: 554/600

Class: Books
Accuracy: 562/601

Class: Art & Design
Accuracy: 540/600

Class: Style
Accuracy: 275/536

Class: Media
Accuracy: 518/600

Class: Food
Accuracy: 568/600

Class: Well
Accuracy: 86/136

Class: Fashion & Style
Accuracy: 461/600

Class: Technology
Accuracy: 538/601

Class: Your Money
Accuracy: 208/253

Class: Education
Accuracy: 131/165

Class: Automobiles
Accuracy: 353/365

Class: Global Business
Accuracy: 215/237



0.8825834898342452

In [None]:
import numpy as np
from sklearn import metrics

def accuracy_per_class2(preds, labels):
    """Print per-class hit counts, recall, precision and F1; return overall accuracy.

    Precision, recall and F1 are computed one-vs-rest over ALL samples. They
    used to be computed on the subset of samples whose true label is the class,
    where no false positive can exist, so precision was degenerate and F1 was
    inflated.

    Args:
        preds: 2-D array of class scores, one row per sample.
        labels: array of true class ids, one per sample.

    Returns:
        Fraction of samples whose arg-max prediction equals the true label.
    """
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    total_samples = len(labels_flat)
    overall_accuracy = np.sum(preds_flat == labels_flat) / total_samples

    # Report only the classes that actually occur in the true labels, in
    # ascending id order.
    class_ids = np.unique(labels_flat)
    precisions, recalls, f1_scores, supports = metrics.precision_recall_fscore_support(
        labels_flat,
        preds_flat,
        labels=class_ids,
        average=None,
        zero_division=0,  # a class that is never predicted gets precision 0
    )

    for class_id, precision, recall, f1, support in zip(
            class_ids, precisions, recalls, f1_scores, supports):
        correct = int(np.sum((labels_flat == class_id) & (preds_flat == class_id)))
        print(f'Class: {label_dict_inverse[class_id]}')
        print(f'Accuracy: {correct}/{int(support)}')

        print(f'Recall: {recall}')
        print(f'Precision: {precision}')
        print(f'F1 Score: {f1}\n')

    return overall_accuracy

# Print hit counts plus recall / precision / F1 for every class in the
# validation labels; the returned overall accuracy is the cell result.
accuracy_per_class2(predictions, true_vals)

In [None]:
import numpy as np
from sklearn import metrics

def accuracy_per_class3(preds, labels):
    """Print overall accuracy and support-weighted recall, precision and F1.

    Despite its name, this function reports aggregate metrics only.

    Args:
        preds: 2-D array of class scores, one row per sample.
        labels: array of true class ids, one per sample.

    Returns:
        Fraction of samples whose arg-max prediction equals the true label.
    """
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    overall_accuracy = np.sum(preds_flat == labels_flat) / len(labels_flat)

    recall = metrics.recall_score(labels_flat, preds_flat, average='weighted')
    precision = metrics.precision_score(labels_flat, preds_flat, average='weighted')
    f1 = metrics.f1_score(labels_flat, preds_flat, average='weighted')

    print(f'accuracy: {overall_accuracy}')
    print(f'Recall: {recall}')
    print(f'Precision: {precision}')
    print(f'F1 Score: {f1}\n')
    return overall_accuracy

# Print the aggregate validation metrics; the returned overall accuracy is
# displayed as the cell result.
accuracy_per_class3(predictions, true_vals)

accuracy: 0.8825834898342452
Recall: 0.8825834898342452
Precision: 0.8819574656146366
F1 Score: 0.88204699517357



0.8825834898342452