In [1]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.3/236.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl (25 kB)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_tokenizers-0.0.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (731 kB)
[2K  

In [1]:
import spacy
import pandas as pd

df = pd.read_csv('News_Category_Dataset_v3.csv')

df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
df.isna().sum()

link                     0
headline                 1
category                 0
short_description        0
authors              32955
date                     0
dtype: int64

In [4]:
df = df.fillna('')

In [5]:
nlp = spacy.load('en_core_web_trf')

In [6]:
import re
from tqdm import tqdm
from typing import Optional

def preprocess_dataset(df: pd.DataFrame, remove_stop_words: Optional[bool] = True) -> pd.DataFrame:
    """
    Function that merges headlines with descriptions
    and removes numbers, stop words and punctuation symbols in the merged data

    :param df: dataframe with news
    :param remove_stop_words: weather to remove stop words
    :return: same dataframe but with new descriptions
    """
    # change description data
    response = df.copy(deep=True)
    pairs = zip(df.headline.values, df.short_description.values)
    response.short_description = [
        re.sub('\s+', ' ', f'{headline}\n{description}').strip()
        for headline, description in pairs
    ]
    # list for the new descriptions
    processed_descriptions = []

    # ignore numbers and punctuation symbols
    if remove_stop_words:
        # also ignore stop words
        criterion = lambda token: not any((token.is_digit, token.is_stop, token.is_punct))
    else:
        criterion = lambda token: not any((token.is_digit, token.is_punct))

    # dataframe cleaning process
    for description in tqdm(df.short_description.values):
        doc = nlp(description)
        processed_tokens = [token.lemma_ for token in filter(criterion, doc)]
        processed_description = ' '.join(processed_tokens)

        processed_descriptions.append(processed_description)

    # replace old data
    response.short_description = processed_descriptions

    return response

We've already seen accuracy results for two fine-tuned models on different datasets. Now, we'll process the whole dataset without stop words. The dataset will be processed in chunks. Each chunk after processing will be saved to Google Drive. After all chunks are processed, we'll concatenate them and save the final dataset. Processing in chunks was selected because of the huge dataset size and the time required to process it. As you can see, I had a bad connection, and processing was interrupted, and then I had to start from the middle of the dataset.

In [7]:
import os.path as op
from google.colab import drive

drive.mount('/content/drive')

CHUNK_SIZE = 20_000
GDRIVE_PATH = '/content/drive/MyDrive'

for i in range(6, 10):
    chunk = preprocess_dataset(df[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE], False)
    filename = f'chunk{i + 1}.csv'

    chunk.to_csv(op.join(GDRIVE_PATH, filename), index=False)

100%|██████████| 20000/20000 [1:11:44<00:00,  4.65it/s]
100%|██████████| 20000/20000 [1:08:52<00:00,  4.84it/s]
100%|██████████| 20000/20000 [1:21:54<00:00,  4.07it/s]
100%|██████████| 9815/9815 [35:49<00:00,  4.57it/s]
