
# All The News v2.0 📰

[All The News (ATN) 2.0](https://components.one/datasets/all-the-news-2-news-articles-dataset) is a large dataset of roughly 2.7M news articles from a number of publications. It grew out of [ATN v1.0](https://www.kaggle.com/datasets/snapcrack/all-the-news/data), a famous Kaggle challenge for sentiment analysis and topic modeling.


* The file is available as a Dropbox link; with the query parameter set to `?dl=1`, we can download the dataset programmatically.
* The input file type is `CSV`, and the output is saved as a `Delta Lake` table.
* Since the input CSV is quite large, it is read in chunks and stored as partitions.

#### Notebook Properties
* Upstream Notebook: N/A
* Compute Resources: `16 GB RAM, 2 CPUs`
* Last Updated: `Nov 21 2023`

In [0]:
import warnings

warnings.filterwarnings("ignore")

import os
import json
import contextlib
import pandas as pd
import pyarrow as pa
import numpy as np
from tqdm.autonotebook import tqdm

from src.utils.io import FileSystemHandler
from src.utils.schemas import all_the_news_raw_schema

from deltalake import DeltaTable
from deltalake.exceptions import TableNotFoundError

In [0]:
# Show every column when a DataFrame is displayed in the notebook.
pd.set_option("display.max_columns", None)

# --- Source -----------------------------------------------------------------
SITE_URL: str = "https://components.one/datasets/all-the-news-2-news-articles-dataset"
DOWNLOAD_URL: str = "https://www.dropbox.com/s/cn2utnr5ipathhh/all-the-news-2-1.zip?dl=1"

# --- Ingestion sizing -------------------------------------------------------
# Number of articles read from the CSV (and written to delta) per chunk.
read_chunk_size: int = 50_000
# Estimated total number of articles in ATN v2; used for progress reporting.
dataset_size: int = 2_700_000

# --- Destination ------------------------------------------------------------
# Catalog / schema name under which the table is stored.
CATALOG_NAME: str = "raw"
# Name of the delta table that receives the output dataset.
TABLE_NAME: str = "all_the_news"
# File system handler used to save delta tables (configured for s3 here).
datafs = FileSystemHandler("s3")


Remove the table if it already exists

In [0]:
# Start from a clean slate. A table that does not exist yet is not an error,
# so that specific failure is ignored.
try:
    print(datafs.clear_delta(table=TABLE_NAME, catalog_name=CATALOG_NAME))
except TableNotFoundError:
    pass


## Data Import

In [0]:
# Open a lazy, chunked reader over the zipped CSV at DOWNLOAD_URL; rows are
# only parsed when the reader is iterated.
chunks: pd.io.parsers.readers.TextFileReader = pd.read_csv(
    DOWNLOAD_URL,
    # how the payload is packaged
    compression="zip",
    encoding="utf-8",
    # CSV dialect
    sep=",",
    quotechar='"',
    # yield DataFrames of `read_chunk_size` rows instead of one big frame
    chunksize=read_chunk_size,
)

In [0]:
def preprocess_chunk(chunk_df: pd.DataFrame) -> pd.DataFrame:
    """Normalize null markers and parse dates in one chunk read from the CSV.

    String cells that spell ``"nan"`` or ``"none"`` (in any letter case) are
    turned into missing values, missing values are then replaced with
    ``None``, and finally the ``date`` column is converted with
    ``pd.to_datetime``.

    The input frame is left untouched; all cleaning is applied to a copy.

    Args:
        chunk_df: One chunk of the raw CSV. Must contain a ``date`` column.

    Returns:
        A new DataFrame with the same columns as ``chunk_df``.

    Raises:
        KeyError: If ``chunk_df`` has no ``date`` column.
    """
    null_tokens = {"nan", "none"}

    def _null_token_to_none(cell):
        """Map a textual null marker to ``None``; pass anything else through."""
        if isinstance(cell, str) and cell.lower() in null_tokens:
            return None
        return cell

    # Work on a copy so the caller's DataFrame is never modified in place.
    cleaned = chunk_df.copy()

    for col in cleaned.columns:
        column = cleaned[col]
        # Numeric columns cannot contain string markers, so skip the
        # per-cell Python call for them.
        if pd.api.types.is_numeric_dtype(column):
            continue
        # Rows dropped by dropna() come back as missing values when the
        # result is aligned to the frame's index on assignment.
        cleaned[col] = column.dropna().apply(_null_token_to_none)

    # List arguments are kept on purpose: they state the NaN -> None mapping
    # explicitly.
    cleaned = cleaned.replace([np.nan], [None])
    cleaned["date"] = pd.to_datetime(cleaned["date"])

    return cleaned

In [0]:
# Round the expected chunk count up: a final, partially filled chunk still
# counts as one progress-bar step, which floor division would drop whenever
# the dataset size is not an exact multiple of the chunk size.
expected_chunks: int = (dataset_size + read_chunk_size - 1) // read_chunk_size

for chunk_df in tqdm(chunks, total=expected_chunks):
    # Clean the chunk, then append it to the delta table.
    chunk_df = preprocess_chunk(chunk_df)
    datafs.write_delta(
        dataframe=chunk_df,
        table=TABLE_NAME,
        catalog_name=CATALOG_NAME,
        schema=all_the_news_raw_schema,
        mode="append",
    )


## Test Ingested Data

In [0]:
# Re-open the delta table from its storage location for the checks below.
atn_table_uri: str = f"{datafs.CATALOG}/{CATALOG_NAME}/{TABLE_NAME}.delta"
atn_delta_table: DeltaTable = DeltaTable(atn_table_uri)

In [0]:
# Render the table schema's "fields" entries as a DataFrame for inspection.
schema_fields = json.loads(atn_delta_table.schema().to_json())["fields"]
pd.DataFrame(schema_fields)

In [0]:
# Spot-check the ingest by loading only the last data file the table lists.
atn_partitions: list[str] = atn_delta_table.file_uris()
last_partition_uri: str = atn_partitions[-1]

atn_df: pd.DataFrame = pd.read_parquet(last_partition_uri)
print(atn_df.shape)
atn_df.head()

In [0]:
# Print a summary (columns, dtypes, non-null counts) of the sampled partition.
atn_df.info()