In [2]:
"""
Install the dependencies.
"""

!pip install polars

Collecting polars
  Downloading polars-1.7.0-cp38-abi3-macosx_11_0_arm64.whl.metadata (14 kB)
Downloading polars-1.7.0-cp38-abi3-macosx_11_0_arm64.whl (27.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.2/27.2 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: polars
Successfully installed polars-1.7.0


In [1]:
"""
Read the JSON Lines file and convert it to a CSV with `title` and `content` columns.

Each non-blank input line is parsed as one JSON object. A missing 'title' or
'content' key is written as an empty field.
"""

import json
import csv

input_file = 'datasets/trn.json'
output_file = 'datasets/trn_output.csv'

# Both files are opened as UTF-8 explicitly: without `encoding=`, open() falls
# back to the platform's locale encoding, which can mis-decode or reject
# non-ASCII text on systems whose default is not UTF-8.
with open(input_file, 'r', encoding='utf-8') as f_in, \
        open(output_file, 'w', newline='', encoding='utf-8') as f_out:
    writer = csv.writer(f_out)
    writer.writerow(['title', 'content'])

    for line in f_in:
        # Skip blank lines (e.g. a trailing empty line); json.loads would raise on them.
        if not line.strip():
            continue
        json_object = json.loads(line)
        title = json_object.get('title', '')
        content = json_object.get('content', '')
        writer.writerow([title, content])


In [2]:
"""
Load the converted CSV into a polars DataFrame and display it.
"""

import polars as pl

df = pl.read_csv(source="datasets/trn_output.csv")
df

title,content
str,str
"""Girls Ballet Tutu Neon Pink""","""High quality 3 layer ballet tu…"
"""Adult Ballet Tutu Yellow""",
"""The Way Things Work: An Illust…",
"""Mog's Kittens""","""Judith Kerr&#8217;s best&#8211…"
"""Misty of Chincoteague""",
…,…
,"""Enhance your gaming experience…"
,"""Charge both of your PS4 contro…"
"""Apache Paracord Type III 7 Str…",
"""Cont Removable Paper Label""","""Continuous Length Removable Pa…"


In [3]:
"""
Keep only the rows whose title and content are both present, i.e. neither
null nor an empty string, and write the surviving rows to a new CSV.
"""

# Predicates passed as separate positional arguments to filter() are AND-ed together.
df_filtered = df.filter(
    pl.col('title').is_not_null(),
    pl.col('content').is_not_null(),
    pl.col('title') != '',
    pl.col('content') != '',
)

df_filtered.write_csv('datasets/trn_output_clean.csv')

In [7]:
"""
Report the row counts before and after cleaning.
"""

# A single print call; sep="\n" puts each size on its own line.
print(
    f"Size of the old dataset: {len(df)}",
    f"Size of the new dataset: {len(df_filtered)}",
    sep="\n",
)

Size of the old dataset: 2248619
Size of the new dataset: 1390403


In [5]:
"""
Optional step: also store the cleaned frame in Parquet format.
"""

df_filtered.write_parquet(file="datasets/trn_output_clean.parquet")