In [None]:
import os
import pandas as pd
import json
import duckdb

In [None]:
# Root of the input tree that is scanned for Parquet files
main_dir = "v1.0"
# Root under which the per-file statistics JSON files are written
output_dir = "v1.0-stat-2"

In [None]:
# Create the statistics output root up front; nothing happens if it already exists
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
import duckdb
import os
import json

def process_parquet(file_path):
    """Compute summary statistics for one Parquet file and save them as JSON.

    A single aggregate query is run over the file through a short-lived
    in-memory DuckDB connection. The query reads the columns ``content``,
    ``content-length``, ``num-sents`` and ``categories`` and produces:

    * the number of records,
    * the sums of ``content-length`` and ``num-sents``,
    * the summed word count, where a record's word count is the number of
      pieces obtained by splitting ``content`` on runs of whitespace,
    * the number of records whose ``categories`` contain ``'religious'`` or
      ``'associations_religieuses'``, and the number containing
      ``'wikipedia'``.

    The statistics are written to
    ``<output_dir>/<file_path relative to main_dir>.json``; ``main_dir`` and
    ``output_dir`` are module-level settings. Every statistic is stored as a
    string (an SQL NULL sum therefore becomes the string ``'None'``).

    Args:
        file_path: Path of the Parquet file to summarise.

    Returns:
        None. The statistics are only written to disk.
    """
    # All statistics are computed in one scan of the file. The path is bound as
    # a query parameter so that quotes in a file name cannot break the SQL.
    # NOTE(review): splitting on '\s+' counts an empty piece for leading or
    # trailing whitespace and counts 1 for empty content -- confirm that this
    # approximation of "words" is intended.
    query = """
    SELECT
        COUNT(*) AS num_records,
        SUM("content-length") AS total_content_length_sum,
        SUM("num-sents") AS num_sents_sum,
        SUM(array_length(str_split_regex(content, '\\s+'), 1)) AS num_words_sum,
        SUM(CASE WHEN 'religious' = ANY(categories) OR 'associations_religieuses' = ANY(categories) THEN 1 ELSE 0 END) AS count_religious,
        SUM(CASE WHEN 'wikipedia' = ANY(categories) THEN 1 ELSE 0 END) AS count_wikipedia
    FROM read_parquet(?)
    """

    # Connect to an in-memory DuckDB instance and always release it again, so
    # that processing many files does not accumulate open databases.
    con = duckdb.connect()
    try:
        stats = con.execute(query, [file_path]).fetchone()
    finally:
        con.close()

    # Create the result dictionary; values are stringified to keep the
    # established JSON layout.
    result_dict = {
        'file_path': file_path,
        'num_records': str(stats[0]),
        'total_content_length_sum': str(stats[1]),
        'num_sents_sum': str(stats[2]),
        'num_words_sum': str(stats[3]),
        'religious_num_records': str(stats[4]),
        'wikipedia_num_records': str(stats[5])
    }

    # Mirror the file's location below main_dir inside output_dir
    relative_path = os.path.relpath(file_path, main_dir)
    output_path = os.path.join(output_dir, relative_path + '.json')

    # Ensure the output directory exists
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save the result to a JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result_dict, f, indent=4)

    return None


In [None]:
# Gather every *.parquet file found anywhere below main_dir, ordered from the
# smallest file to the largest
parquet_files = sorted(
    (
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(main_dir)
        for name in names
        if name.endswith('.parquet')
    ),
    key=os.path.getsize,
)

In [None]:
from tqdm import tqdm

# Write the statistics JSON for each collected file, with a progress bar
for parquet_path in tqdm(parquet_files):
    process_parquet(parquet_path)



In [None]:
# Shell command: recursively pack the contents of v1.0-stat-2/ into v1.0-stat-2.zip.
# The directory name is hard-coded here; it matches the value assigned to output_dir.
! zip -r v1.0-stat-2.zip v1.0-stat-2/*