In [None]:
import os
import requests
import polars as pl
from datasets import load_dataset
import json

# Directory where every parquet file of this notebook is stored.
# Set the GUTENBERG_DATA_DIR environment variable to relocate it; when the
# variable is unset the original local path is used, so existing setups keep working.
data_dir = os.environ.get(
    "GUTENBERG_DATA_DIR",
    "/Users/tommasofurlanello/Documents/Dev/MarketInference/data",
)
# Create the directory up front so later writes cannot fail on a missing folder.
os.makedirs(data_dir, exist_ok=True)

def _download_file(url, output_path, timeout=60.0, chunk_size=1024 * 1024):
    """
    Stream the content at `url` into `output_path`.

    The body is read in chunks instead of being buffered in memory, and it is
    first written to a sibling ".part" file that is renamed to `output_path`
    only after the transfer completes. A failed download therefore never
    leaves a truncated file under the final name.

    Args:
        url: Address of the file to fetch.
        output_path: Final location of the downloaded file.
        timeout: Seconds passed to `requests.get` as its timeout.
        chunk_size: Number of bytes requested per chunk while streaming.

    Raises:
        requests.RequestException: On connection errors, timeouts or HTTP error statuses.
        OSError: If the file cannot be written or renamed.
    """
    partial_path = f"{output_path}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        os.replace(partial_path, output_path)
    finally:
        # The partial file only still exists here if the download failed part-way.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def download_gutenberg_dataset(timeout=60.0):
    """
    Download the Gutenberg English dataset from Hugging Face and save to /data folder

    Two strategies are tried in order:
      1. `datasets.load_dataset`, saving every split it returns as
         `gutenberg_<split>.parquet` inside `data_dir`.
      2. If that raises, a direct HTTP download of the train/test/validation
         parquet files into the same file names.

    Args:
        timeout: Seconds passed to `requests.get` for each request of the
            fallback download, so a stalled connection cannot hang the notebook.

    Returns:
        True if either strategy completed, False if both failed. Errors are
        printed rather than raised.
    """
    print("Downloading Gutenberg English dataset...")

    # Method 1: Using the datasets library
    try:
        dataset = load_dataset("sedthh/gutenberg_english")
        # Save each split to parquet files for faster loading
        for split in dataset:
            output_path = os.path.join(data_dir, f"gutenberg_{split}.parquet")
            print(f"Saving {split} split to {output_path}")
            dataset[split].to_parquet(output_path)
        return True
    except Exception as e:
        # Best effort: report the problem and fall through to the manual download.
        print(f"Error using datasets library: {e}")
        print("Trying alternative download method...")

    # Method 2: Manual download if datasets library fails
    try:
        # NOTE(review): assumes the repository exposes exactly one
        # "<split>-00000-of-00001.parquet" file per split at its root — confirm
        # against the dataset repository.
        for split in ("train", "test", "validation"):
            split_url = f"https://huggingface.co/datasets/sedthh/gutenberg_english/resolve/main/{split}-00000-of-00001.parquet"
            output_path = os.path.join(data_dir, f"gutenberg_{split}.parquet")

            print(f"Downloading {split} split from {split_url}")
            _download_file(split_url, output_path, timeout=timeout)
            print(f"Saved to {output_path}")

        return True
    except Exception as e:
        print(f"Error in alternative download method: {e}")
        return False

def load_as_polars():
    """
    Read the saved Gutenberg parquet splits and stack them into one Polars DataFrame.

    For each of the train/test/validation splits, the file
    `gutenberg_<split>.parquet` in `data_dir` is read if it exists and tagged
    with a literal "split" column naming its origin; missing files only produce
    a printed warning.

    Returns:
        The concatenation of all splits that were found.

    Raises:
        FileNotFoundError: If none of the split files exist.
    """
    frames = []

    for split_name in ("train", "test", "validation"):
        parquet_path = os.path.join(data_dir, f"gutenberg_{split_name}.parquet")

        # Guard clause: a missing split is reported and skipped.
        if not os.path.exists(parquet_path):
            print(f"Warning: {parquet_path} not found")
            continue

        print(f"Loading {parquet_path}...")
        split_frame = pl.read_parquet(parquet_path)
        # Record which split each row came from.
        frames.append(split_frame.with_columns(pl.lit(split_name).alias("split")))

    if len(frames) == 0:
        raise FileNotFoundError(f"No dataset files found in {data_dir} directory")

    # Stack every split that was found into a single frame.
    merged = pl.concat(frames)
    print(f"Combined DataFrame shape: {merged.shape}")
    print(f"Combined DataFrame schema:\n{merged.schema}")

    return merged

In [None]:
# Fetch the dataset into data_dir. The value displayed for this cell is the
# function's success flag: True if a download method completed, False otherwise.
download_gutenberg_dataset()

In [None]:
# Read the saved parquet splits into a single Polars DataFrame; each row carries
# a "split" column naming the file it came from.
df = load_as_polars()

In [14]:
# Parse the JSON text held in METADATA into a struct, then expand that struct's
# fields into ordinary top-level columns.
unnested_df = (
    df
    .with_columns(pl.col("METADATA").str.json_decode())
    .unnest("METADATA")
)

In [None]:
# Spot-check the "subjects" value of a single row to see how subjects are written.
# NOTE(review): index 52 looks like an arbitrary sample row — confirm it is not special.
unnested_df["subjects"][52]

In [20]:
# Keep the rows whose subjects mention fiction. str.contains treats the pattern
# as a regex and is case-sensitive by default, so the inline (?i) flag is used to
# match capitalised spellings ("Fiction") as well as lowercase ones.
# NOTE(review): this also matches words that merely contain the substring
# (e.g. "nonfiction") — tighten the pattern if that matters.
novels = unnested_df.filter(pl.col("subjects").str.contains("(?i)fiction"))

In [30]:
# Persist the filtered subset inside data_dir, alongside the downloaded splits.
# The path is built from data_dir instead of repeating the absolute location,
# so the notebook has a single place that decides where files live.
novels.write_parquet(os.path.join(data_dir, "gutenberg_en_novels.parquet"))

In [None]:
# Preview an excerpt (characters 114 up to 2500) of the first row's TEXT.
# NOTE(review): the 114 offset presumably skips leading boilerplate — confirm.
novels["TEXT"][0][114:2500]