<a href="https://colab.research.google.com/github/grfaith/AmericanStories/blob/main/Bbox_and_text_lookup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any

In [4]:
import csv
import pandas as pd
import os
from datasets import load_dataset
import json

# Placeholder functions

def synch_with_google_drive():
    """Mount the user's Google Drive at ``/content/drive``.

    The Colab helper is imported inside the function, so the module is only
    required when this function is actually called.  Only the mount happens
    here; reading files from the mounted drive is left to the caller.
    """
    from google.colab import drive as colab_drive

    colab_drive.mount('/content/drive')


In [5]:
def extract_pub_year_and_date(df):
    """Add ``pub_date`` and ``pub_year`` columns derived from ``article_ID``.

    Each ID is split on underscores; the second piece becomes ``pub_date``
    (e.g. '1916-07-31') and the first four characters of that piece become
    ``pub_year``.  IDs without a second piece yield missing values.

    The frame is modified in place and the same object is returned.
    """
    id_parts = df['article_ID'].str.split('_')

    # Second underscore-separated token holds the date.
    df['pub_date'] = id_parts.str.get(1)

    # Leading four characters of the date are the year.
    df['pub_year'] = df['pub_date'].str.slice(0, 4)

    return df

def sort_and_chunk_csv(df, chunk_size=50000):
    """Sort the frame by ``pub_year`` and split it into fixed-size chunks.

    Args:
        df: DataFrame with a ``pub_year`` column.  The column is converted to
            numeric in place; values that cannot be parsed become NaN and
            therefore sort to the end.
        chunk_size: Maximum number of rows per chunk; must be at least 1.

    Returns:
        list of DataFrames covering the sorted rows in order.  Chunks are not
        aligned to year boundaries, so one year may span two chunks.  Each
        chunk is an independent copy, so callers can add columns to it
        without touching the sorted frame or triggering pandas'
        SettingWithCopyWarning.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would make range() yield nothing and silently drop every row.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Ensure that pub_year is numeric so the sort is chronological, not lexical.
    df['pub_year'] = pd.to_numeric(df['pub_year'], errors='coerce')

    df_sorted = df.sort_values(by='pub_year').reset_index(drop=True)

    return [
        df_sorted.iloc[start:start + chunk_size].copy()
        for start in range(0, len(df_sorted), chunk_size)
    ]


In [6]:
def download_dataset(years):
    """Load the AmericanStories split for each requested year.

    Args:
        years: Iterable of year strings (e.g. ``['1912', '1913']``).  Each one
            is passed to ``load_dataset`` as a single-element ``year_list``
            and is also the key used to pick the split out of the result.

    Returns:
        dict mapping every successfully loaded year to its split, or ``None``
        when no year could be loaded (including when ``years`` is empty).
    """
    datasets_by_year = {}

    for year in years:
        try:
            # "./AmericanStories" is a relative path - presumably a local
            # checkout of the dataset repository; verify in the runtime.
            # NOTE(review): trust_remote_code=True executes the dataset's
            # loading script; only use it with a source you trust.
            dataset_year = load_dataset(
                "./AmericanStories",
                "subset_years",
                year_list=[year],
                trust_remote_code=True
            )
            datasets_by_year[year] = dataset_year[year]  # Access the year's data
        except Exception as e:
            # Best effort per year: a missing archive, network error or script
            # failure for one year must not abort the years that do load.
            # Previously only KeyError was handled, so any other failure
            # terminated the whole run.
            print(f"Failed to load dataset for year {year}: {type(e).__name__}: {e}")

    print(f"Available years in dataset: {datasets_by_year.keys()}")

    return datasets_by_year if datasets_by_year else None


In [7]:
def _output_needs_header(output_path):
    """Return True when ``output_path`` is missing or empty, i.e. no header has been written yet."""
    return not os.path.isfile(output_path) or os.path.getsize(output_path) == 0


def save_partial_results(chunk_results, output_path):
    """Append processed results to a CSV file.

    Args:
        chunk_results: Either a DataFrame or a list of dicts.  For a list, the
            keys of the first dict define the CSV columns.
        output_path: Destination file; it is opened in append mode.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls never interleave duplicate header lines with the
    data.  An empty list writes nothing.  Any other input type is reported on
    stdout and ignored.
    """
    if isinstance(chunk_results, pd.DataFrame):
        chunk_results.to_csv(
            output_path,
            mode='a',
            index=False,
            header=_output_needs_header(output_path),
        )

    elif isinstance(chunk_results, list) and all(isinstance(item, dict) for item in chunk_results):
        if not chunk_results:
            return  # No rows and no known columns: leave the file untouched.

        # Columns come from the first record; csv.DictWriter raises ValueError
        # if a later record carries a key that is not among them.
        fieldnames = list(chunk_results[0].keys())
        write_header = _output_needs_header(output_path)

        # UTF-8 matches the default encoding of DataFrame.to_csv above.
        with open(output_path, mode='a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if write_header:
                writer.writeheader()
            writer.writerows(chunk_results)

    else:
        print(f"Unrecognized data format for chunk_results: {type(chunk_results)}")


def resume_from_checkpoint(output_path):
    """Return the first field of the last CSV record in ``output_path``.

    The file is parsed as CSV rather than split on physical lines, because a
    quoted field (such as article text) may itself contain commas and line
    breaks; the last physical line is then only the tail of a record.

    Args:
        output_path: Path of a previously written output CSV.  The article ID
            is assumed to be the first column.

    Returns:
        The first field of the last non-empty record, or ``None`` when the
        file is missing, empty, or holds only blank lines.  A file containing
        just a header row returns the header's first field.
    """
    if not os.path.isfile(output_path) or os.path.getsize(output_path) == 0:
        return None  # No checkpoint available

    # Long article texts can exceed the csv module's default per-field limit;
    # raise it for this read only and restore the previous value afterwards.
    previous_limit = csv.field_size_limit(2**31 - 1)
    try:
        last_row = None
        # Stream the records instead of loading the whole file into memory.
        with open(output_path, 'r', newline='', encoding='utf-8') as f:
            for row in csv.reader(f):
                if row:
                    last_row = row
    finally:
        csv.field_size_limit(previous_limit)

    return last_row[0] if last_row else None

In [8]:
def generate_image_url(article_id, bounding_boxes):
    """
    Generates a URL to the image of the scanned article from the Library of Congress,
    using the article ID and bounding box coordinates.

    Example URL pattern:
    https://www.loc.gov/resource/sn83030313/1874-12-06/ed-1/?sp=11&clip=x1,y1,x2,y2

    Args:
        article_id: Underscore-separated ID whose 2nd, 3rd and 4th pieces are
            the date (YYYY-MM-DD), the page token (e.g. 'p11') and the
            newspaper ID.
        bounding_boxes: Flat list (or tuple) of coordinates, four per box.
            Only the first box is used.

    Returns:
        The page URL with a ``clip`` parameter when a usable box is supplied,
        the plain page URL when it is not (missing, empty, length not a
        multiple of four, or non-numeric), or ``None`` when ``article_id``
        has fewer than four pieces.
    """
    components = article_id.split("_")

    if len(components) < 4:
        print(f"Error: Invalid article_id format: {article_id}")
        return None  # Return None if article_id is not in the expected format

    date = components[1]  # Format: YYYY-MM-DD
    page_token = components[2]
    newspaper_id = components[3]

    # Fall back to page 1 when the token is empty, the literal 'None', or
    # nothing is left once the 'p' prefix is stripped.
    page_number = "1"
    if page_token and page_token != 'None':
        page_number = page_token.replace("p", "") or "1"

    base_url = f"https://www.loc.gov/resource/{newspaper_id}/{date}/ed-1/?sp={page_number}"

    # An empty sequence also satisfies len % 4 == 0, so emptiness is checked
    # explicitly; otherwise picking the first box below raises IndexError.
    has_boxes = (
        isinstance(bounding_boxes, (list, tuple))
        and len(bounding_boxes) > 0
        and len(bounding_boxes) % 4 == 0
    )
    if not has_boxes:
        print(f"Warning: No valid bounding box for article {article_id}.")
        return base_url  # If no bounding boxes, return the base page URL

    grouped_bounding_boxes = [bounding_boxes[i:i + 4] for i in range(0, len(bounding_boxes), 4)]

    if len(grouped_bounding_boxes) > 1:
        print(f"Warning: More than one bounding box found for article {article_id}. Bounding boxes: {grouped_bounding_boxes}. Using the first one.")

    bbox = grouped_bounding_boxes[0]

    if not all(isinstance(coord, (int, float)) for coord in bbox):
        print(f"Error: Invalid bounding box for article {article_id}: {bbox}")
        return base_url  # Return the base URL if bounding box is invalid

    clip_param = f"{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}"
    return f"{base_url}&clip={clip_param}"


In [9]:
def _year_key(value):
    """Return ``value`` as a plain year string such as '1912', or None if it is missing or non-numeric."""
    number = pd.to_numeric(value, errors='coerce')
    if pd.isna(number):
        return None
    return str(int(number))


def process_chunk(chunk):
    """Look up text, bounding box and image link for every article in a chunk.

    Args:
        chunk: DataFrame with ``article_ID``, ``pub_year`` and ``row_number``
            columns.  It is not modified; the work happens on a copy.

    Returns:
        A copy of ``chunk`` with ``article_text``, ``bbox`` and
        ``article_link`` filled in where the lookup succeeded, or ``None``
        when no dataset could be loaded for any year in the chunk.
    """
    # The caller may hand in a slice of a larger frame; copying keeps the
    # column assignments below from writing through to (or warning about) it.
    chunk = chunk.copy()

    # Years are normalised to plain digit strings: when the column holds NaN
    # pandas stores it as float, and str(1912.0) == '1912.0' would never match
    # a dataset key.  Sorting makes the load order reproducible.
    year_keys = (_year_key(value) for value in chunk['pub_year'].unique())
    pub_years_str = sorted({key for key in year_keys if key is not None})

    # Download the dataset for the relevant years
    print(pub_years_str)  # Debugging purposes
    dataset = download_dataset(pub_years_str)

    # If the dataset failed to download, skip this chunk
    if dataset is None:
        print(f"Skipping chunk for years {pub_years_str} due to dataset load failure.")
        return

    # Result columns are created as object dtype so that a list (the bounding
    # box) can be stored in a single cell.
    for column in ('article_text', 'bbox', 'article_link'):
        if column not in chunk.columns:
            chunk[column] = pd.Series('', index=chunk.index, dtype=object)

    for idx, article in chunk.iterrows():
        article_id = article['article_ID']
        pub_year = article['pub_year']
        row_number = article['row_number']  # Position of the article within its year's split

        # Fall back to the raw value when the year is unusable so the lookup
        # reports it instead of the row being skipped silently.
        year_key = _year_key(pub_year)
        lookup_year = year_key if year_key is not None else pub_year

        try:
            article_text, bbox = extract_article_data(article_id, dataset, lookup_year, row_number)

            chunk.at[idx, 'article_text'] = article_text
            chunk.at[idx, 'bbox'] = bbox

            article_link = generate_image_url(article_id, bbox)
            chunk.at[idx, 'article_link'] = article_link

        except Exception as e:
            # One bad row must not abort the chunk; it keeps its blank values.
            print(f"Error processing article {article_id} (Year: {pub_year}): {e}")

    return chunk


In [14]:
import os

def extract_article_data(lookup_id, dataset_article_level, year, row_number):
    """Fetch the text and bounding box stored at ``row_number`` of a year's data.

    Args:
        lookup_id: Article ID the caller expects to find at that row.
        dataset_article_level: Mapping from year string to an indexable
            collection of records; each record is read with ``.get`` for
            'article_id', 'article' and 'bbox'.
        year: Year of the article; looked up as ``str(year)``.  Whole-number
            floats such as 1912.0 are accepted.
        row_number: Zero-based position of the record within the year.

    Returns:
        ``(article_text, bounding_box)``.  Both are ``None`` when the year is
        unknown, the row is out of range, or the record has no text; only the
        box is ``None`` when the record has text but no box.

    Side effects:
        When the record's ID differs from ``lookup_id`` the mismatch is
        appended to 'mismatch_debug.txt' in the working directory.
    """
    debug_file = 'mismatch_debug.txt'

    # pandas yields floats for integer columns that contain NaN; bring
    # whole-number floats back to int so '1912.0' does not miss the key and
    # the row position can be used as an index.
    if isinstance(year, float) and year.is_integer():
        year = int(year)
    if isinstance(row_number, float) and row_number.is_integer():
        row_number = int(row_number)

    # Access the data for the specific year
    try:
        year_data = dataset_article_level[str(year)]
    except KeyError:
        print(f"No data available for year {year}.")
        return None, None

    # Reject negative positions too: they would otherwise wrap around and
    # return a record from the end of the year.  NaN fails both comparisons
    # and is rejected here as well.
    if not (0 <= row_number < len(year_data)):
        print(f"Error: Row number {row_number} is out of bounds for year {year}.")
        return None, None

    scan_data = year_data[row_number]
    article_id = scan_data.get('article_id')

    if article_id != lookup_id:
        with open(debug_file, 'a') as f:
            f.write(f"Mismatch found for year {year}, row {row_number}.\n"
                    f"Expected article_id: {lookup_id}, Found: {article_id}\n")
        print(f"Article ID mismatch: expected {lookup_id}, but found {article_id}. Mismatch logged in {debug_file}.")
        # NOTE(review): the mismatching record's text and box are still
        # returned below, so the caller stores them under lookup_id.
        # Confirm this is intended rather than returning (None, None).

    article_text = scan_data.get('article', None)
    bounding_box = scan_data.get('bbox', None)

    if article_text is None:
        print(f"No article text found for article ID {lookup_id}.")
        return None, None

    if bounding_box is None:
        print(f"No bounding box found for article ID {lookup_id}.")
        return article_text, None  # Return article text without bounding box

    return article_text, bounding_box


In [16]:
import os
import pandas as pd
import csv

def main():
    """Process the keyword-matrix CSV in year-sorted chunks, one output CSV per chunk.

    The user chooses between starting from scratch ('S'), which blanks every
    chunk's output file, and continuing ('C'), which skips chunks whose output
    file already exists and is non-empty.  Each remaining chunk is passed to
    ``process_chunk`` and the result is appended to
    ``output_chunk_<n>.csv`` in the Drive output folder.
    """
    # Step 1: Ask user if they want to start from scratch or continue earlier work
    user_choice = input("Enter 'S' to start from scratch or 'C' to continue from last checkpoint: ").strip().upper()

    # Reject bad input before mounting Drive and reading the CSV.
    if user_choice not in ('S', 'C'):
        print("Invalid choice. Please restart the program and enter 'S' or 'C'.")
        return

    # Step 2: Load the CSV from Google Drive
    print("Loading CSV from Google Drive...")
    synch_with_google_drive()
    df = pd.read_csv('/content/drive/My Drive/Full_KW_Matrix_2+.csv')

    # Step 3: Extract pub_date and pub_year from article_ID
    print("Extracting publication year and date from article_ID...")
    df = extract_pub_year_and_date(df)

    # Step 4: Sort and chunk the DataFrame
    print("Sorting and chunking the DataFrame...")
    chunks = sort_and_chunk_csv(df)

    output_dir = '/content/drive/My Drive/output/'
    os.makedirs(output_dir, exist_ok=True)
    output_path_template = os.path.join(output_dir, 'output_chunk_{}.csv')

    # Step 5: Prepare the output files according to the user's choice
    resume_from_chunk = 0
    if user_choice == 'S':
        print("Starting from scratch. Creating new output files...")
        for chunk_index in range(len(chunks)):
            # Opening in 'w' mode creates the file or truncates an existing one.
            with open(output_path_template.format(chunk_index + 1), 'w'):
                pass
    else:
        # NOTE(review): only the first chunk's file is inspected, and its rows
        # belong to the first chunk, so this lookup cannot move the start past
        # chunk 1; the per-file check in the loop below is what actually skips
        # finished chunks.  Confirm whether the lookup is still wanted.
        first_output = output_path_template.format(1)
        last_processed_article_id = None
        # A blank file (left by an interrupted 'S' run) holds no checkpoint.
        if os.path.isfile(first_output) and os.path.getsize(first_output) > 0:
            last_processed_article_id = resume_from_checkpoint(first_output)

        if last_processed_article_id is not None:
            for chunk_index, chunk in enumerate(chunks):
                if last_processed_article_id in chunk['article_ID'].values:
                    resume_from_chunk = chunk_index
                    break

    # Step 6: Process each chunk
    print("Processing each chunk...")
    for chunk_index, chunk in enumerate(chunks[resume_from_chunk:], start=resume_from_chunk + 1):
        output_path = output_path_template.format(chunk_index)

        # If continuing, a non-empty output file marks the chunk as done
        if user_choice == 'C' and os.path.isfile(output_path) and os.path.getsize(output_path) > 0:
            print(f"Chunk {chunk_index} already processed. Skipping...")
            continue

        print(f"Processing chunk {chunk_index} of {len(chunks)}...")
        chunk_results = process_chunk(chunk)

        # process_chunk returns None when no dataset could be loaded; leave the
        # output file empty so a later 'C' run retries this chunk.
        if chunk_results is None:
            print(f"Chunk {chunk_index} produced no results; nothing saved.")
            continue

        print(f"Saving results of chunk {chunk_index} to {output_path}...")
        save_partial_results(chunk_results, output_path)

    print("Processing complete!")

if __name__ == "__main__":
    main()

Enter 'S' to start from scratch or 'C' to continue from last checkpoint: S
Loading CSV from Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extracting publication year and date from article_ID...
Sorting and chunking the DataFrame...
Starting from scratch. Creating new output files...
Processing each chunk...
Processing chunk 1 of 11...
['1799', '1802', '1803', '1805', '1806', '1808', '1809', '1810', '1814', '1816', '1817', '1818', '1819', '1820', '1821', '1822', '1823', '1824', '1825', '1826', '1827', '1828', '1829', '1830', '1831', '1832', '1833', '1834', '1835', '1836', '1837', '1838', '1839', '1840', '1841', '1842', '1843', '1844', '1845', '1846', '1847', '1848', '1849', '1850', '1851', '1852', '1853', '1854', '1855', '1856', '1857', '1858', '1859', '1860', '1861', '1862', '1863', '1864', '1865', '1866', '1867', '1868', '1869', '1870', '1871', '1872', '1873', '1874', '1875', '1876', '18

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generating 1911 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1912': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1912.tar.gz'}


faro_1912.tar.gz:   0%|          | 0.00/2.25G [00:00<?, ?B/s]

Generating 1912 split: 0 examples [00:00, ? examples/s]

Loading associated
Error loading file: mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_1912/1912-12-09_p1_sn86092557_00415660285_1912120901_0729.json
Error loading file: mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_1912/1912-12-26_p1_sn86092557_00415660285_1912122601_0857.json
Error loading file: mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_1912/1912-12-10_p1_sn86092557_00415660285_1912121001_0737.json
Error loading file: mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_1912/1912-12-28_p1_sn86092557_00415660285_1912122801_0863.json
Error loading file: mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_1912/1912-12-21_p1_sn86092557_00415660285_1912122101_0832.json
Available years in dataset: dict_keys(['1907', '1908', '1909', '1910', '1911', '1912'])
Saving results of chunk 5 to /content/drive/My Drive/output/output_chunk_5.csv...
Processing chunk 6 of 11...
['1912', '1913', '1914'

faro_1913.tar.gz:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Generating 1913 split: 0 examples [00:00, ? examples/s]

Loading associated
Error loading file: mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_1913/1913-01-03_p1_sn86092557_00415660297_1913010301_0019.json
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1914': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1914.tar.gz'}


faro_1914.tar.gz:   0%|          | 0.00/2.30G [00:00<?, ?B/s]

Generating 1914 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1915': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1915.tar.gz'}


faro_1915.tar.gz:   0%|          | 0.00/2.21G [00:00<?, ?B/s]

Generating 1915 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1916': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1916.tar.gz'}


faro_1916.tar.gz:   0%|          | 0.00/2.21G [00:00<?, ?B/s]

Generating 1916 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1917': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1917.tar.gz'}


faro_1917.tar.gz:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Generating 1917 split: 0 examples [00:00, ? examples/s]

Loading associated
Available years in dataset: dict_keys(['1912', '1913', '1914', '1915', '1916', '1917'])
Saving results of chunk 6 to /content/drive/My Drive/output/output_chunk_6.csv...
Processing chunk 7 of 11...
['1920', '1921', '1917', '1918', '1919']
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1920': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1920.tar.gz'}


faro_1920.tar.gz:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

Generating 1920 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1921': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1921.tar.gz'}


faro_1921.tar.gz:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Generating 1921 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1918': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1918.tar.gz'}


faro_1918.tar.gz:   0%|          | 0.00/2.29G [00:00<?, ?B/s]

Generating 1918 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1919': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1919.tar.gz'}


faro_1919.tar.gz:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

Generating 1919 split: 0 examples [00:00, ? examples/s]

Loading associated
Available years in dataset: dict_keys(['1920', '1921', '1917', '1918', '1919'])
Saving results of chunk 7 to /content/drive/My Drive/output/output_chunk_7.csv...
Processing chunk 8 of 11...
['1921', '1922', '1923']
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1922': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1922.tar.gz'}


faro_1922.tar.gz:   0%|          | 0.00/2.71G [00:00<?, ?B/s]

Generating 1922 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1923': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1923.tar.gz'}


faro_1923.tar.gz:   0%|          | 0.00/836M [00:00<?, ?B/s]

Generating 1923 split: 0 examples [00:00, ? examples/s]

Loading associated
Available years in dataset: dict_keys(['1921', '1922', '1923'])
Saving results of chunk 8 to /content/drive/My Drive/output/output_chunk_8.csv...
Processing chunk 9 of 11...
['1923', '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932']
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1924': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1924.tar.gz'}


faro_1924.tar.gz:   0%|          | 0.00/787M [00:00<?, ?B/s]

Generating 1924 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1925': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1925.tar.gz'}


faro_1925.tar.gz:   0%|          | 0.00/589M [00:00<?, ?B/s]

Generating 1925 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1926': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1926.tar.gz'}


faro_1926.tar.gz:   0%|          | 0.00/569M [00:00<?, ?B/s]

Generating 1926 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1927': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1927.tar.gz'}


faro_1927.tar.gz:   0%|          | 0.00/469M [00:00<?, ?B/s]

Generating 1927 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1928': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1928.tar.gz'}


faro_1928.tar.gz:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating 1928 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1929': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1929.tar.gz'}


faro_1929.tar.gz:   0%|          | 0.00/394M [00:00<?, ?B/s]

Generating 1929 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1930': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1930.tar.gz'}


faro_1930.tar.gz:   0%|          | 0.00/374M [00:00<?, ?B/s]

Generating 1930 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1931': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1931.tar.gz'}


faro_1931.tar.gz:   0%|          | 0.00/383M [00:00<?, ?B/s]

Generating 1931 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1932': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1932.tar.gz'}


faro_1932.tar.gz:   0%|          | 0.00/568M [00:00<?, ?B/s]

Generating 1932 split: 0 examples [00:00, ? examples/s]

Loading associated
Available years in dataset: dict_keys(['1923', '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932'])
Saving results of chunk 9 to /content/drive/My Drive/output/output_chunk_9.csv...
Processing chunk 10 of 11...
['1932', '1933', '1934', '1935', '1936', '1937', '1938']
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1933': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1933.tar.gz'}


faro_1933.tar.gz:   0%|          | 0.00/567M [00:00<?, ?B/s]

Generating 1933 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1934': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1934.tar.gz'}


faro_1934.tar.gz:   0%|          | 0.00/573M [00:00<?, ?B/s]

Generating 1934 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1935': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1935.tar.gz'}


faro_1935.tar.gz:   0%|          | 0.00/540M [00:00<?, ?B/s]

Generating 1935 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1936': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1936.tar.gz'}


faro_1936.tar.gz:   0%|          | 0.00/549M [00:00<?, ?B/s]

Generating 1936 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1937': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1937.tar.gz'}


faro_1937.tar.gz:   0%|          | 0.00/557M [00:00<?, ?B/s]

Generating 1937 split: 0 examples [00:00, ? examples/s]

Loading associated
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1938': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1938.tar.gz'}


faro_1938.tar.gz:   0%|          | 0.00/556M [00:00<?, ?B/s]

Generating 1938 split: 0 examples [00:00, ? examples/s]

Loading associated
Available years in dataset: dict_keys(['1932', '1933', '1934', '1935', '1936', '1937', '1938'])
Saving results of chunk 10 to /content/drive/My Drive/output/output_chunk_10.csv...
Processing chunk 11 of 11...
['1938', '1939']
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1939': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1939.tar.gz'}


faro_1939.tar.gz:   0%|          | 0.00/470M [00:00<?, ?B/s]

Generating 1939 split: 0 examples [00:00, ? examples/s]

Loading associated
Available years in dataset: dict_keys(['1938', '1939'])
Saving results of chunk 11 to /content/drive/My Drive/output/output_chunk_11.csv...
Processing complete!
