<a href="https://colab.research.google.com/github/grfaith/May25/blob/main/25May13_lookup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ── 0. one-time setup ───────────────────────────────────────────────
from google.colab import drive
import os, zipfile, datetime as dt

# Mount Google Drive under /content/drive so results can be written to it.
# NOTE(review): force_remount=True presumably re-mounts even when Drive is
# already mounted — confirm against the google.colab.drive documentation.
drive.mount('/content/drive', force_remount=True)   # one prompt, then silent
# Folder on the mounted Drive that receives the per-year keyword-hit CSV files.
OUT_DIR = '/content/drive/MyDrive/AmStories_kw_hits'
# exist_ok=True: re-running this cell is harmless when the folder already exists.
os.makedirs(OUT_DIR, exist_ok=True)


# -*- coding: utf-8 -*-
"""AS_kw_May25_string_word.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ZMgC6UJ3BLN0oGmtUVPLJ8We45WttGLs
"""

###Installs

!pip install datasets
!pip install ipympl
!pip install --upgrade datasets

#Imports
import json
import pandas as pd
from datasets import load_dataset
import tqdm as tq
from google.colab import files
import re

# kw_distance = 150

### Cell for loading files from local drive
# files.upload() opens the Colab upload prompt. The result is indexed by
# file name below, and len() of each entry is reported as its size in bytes.
kw_file = files.upload()

# Stop here with a clear message when nothing was uploaded; otherwise kw_df
# would never be defined and the failure would only surface much later.
if not kw_file:
    raise ValueError(
        "No keyword file was uploaded; upload a CSV with the columns "
        "kword, kwyear, kwtype (no header row)."
    )

# Specify custom column names
column_names = ["kword", "kwyear", "kwtype"]

# Report each uploaded file and read it into a DataFrame with the custom
# column names (the CSV is read as having no header row).
for fn in kw_file:
    print('User uploaded file "{name}" with length {length} bytes'.format(
          name=fn, length=len(kw_file[fn])))
    kw_df = pd.read_csv(fn, names=column_names, header=None)

# kw_df is overwritten on every iteration, so only the last file is kept.
if len(kw_file) > 1:
    print('Note: {count} files were uploaded; only "{name}" is used as the '
          'keyword list.'.format(count=len(kw_file), name=fn))

# Defining function to load dataset

def load_text_dataset(dataset_year_str):
    """
    Download one year of the AmericanStories dataset from the HuggingFace Hub.

    Parameters:
        dataset_year_str: The year to download (the main loop passes it as a
            string such as "1844"). It is sent as the single entry of
            ``year_list``.

    Returns:
        Whatever ``load_dataset`` returns for the "subset_years" configuration
        restricted to that one year.
    """
    # Options for the "subset_years" configuration: restrict the download to
    # the requested year.
    # NOTE(review): trust_remote_code=True presumably allows the dataset's own
    # loading script from the Hub to run — confirm this is still intended.
    hub_options = {
        "year_list": [dataset_year_str],
        "trust_remote_code": True,
    }

    return load_dataset(
        "dell-research-harvard/AmericanStories",
        "subset_years",
        **hub_options,
    )

### Function to filter kw data in use based on years of discovery in kw file.

def get_kw(dataset_year_str, kw_source=None):
    """
    Select the keywords whose 'kwyear' is not later than the year being searched.

    Parameters:
        dataset_year_str (str or int): The year being searched; converted
            with int() before comparing.
        kw_source (pandas.DataFrame, optional): Keyword table with a 'kwyear'
            column. When omitted, the module-level ``kw_df`` built from the
            uploaded CSV is used.

    Returns:
        pandas.DataFrame: The rows of the keyword table with
        ``kwyear <= dataset_year``.
    """
    # Convert dataset_year to integer so it compares numerically with 'kwyear'
    dataset_year = int(dataset_year_str)

    # Fall back to the uploaded keyword table when no table is passed in
    keywords = kw_df if kw_source is None else kw_source

    # Keep only keywords already "discovered" by the year being searched
    return keywords[keywords['kwyear'] <= dataset_year]

def process_kw(dataset_year_str, kw_df_filter, dataset_article_level):
    """
    Run the keyword search for every keyword in a table and combine the hits.

    Parameters:
        dataset_year_str: Key of the year to search; printed as progress and
            passed on to kw_search.
        kw_df_filter (pandas.DataFrame): The keywords to search for. The
            first column is read as the keyword and the third column as the
            keyword type.
        dataset_article_level: The loaded dataset object that is passed on to
            kw_search unchanged.

    Returns:
        pandas.DataFrame: All per-keyword results stacked into one frame with
        a fresh index. An empty DataFrame when there were no keywords or no
        search produced a result.
    """
    print("Searching within ", dataset_year_str)

    # Collect the per-keyword frames and concatenate once at the end;
    # concatenating inside the loop re-copies all earlier rows every time.
    per_keyword_frames = []

    keywords = kw_df_filter.iloc[:, 0]
    keyword_types = kw_df_filter.iloc[:, 2]
    for explore_kw, kw_type in zip(keywords, keyword_types):
        result_df = kw_search(dataset_article_level, dataset_year_str, explore_kw, kw_type)
        # A None result carries no rows, so it is skipped explicitly
        if result_df is not None:
            per_keyword_frames.append(result_df)

    if not per_keyword_frames:
        return pd.DataFrame()

    return pd.concat(per_keyword_frames, ignore_index=True)

import re
import pandas as pd

def kw_search(dataset_article, dataset_year, explor_kw, kw_type):
    """
    Find the articles of one year that contain a keyword and count the hits.

    Parameters:
        dataset_article: Object indexed by year key. ``dataset_article[dataset_year]``
            must give the article texts under ``['article']`` and a single
            row (supporting ``"article_id" in row``) by integer position.
        dataset_year: Key of the year to search in ``dataset_article``.
        explor_kw (str): The keyword. It is escaped, so it is matched
            literally and case-insensitively.
        kw_type (str): "string" counts every word that contains the keyword
            (partial match); "word" counts occurrences of the keyword as a
            whole word (exact match).

    Returns:
        pandas.DataFrame: One row per matching article with the columns
        'row_number', 'article_ID', 'keyword_hit' and 'keyword_count'.
        The frame is empty (same columns) when nothing matches or when an
        error was reported; errors are printed, not raised.
    """
    result_columns = ['row_number', 'article_ID', 'keyword_hit', 'keyword_count']

    try:
        # Access the dataset for the specific year
        year_dataset = dataset_article[dataset_year]

        # Access the 'article' column containing the text
        articles = year_dataset['article']

        # The pattern depends only on the keyword and its type, so it is
        # compiled once here instead of once per article.
        if kw_type == "string":
            # Match the keyword anywhere within a word (partial match)
            pattern_kw = re.compile(r'\b\w*' + re.escape(explor_kw) + r'\w*\b', flags=re.IGNORECASE)
        elif kw_type == "word":
            # Match the keyword only as a whole word (exact match)
            pattern_kw = re.compile(r'\b' + re.escape(explor_kw) + r'\b', flags=re.IGNORECASE)
        else:
            raise ValueError("Invalid kw_type. Must be either 'string' or 'word'.")

        # Matching articles, one dictionary per article
        articles_containing_kw = []

        for article_n, article_text in enumerate(articles):
            # A missing or non-text article cannot contain the keyword; skip
            # it so one bad row does not abort the search for this keyword.
            if not isinstance(article_text, str):
                continue

            kw_count = len(pattern_kw.findall(article_text.lower()))
            if kw_count == 0:
                continue

            # Fetch the row once; articles without an 'article_id' are skipped
            article_row = year_dataset[article_n]
            if "article_id" not in article_row:
                continue

            articles_containing_kw.append({
                'row_number': article_n,
                'article_ID': article_row["article_id"],
                'keyword_hit': explor_kw,
                'keyword_count': kw_count,
            })

        # An empty hit list still yields a frame with the expected columns
        return pd.DataFrame(articles_containing_kw, columns=result_columns)

    except KeyError as e:
        print(f"KeyError: {e}. Please check if the dataset and article structure is correct.")
    except Exception as e:
        # Deliberate best-effort: report the problem and keep the run going
        print(f"An error occurred: {e}")

    # After a reported error, return an empty result rather than None so
    # callers always receive a DataFrame.
    return pd.DataFrame(columns=result_columns)

"""# *BREAK*"""

# Year range to search. The original author believed the full AmericanStories
# dataset extends back to 1774 (unverified) and noted that earlier searches
# returned many hits, so the range is kept small here.
full_start_year = 1844  # Inclusive
full_end_year = 1845    # Exclusive, so 1845 is not included
chunk_size = 3          # Number of years stepped over per outer-loop iteration

# ── 2.  main loop: search each year and write its results to Drive ──
# (results are saved under OUT_DIR instead of being offered as a browser download)
# The outer loop walks the range in steps of chunk_size; the inner loop visits
# every year of the current chunk, clipped so it never reaches full_end_year.
for chunk_start in range(full_start_year, full_end_year, chunk_size):
    chunk_end = min(chunk_start + chunk_size, full_end_year)
    for loop_year in range(chunk_start, chunk_end):
        dataset_year_str = str(loop_year)
        try:
            # Load the year, pick the keywords that apply to it, then search
            dataset_article_level = load_text_dataset(dataset_year_str)
            kw_df_filter        = get_kw(loop_year)
            year_search_result  = process_kw(dataset_year_str,
                                             kw_df_filter,
                                             dataset_article_level)

            # save directly to Drive, one CSV per year, without the index column
            out_path = f'{OUT_DIR}/AS_Main_KW_Hits_May25_SW_{loop_year}.csv'
            year_search_result.to_csv(out_path, index=False)
            print(f'✅  {out_path} written ({len(year_search_result):,} rows)')

        # NOTE(review): every ValueError raised inside the try block is
        # reported as an empty dataset and the year is skipped — confirm that
        # an empty dataset is the only expected source of ValueError here.
        except ValueError:
            print(f'Dataset empty for {dataset_year_str}; skipping.')
