<a href="https://colab.research.google.com/github/grfaith/AmericanStories/blob/main/AS_kw_prox_string_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
###Installs

!pip install datasets
!pip install ipympl




This notebook searches a range of years in the American Stories dataset for articles that contain keywords from an uploaded keyword file (for example, the string 'explor') and saves information about the matching articles to disk.

In [2]:
#Imports
import json
import pandas as pd
from datasets import load_dataset
import tqdm as tq
from google.colab import files
import re

# kw_distance = 150


In [3]:
# First year to search. The main loop iterates range(start_year, end_year),
# so this bound is inclusive.
# Full AmStories extends back to 1774 (I think). Previous searches have returned many hits.
start_year = 1920 #This is start-year inclusive

# Complete dataset runs to 1940 inclusive, but this particular implementation
# runs out of space so I'm breaking it into indeterminate chunks.
# This bound is exclusive because the main loop uses range(start_year, end_year):
# with end_year = 1940 the last year actually searched is 1939.
end_year = 1940 #This is end-year exclusive

# Number to append to csv files (used in the output file name
# AS_Main_KW_Hits_Prox_SW_Part_{part_num}.csv)
part_num = 24

In [4]:
### Cell for loading files from local drive
# Opens the Colab upload widget. The loop below treats the result as a mapping
# of uploaded file name -> file contents.
kw_file = files.upload()

# Stop here with a clear message if nothing was uploaded. Otherwise `kw_df`
# would be left undefined (or stale from an earlier run) and the failure would
# only surface later, inside get_kw().
if not kw_file:
    raise ValueError(
        "No keyword file was uploaded; upload a CSV with the columns "
        "kword, kwyear, kwtype (no header row)."
    )

# Specify custom column names
column_names = ["kword", "kwyear", "kwtype"]

# Read every uploaded CSV file into a DataFrame with the custom column names
# and report its size. The file is read by name, so this relies on the upload
# having been saved under that name in the working directory.
kw_frames = []
for fn in kw_file.keys():
    kw_frames.append(pd.read_csv(fn, names=column_names, header=None))
    print('User uploaded file "{name}" with length {length} bytes'.format(
          name=fn, length=len(kw_file[fn])))

# A single upload yields exactly that file's rows. Several uploads are stacked
# so that no keyword list is silently dropped.
kw_df = pd.concat(kw_frames, ignore_index=True)



Saving AS_KW_String_and_Word_Set_2.csv to AS_KW_String_and_Word_Set_2.csv
User uploaded file "AS_KW_String_and_Word_Set_2.csv" with length 161 bytes


In [5]:
# Defining function to load dataset

def load_text_dataset(dataset_year_str):
    """
    Load one year of the AmericanStories dataset from the HuggingFace Hub.

    Parameters:
        dataset_year_str (str): The year to load, as a string (e.g. "1920").
            It is passed to the loader as a one-element `year_list`.

    Returns:
        The object returned by `load_dataset` for the "subset_years"
        configuration. Callers in this notebook index it by the year string.
    """
    hub_repo = "dell-research-harvard/AmericanStories"

    # Options understood by the AmericanStories loading script: restrict the
    # download to the requested year and allow the script to run.
    load_options = {
        "year_list": [dataset_year_str],
        "trust_remote_code": True,
    }

    return load_dataset(hub_repo, "subset_years", **load_options)

In [6]:
### Function to filter kw data in use based on years of discovery in kw file.

def get_kw(dataset_year_str):
    """
    Select the keywords whose 'kwyear' is not later than the given year.

    Reads the module-level `kw_df` DataFrame built from the uploaded keyword
    file; it does not load anything itself.

    Parameters:
        dataset_year_str (str or int): The dataset year; converted with int().

    Returns:
        pandas.DataFrame: The rows of `kw_df` with kwyear <= dataset year.
    """
    cutoff_year = int(dataset_year_str)

    # Boolean mask: True for keywords dated on or before the cutoff year
    known_by_cutoff = kw_df['kwyear'] <= cutoff_year

    return kw_df[known_by_cutoff]



In [7]:

def process_kw(dataset_year_str, kw_df_filter, dataset_article_level):
    """
    Run kw_search for every keyword row and combine the hits for one year.

    Parameters:
        dataset_year_str (str): Year being searched; passed through to kw_search.
        kw_df_filter (pandas.DataFrame): Keyword rows to search for. The keyword
            is read from the first column and the keyword type from the third.
        dataset_article_level: Loaded dataset object, passed through to kw_search.

    Returns:
        pandas.DataFrame: All hits for the year with a fresh 0..n-1 index.
            If no keyword produced a hit, an empty frame is returned: the empty
            frame kw_search produced (keeping its columns) when there is one,
            otherwise a bare pd.DataFrame().
    """
    print ("Searching within ", dataset_year_str)

    # Collect per-keyword frames and concatenate once at the end, instead of
    # re-copying a growing DataFrame on every iteration.
    hit_frames = []
    empty_result = None

    for index, row in kw_df_filter.iterrows():
        explore_kw = row.iloc[0]
        kw_type = row.iloc[2]
        result_df = kw_search(dataset_article_level, dataset_year_str, explore_kw, kw_type)

        # kw_search returns None when it hit an error; there is nothing to add.
        if result_df is None:
            continue

        # Keep empty frames out of the concat so they cannot degrade the
        # column dtypes, but remember one so the column layout is preserved
        # when nothing matched at all.
        if result_df.empty:
            if empty_result is None:
                empty_result = result_df
            continue

        hit_frames.append(result_df)

    if hit_frames:
        return pd.concat(hit_frames, ignore_index=True)

    return empty_result if empty_result is not None else pd.DataFrame()


In [8]:
import re
import pandas as pd

def _compile_kw_pattern(explor_kw, kw_type):
    """Build the case-insensitive regex for one keyword; ValueError on an unknown kw_type."""
    # Tolerate stray whitespace / capitalisation coming from the keyword CSV.
    normalized_type = str(kw_type).strip().lower()

    if normalized_type == "string":
        # Partial match: any whole word that contains the keyword
        return re.compile(r'\b\w*' + re.escape(explor_kw) + r'\w*\b', flags=re.IGNORECASE)
    if normalized_type == "word":
        # Exact match: the keyword as a whole word
        return re.compile(r'\b' + re.escape(explor_kw) + r'\b', flags=re.IGNORECASE)

    raise ValueError("Invalid kw_type. Must be either 'string' or 'word'.")


def kw_search(dataset_article, dataset_year, explor_kw, kw_type):
    """
    Search one year's articles for a keyword and report the articles that match.

    Parameters:
        dataset_article: Mapping-like object indexed by the year key.
            `dataset_article[dataset_year]` must support `['article']` (all
            article texts) and integer indexing (one row as a mapping).
        dataset_year: Key of the year to search.
        explor_kw (str): Keyword to look for; regex metacharacters are escaped.
        kw_type (str): "string" to count whole words containing the keyword,
            "word" to count whole-word occurrences of the keyword itself.
            Surrounding whitespace and letter case are ignored.

    Returns:
        pandas.DataFrame: One row per matching article with the columns
            row_number, article_ID, keyword_hit, keyword_count. The frame is
            empty (same columns) when nothing matches.
        None: If an error occurred (missing year key, invalid kw_type, ...).
            The error is printed rather than raised.
    """
    result_columns = ['row_number', 'article_ID', 'keyword_hit', 'keyword_count']

    try:
        # Access the dataset for the specific year
        year_dataset = dataset_article[dataset_year]

        # Access the 'article' column containing the text
        articles = year_dataset['article']

        # The pattern depends only on the keyword, so compile it once instead
        # of once per article.
        pattern_kw = _compile_kw_pattern(explor_kw, kw_type)

        articles_containing_kw = []

        for article_n, article_text in enumerate(articles):
            # Skip missing / non-text entries; otherwise one bad article would
            # raise and throw away every hit already collected for this keyword.
            if not isinstance(article_text, str):
                continue

            kw_count = len(pattern_kw.findall(article_text.lower()))
            if kw_count == 0:
                continue

            # Fetch the full row once, and only for articles that matched.
            article_row = year_dataset[article_n]
            if "article_id" not in article_row:
                # No identifier to report for this article; skip it.
                continue

            articles_containing_kw.append({
                'row_number': article_n,
                'article_ID': article_row["article_id"],
                'keyword_hit': explor_kw,
                'keyword_count': kw_count,
            })

        # `columns=` fixes the column layout for both the empty and the
        # non-empty case.
        return pd.DataFrame(articles_containing_kw, columns=result_columns)

    except KeyError as e:
        print(f"KeyError: {e}. Please check if the dataset and article structure is correct.")
    except Exception as e:
        print(f"An error occurred: {e}")

    # Reached only after an error was reported above.
    return None




In [9]:
# Accumulated hits across all years. `results` is rebound after every year
# (not only at the end) so that the save-progress cell further down can write
# out whatever has been collected if this loop is interrupted.
results = pd.DataFrame()

# end_year is exclusive: range() stops before it.
for loop_year in range(start_year, end_year):
    dataset_year_str = str(loop_year)
    try:
        # Load dataset for current loop_year
        dataset_article_level = load_text_dataset(dataset_year_str)

        # Get valid keyword filters for current year
        kw_df_filter = get_kw(loop_year)

        # Process keyword filters and dataset for current loop year to get hits
        year_search_result = process_kw(dataset_year_str, kw_df_filter, dataset_article_level)

        # Concatenate the single search result onto the results DataFrame
        results = pd.concat([results, year_search_result])

    except ValueError as err:
        print(f"Dataset empty for {dataset_year_str}. Moving to the next year.")
        # Any ValueError raised above lands here, not only an empty dataset,
        # so show the underlying message as well.
        print(f"  (ValueError detail: {err})")
        continue

results = results.reset_index(drop=True)

# Write the combined hits to disk and trigger the browser download.
output_csv = f'AS_Main_KW_Hits_Prox_SW_Part_{part_num}.csv'
results.to_csv(output_csv, index=False)
files.download(output_csv)
print("Finished")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


AmericanStories.py:   0%|          | 0.00/8.91k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1920': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1920.tar.gz'}


faro_1920.tar.gz:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

Generating 1920 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1920
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1921': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1921.tar.gz'}


faro_1921.tar.gz:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Generating 1921 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1921
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1922': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1922.tar.gz'}


faro_1922.tar.gz:   0%|          | 0.00/2.71G [00:00<?, ?B/s]

Generating 1922 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1922
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1923': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1923.tar.gz'}


faro_1923.tar.gz:   0%|          | 0.00/836M [00:00<?, ?B/s]

Generating 1923 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1923
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1924': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1924.tar.gz'}


faro_1924.tar.gz:   0%|          | 0.00/787M [00:00<?, ?B/s]

Generating 1924 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1924
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1925': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1925.tar.gz'}


faro_1925.tar.gz:   0%|          | 0.00/589M [00:00<?, ?B/s]

Generating 1925 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1925
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1926': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1926.tar.gz'}


faro_1926.tar.gz:   0%|          | 0.00/569M [00:00<?, ?B/s]

Generating 1926 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1926
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1927': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1927.tar.gz'}


faro_1927.tar.gz:   0%|          | 0.00/469M [00:00<?, ?B/s]

Generating 1927 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1927
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1928': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1928.tar.gz'}


faro_1928.tar.gz:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating 1928 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1928
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1929': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1929.tar.gz'}


faro_1929.tar.gz:   0%|          | 0.00/394M [00:00<?, ?B/s]

Generating 1929 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1929
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1930': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1930.tar.gz'}


faro_1930.tar.gz:   0%|          | 0.00/374M [00:00<?, ?B/s]

Generating 1930 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1930
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1931': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1931.tar.gz'}


faro_1931.tar.gz:   0%|          | 0.00/383M [00:00<?, ?B/s]

Generating 1931 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1931
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1932': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1932.tar.gz'}


faro_1932.tar.gz:   0%|          | 0.00/568M [00:00<?, ?B/s]

Generating 1932 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1932
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1933': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1933.tar.gz'}


faro_1933.tar.gz:   0%|          | 0.00/567M [00:00<?, ?B/s]

Generating 1933 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1933
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1934': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1934.tar.gz'}


faro_1934.tar.gz:   0%|          | 0.00/573M [00:00<?, ?B/s]

Generating 1934 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1934
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1935': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1935.tar.gz'}


faro_1935.tar.gz:   0%|          | 0.00/540M [00:00<?, ?B/s]

Generating 1935 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1935
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1936': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1936.tar.gz'}


faro_1936.tar.gz:   0%|          | 0.00/549M [00:00<?, ?B/s]

Generating 1936 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1936
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1937': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1937.tar.gz'}


faro_1937.tar.gz:   0%|          | 0.00/557M [00:00<?, ?B/s]

Generating 1937 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1937
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1938': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1938.tar.gz'}


faro_1938.tar.gz:   0%|          | 0.00/556M [00:00<?, ?B/s]

Generating 1938 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1938
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1939': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1939.tar.gz'}


faro_1939.tar.gz:   0%|          | 0.00/470M [00:00<?, ?B/s]

Generating 1939 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1939


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Finished


In [None]:
# Manual checkpoint: run this cell only after interrupting the search loop
# above, to write out and download the hits gathered so far.
# Adjust part_num (and therefore the file name) as appropriate.

partial_csv = f'AS_Main_KW_Hits_Prox_SW_Part_{part_num}.csv'

results = results.reset_index(drop=True)
results.to_csv(partial_csv, index=False)
files.download(partial_csv)



In [None]:
# Remind the user to save the downloaded CSV; input() blocks until Enter is
# pressed, so nothing after this cell runs before the user confirms.
print ("Now save output file to local disk")
input("Press Enter to continue...")
