<a href="https://colab.research.google.com/github/grfaith/AmStory/blob/main/Finding_explore_kw_proximity_hits_in_AS_string_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
###Installs

!pip install datasets
!pip install ipympl


Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K

This notebook scans a range of years in the American Stories dataset, finds articles in which a keyword from an uploaded keyword list occurs within a set character distance of a word containing the string 'explor', and saves the matching articles' information to disk.

In [2]:
#Imports
import json
import pandas as pd
from datasets import load_dataset
import tqdm as tq
from google.colab import files
import re

# Maximum gap, in characters, between the start of an 'explor' match and the
# start of a keyword match for an article to count as a hit.
kw_distance = 150


In [3]:
# First year searched (inclusive).
start_year = 1916
end_year = 1940 # Exclusive upper bound: the search loop iterates range(start_year, end_year), so the last year searched is end_year - 1.

In [4]:
### Cell for loading the keyword CSV from the local drive
# Opens the Colab upload widget; the result is keyed by uploaded file name.
kw_file = files.upload()

# Column names applied to the keyword table (the file is read with header=None,
# so its first line is treated as data, not as a header).
column_names = ["kword", "kwyear", "kwtype"]

# Read each uploaded CSV into kw_df; if several files were uploaded, the last one wins.
for fn in kw_file:
    kw_df = pd.read_csv(fn, header=None, names=column_names)

# Report what was uploaded.
for fn, uploaded_bytes in kw_file.items():
    print(f'User uploaded file "{fn}" with length {len(uploaded_bytes)} bytes')



Saving AS_Explore_KW_String_and_Word.csv to AS_Explore_KW_String_and_Word.csv
User uploaded file "AS_Explore_KW_String_and_Word.csv" with length 455 bytes


In [5]:
# Defining function to load dataset

def load_text_dataset(dataset_year_str):
    """
    Fetch a single year of the AmericanStories dataset from the HuggingFace Hub.

    Parameters:
        dataset_year_str (str): The year to load, e.g. "1916". It is passed as the
            only entry of ``year_list``.

    Returns:
        The object returned by ``load_dataset`` for the "subset_years" configuration
        (article-level data); callers index it by the year string.
    """
    # The loader needs the hub location, the configuration that selects a subset
    # of years, and the list of years wanted.
    loader_options = {
        "year_list": [dataset_year_str],
        "trust_remote_code": True,
    }
    return load_dataset(
        "dell-research-harvard/AmericanStories",
        "subset_years",
        **loader_options,
    )

In [6]:
### Function to filter kw data in use based on years of discovery in kw file.

def get_kw(dataset_year_str, keyword_table=None):
    """
    Select the keywords that are in scope for a given dataset year.

    A keyword row is kept when its 'kwyear' value is less than or equal to the
    dataset year.

    Parameters:
        dataset_year_str (str or int): The dataset year; converted with int().
        keyword_table (pandas.DataFrame, optional): Keyword table with a 'kwyear'
            column. When omitted, the notebook-level ``kw_df`` created by the
            upload cell is used.

    Returns:
        pandas.DataFrame: The rows of the keyword table whose 'kwyear' is <= the
        dataset year.
    """
    # Fall back to the uploaded table so existing single-argument calls keep working.
    if keyword_table is None:
        keyword_table = kw_df

    dataset_year = int(dataset_year_str)

    return keyword_table[keyword_table['kwyear'] <= dataset_year]



In [7]:

def process_kw(dataset_year_str, kw_df_filter, dataset_article_level):
    """
    Run the proximity search for every keyword in scope and pool the hits for one year.

    Parameters:
        dataset_year_str (str): The year being searched; forwarded to kw_pair_search.
        kw_df_filter (pandas.DataFrame): Keyword table. Columns are read by position:
            column 0 is the keyword and column 2 is the match type.
        dataset_article_level: The loaded dataset object, forwarded unchanged to
            kw_pair_search.

    Returns:
        pandas.DataFrame: All hits found for the year, re-indexed from 0. An empty
        DataFrame is returned when no keyword produced a hit.
    """
    print ("Searching within ", dataset_year_str)

    # Collect the per-keyword results and concatenate once at the end instead of
    # growing a DataFrame inside the loop. Empty results are left out so they
    # cannot influence the column dtypes of the combined frame.
    per_keyword_hits = []
    for explore_kw, kw_type in zip(kw_df_filter.iloc[:, 0], kw_df_filter.iloc[:, 2]):
        result_df = kw_pair_search(dataset_article_level, dataset_year_str, explore_kw, kw_distance, kw_type)
        if not result_df.empty:
            per_keyword_hits.append(result_df)

    # pd.concat raises on an empty list, so handle the no-hit case explicitly.
    if not per_keyword_hits:
        return pd.DataFrame()

    return pd.concat(per_keyword_hits, ignore_index=True)


In [8]:
## Keyword search within articles to find keywords within kw_distance characters of the string 'explor'

def kw_pair_search(dataset_article, dataset_year, explor_kw, kw_distance, kw_type):
    """
    Find articles in which a keyword occurs close to a word containing 'explor'.

    Matching is case-insensitive. Distance is measured in characters, between the
    start of a word containing 'explor' and the start of a keyword match. Each
    matching article contributes exactly one row, however many qualifying pairs
    it contains.

    Parameters:
        dataset_article: Mapping from year key to that year's dataset. The year's
            dataset must support column access by name (``['article']``) and row
            access by integer position.
        dataset_year (str): Key of the year to search; also stored in the
            'article_year' column of the result.
        explor_kw (str): The keyword to search for.
        kw_distance (int): The maximum distance, in characters, allowed between
            the two matches.
        kw_type (str): "string" to match the keyword anywhere inside a word, or
            "word" to match it only as a whole word.

    Returns:
        pandas.DataFrame: One row per matching article with the columns
        article_year, keyword_hit, row_number, article_ID, newspaper_name, edition,
        date, page, headline, byline and article. Empty when nothing matches.

    Raises:
        ValueError: If kw_type is neither "string" nor "word".
    """
    # The article text is lower-cased before searching, so the keyword must be
    # lower-cased too or a keyword containing capitals could never match.
    keyword_lower = explor_kw.lower()

    # Build the patterns once, outside the article loop. Validating kw_type here
    # also means a bad keyword file fails immediately rather than only when some
    # article happens to pass the substring pre-filter.
    if kw_type == "string":
        # The keyword may appear anywhere inside a word
        pattern_kw = re.compile(r'\b\w*' + re.escape(keyword_lower) + r'\w*\b', flags=re.IGNORECASE)
    elif kw_type == "word":
        # The keyword must be a whole word
        pattern_kw = re.compile(r'\b' + re.escape(keyword_lower) + r'\b', flags=re.IGNORECASE)
    else:
        raise ValueError("Invalid kw_type. Must be either 'string' or 'word'.")

    # Any word that contains "explor"
    pattern_explor = re.compile(r'\b\w*explor\w*\b')

    # Access the dataset for the specific year and its 'article' column
    year_dataset = dataset_article[dataset_year]
    articles = year_dataset['article']

    # Matching articles, one dict per article
    articles_containing_two_words = []

    for article_n, article_text in enumerate(articles):
        # Skip missing or empty article text
        if not article_text:
            continue

        article_text = article_text.lower()

        # Cheap substring pre-filter before running the regular expressions
        if "explor" not in article_text or keyword_lower not in article_text:
            continue

        explor_indices = [match.start() for match in pattern_explor.finditer(article_text)]
        kw_indices = [match.start() for match in pattern_kw.finditer(article_text)]

        # One qualifying pair is enough; any() stops at the first one, so the
        # article is recorded at most once.
        is_hit = any(
            abs(explor_index - kw_index) <= kw_distance
            for explor_index in explor_indices
            for kw_index in kw_indices
        )
        if not is_hit:
            continue

        # Fetch the row once instead of once per field
        record = year_dataset[article_n]
        articles_containing_two_words.append({
            'article_year': dataset_year,
            'keyword_hit': explor_kw,
            'row_number': article_n,
            'article_ID': record["article_id"],
            'newspaper_name': record["newspaper_name"],
            'edition': record["edition"],
            'date': record["date"],
            'page': record["page"],
            'headline': record["headline"],
            'byline': record["byline"],
            'article': record["article"],
        })

    # Convert the list of dictionaries to a DataFrame
    df_of_articles_containing_two_words = pd.DataFrame(articles_containing_two_words)
    return df_of_articles_containing_two_words


In [9]:
# Name of the CSV that receives the combined hits.
output_csv = "AS_Explor_KW_Hits_Prox_SW_Part_3-In Fxn.csv"

# `results` is extended after every year (rather than once at the end) so that an
# interrupted run still leaves the hits collected so far available to save by hand.
results = pd.DataFrame()

for loop_year in range(start_year, end_year):
    dataset_year_str = str(loop_year)

    # Only the load is guarded: a ValueError here is treated as "no data for this
    # year". Errors raised while searching (for example an invalid kw_type in the
    # keyword file) are deliberately left to propagate instead of being reported
    # as an empty dataset and silently skipping the year.
    try:
        dataset_article_level = load_text_dataset(dataset_year_str)
    except ValueError as load_error:
        print(f"Dataset empty for {dataset_year_str}. Moving to the next year.")
        print(f"  (load error: {load_error})")
        continue

    # Get valid keyword filters for current year
    kw_df_filter = get_kw(dataset_year_str)

    # Process keyword filters and dataset for current loop year to get hits
    year_search_result = process_kw(dataset_year_str, kw_df_filter, dataset_article_level)

    # Concatenate this year's hits onto the results DataFrame
    results = pd.concat([results, year_search_result])

results = results.reset_index(drop=True)
results.to_csv(output_csv, index=False)
files.download(output_csv)
print("Finished")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.91k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1916': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1916.tar.gz'}


Downloading data:   0%|          | 0.00/2.21G [00:00<?, ?B/s]

Generating 1916 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1916
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1917': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1917.tar.gz'}


Downloading data:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Generating 1917 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1917
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1918': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1918.tar.gz'}


Downloading data:   0%|          | 0.00/2.29G [00:00<?, ?B/s]

Generating 1918 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1918
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1919': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1919.tar.gz'}


Downloading data:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

Generating 1919 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1919
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1920': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1920.tar.gz'}


Downloading data:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

Generating 1920 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1920
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1921': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1921.tar.gz'}


Downloading data:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Generating 1921 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1921
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1922': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1922.tar.gz'}


Downloading data:   0%|          | 0.00/2.71G [00:00<?, ?B/s]

Generating 1922 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1922
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1923': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1923.tar.gz'}


Downloading data:   0%|          | 0.00/836M [00:00<?, ?B/s]

Generating 1923 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1923
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1924': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1924.tar.gz'}


Downloading data:   0%|          | 0.00/787M [00:00<?, ?B/s]

Generating 1924 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1924
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1925': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1925.tar.gz'}


Downloading data:   0%|          | 0.00/589M [00:00<?, ?B/s]

Generating 1925 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1925
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1926': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1926.tar.gz'}


Downloading data:   0%|          | 0.00/569M [00:00<?, ?B/s]

Generating 1926 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1926
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1927': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1927.tar.gz'}


Downloading data:   0%|          | 0.00/469M [00:00<?, ?B/s]

Generating 1927 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1927
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1928': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1928.tar.gz'}


Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating 1928 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1928
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1929': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1929.tar.gz'}


Downloading data:   0%|          | 0.00/394M [00:00<?, ?B/s]

Generating 1929 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1929
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1930': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1930.tar.gz'}


Downloading data:   0%|          | 0.00/374M [00:00<?, ?B/s]

Generating 1930 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1930
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1931': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1931.tar.gz'}


Downloading data:   0%|          | 0.00/383M [00:00<?, ?B/s]

Generating 1931 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1931
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1932': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1932.tar.gz'}


Downloading data:   0%|          | 0.00/568M [00:00<?, ?B/s]

Generating 1932 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1932
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1933': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1933.tar.gz'}


Downloading data:   0%|          | 0.00/567M [00:00<?, ?B/s]

Generating 1933 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1933
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1934': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1934.tar.gz'}


Downloading data:   0%|          | 0.00/573M [00:00<?, ?B/s]

Generating 1934 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1934
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1935': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1935.tar.gz'}


Downloading data:   0%|          | 0.00/540M [00:00<?, ?B/s]

Generating 1935 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1935
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1936': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1936.tar.gz'}


Downloading data:   0%|          | 0.00/549M [00:00<?, ?B/s]

Generating 1936 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1936
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1937': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1937.tar.gz'}


Downloading data:   0%|          | 0.00/557M [00:00<?, ?B/s]

Generating 1937 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1937
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1938': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1938.tar.gz'}


Downloading data:   0%|          | 0.00/556M [00:00<?, ?B/s]

Generating 1938 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1938
Only taking a subset of years. Change name to 'all_years' to use all years in the dataset.
{'1939': 'https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1939.tar.gz'}


Downloading data:   0%|          | 0.00/470M [00:00<?, ?B/s]

Generating 1939 split: 0 examples [00:00, ? examples/s]

Loading associated
Searching within  1939


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Finished


In [None]:
# Manual checkpoint: run this cell after interrupting the search loop above to
# keep whatever hits have been collected so far.
# Update the file name as appropriate.
partial_csv = "AS_Explor_KW_Prox_Raw_SW_Part_3.csv"

results = results.reset_index(drop=True)
results.to_csv(partial_csv, index=False)
files.download(partial_csv)



In [None]:
# Pause here until the user presses Enter, giving them time to save the
# downloaded output file before any later cell runs.
print ("Now save output file to local disk")
input("Press Enter to continue...")


# **SOLVENT FRONT**

In [None]:
# DataFrame.info is a method: call it to print the column / dtype / non-null
# summary. Printing the attribute itself only shows the bound method's repr.
results.info()


In [None]:
# let's start with deciding which years we want data for
scan_level_desired_years = ["1900",]

# now let's load our data, we have to specify the huggingface location of our
# data, the fact that we want to have a subset of years, and our desired years
dataset_scan_level=load_dataset("dell-research-harvard/AmericanStories",
                                "subset_years_content_regions",
                                year_list=scan_level_desired_years
                                )

In [None]:
# NOTE(review): dataset_article_level is reassigned on every pass of the search
# loop, so after a full run it presumably holds only the last year loaded and has
# no '1900' key — confirm this cell is still valid or load 1900 explicitly first.
print(dataset_article_level['1900'][377385]["article"])