<a href="https://colab.research.google.com/github/grfaith/AmericanStories/blob/main/Finding_explore_kw_proximity_hits_in_AS_string_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
###Installs

!pip install datasets
!pip install ipympl


This notebook scans a range of years in the American Stories data, finds articles in which a keyword from an uploaded keyword list occurs within `kw_distance` characters of a word containing the string 'explor', and saves the matching articles' information to disk.

In [None]:
#Imports
import json
import pandas as pd
from datasets import load_dataset
import tqdm as tq
from google.colab import files
import re

# Maximum gap, in characters, allowed between the start of an 'explor' match
# and the start of a keyword match for the pair to count as a proximity hit.
kw_distance = 150


In [None]:
# Full ChronAm extends back to 1774 (I think). Previous searches have returned no hits.
# NOTE(review): "returned no hits" is the presumed meaning of the original typo
# ("returnedo hits") - confirm which years that remark refers to.
start_year = 1910 # First year searched (inclusive)

# Complete dataset runs to 1940 inclusive, but this particular implementation
# runs out of space about 1905, so the search is broken into chunks.
# The main loop iterates range(start_year, end_year), so end_year itself is
# NOT searched: use 1941 here if 1940 should be included.
end_year = 1940 # One past the last year searched (exclusive)

In [None]:
### Upload the keyword CSV from the local drive
kw_file = files.upload()

# Names given to the three CSV columns (the file is read as having no header row)
column_names = ["kword", "kwyear", "kwtype"]

# Load the upload into kw_df.
# NOTE: kw_df is reassigned for every uploaded file, so if several files are
# selected only the last one read is kept.
for fn in kw_file:
    kw_df = pd.read_csv(fn, header=None, names=column_names)

# Report the name and size of each uploaded file
for fn in kw_file:
    print('User uploaded file "{name}" with length {length} bytes'.format(
        length=len(kw_file[fn]), name=fn))



In [None]:
# Defining function to load dataset

def load_text_dataset(dataset_year_str):
    """
    Fetch one year of American Stories data from the HuggingFace Hub.

    Parameters:
        dataset_year_str (str): The year to load, as a string (e.g. "1910").

    Returns:
        dataset_article_level: whatever load_dataset returns for the
        "subset_years" configuration restricted to the requested year.
    """
    # HuggingFace location of the data, and the single year we want from it
    hub_location = "dell-research-harvard/AmericanStories"
    requested_years = [dataset_year_str]

    # "subset_years" tells the loader that only the years in year_list are wanted
    return load_dataset(
        hub_location,
        "subset_years",
        year_list=requested_years,
        trust_remote_code=True,
    )

In [None]:
### Function to filter kw data in use based on years of discovery in kw file.

def get_kw(dataset_year_str):
    """
    Select the keywords that apply to a given dataset year.

    Reads the module-level kw_df DataFrame (built from the uploaded keyword
    file) and keeps the rows whose 'kwyear' value is less than or equal to
    the requested year.

    Parameters:
        dataset_year_str (str or int): The dataset year; converted with int().
    Returns:
        pandas.DataFrame: The rows of kw_df with kwyear <= the dataset year
    """
    # Accept either "1910" or 1910
    year_cutoff = int(dataset_year_str)

    # Boolean mask of the keywords whose year is not later than the dataset year
    applies_to_year = kw_df['kwyear'] <= year_cutoff

    return kw_df[applies_to_year]



In [None]:

def process_kw(dataset_year_str, kw_df_filter, dataset_article_level):
    """
    Run the proximity search for every keyword in a DataFrame against one year of data.

    Parameters:
        dataset_year_str (str): The year being searched; used as the key into
            dataset_article_level and echoed in the progress message.
        kw_df_filter (pandas.DataFrame): The keywords to search for. The first
            column holds the keyword and the third column its type
            ("string" or "word").
        dataset_article_level (DatasetDict): A dictionary-like object containing datasets for different years.

    Returns:
        pandas.DataFrame: All hits for the year, one row per hit, with a fresh
        0..n-1 index. An empty DataFrame if nothing was found.
    """
    print ("Searching within ", dataset_year_str)

    # Collect the per-keyword results and concatenate once at the end:
    # concatenating inside the loop recopies everything found so far on every
    # keyword, and concatenating empty frames is deprecated in recent pandas.
    per_keyword_hits = []

    # Plain tuples keep the positional access (column 0 = keyword, column 2 = type)
    for kw_row in kw_df_filter.itertuples(index=False, name=None):
        explore_kw = kw_row[0]
        kw_type = kw_row[2]
        result_df = kw_pair_search(dataset_article_level, dataset_year_str, explore_kw, kw_distance, kw_type)
        if not result_df.empty:
            per_keyword_hits.append(result_df)

    # pd.concat raises ValueError on an empty list, so handle "no hits" explicitly
    if not per_keyword_hits:
        return pd.DataFrame()

    return pd.concat(per_keyword_hits, ignore_index=True)


In [None]:
## Keyword search within articles to find keywords within kw_distance of the string 'explor'

def kw_pair_search(dataset_article, dataset_year, explor_kw, kw_distance, kw_type):
    """
    Find articles in which a keyword occurs close to a word containing 'explor'.

    The search is case-insensitive: every article is lower-cased before
    matching, and the keyword is lower-cased for the cheap substring
    pre-check, so keywords written with capitals (e.g. "Africa") match too.

    Parameters:
        dataset_article (DatasetDict): A dictionary-like object containing datasets for different years.
        dataset_year (str): The key of the year to search within dataset_article.
        explor_kw (str): The keyword to search for.
        kw_distance (int): The maximum distance, in characters, allowed between
            the start of an 'explor' match and the start of a keyword match.
        kw_type (str): "string" to match the keyword anywhere inside a word,
            "word" to match it only as a whole word.

    Returns:
        df_of_articles_containing_two_words (DataFrame): One row for every
        ('explor' match, keyword match) pair within kw_distance of each other,
        so an article can appear several times. Each row carries the article's
        metadata and text plus 'keyword_hit', 'kw_index' and 'explor_index';
        the two indices are offsets into the lower-cased article text. The
        DataFrame is empty (no columns) when nothing matches.

    Raises:
        ValueError: If kw_type is neither "string" nor "word". This is checked
            up front, before any article is scanned.
    """
    # Build both patterns once instead of once per candidate article
    escaped_kw = re.escape(explor_kw)
    if kw_type == "string":
        # The keyword anywhere inside a word
        pattern_kw = re.compile(r'\b\w*' + escaped_kw + r'\w*\b', flags=re.IGNORECASE)
    elif kw_type == "word":
        # The keyword as a whole word only
        pattern_kw = re.compile(r'\b' + escaped_kw + r'\b', flags=re.IGNORECASE)
    else:
        raise ValueError("Invalid kw_type. Must be either 'string' or 'word'.")

    # Any word containing "explor" (explore, explorer, unexplored, ...)
    pattern_explor = re.compile(r'\b\w*explor\w*\b')

    # The article text is lower-cased below, so the pre-check needs a
    # lower-cased keyword as well
    kw_lower = explor_kw.lower()

    # Access the dataset for the specific year and its 'article' column
    year_dataset = dataset_article[dataset_year]
    articles = year_dataset['article']

    # Create empty list to store matching articles
    articles_containing_two_words = []

    for article_n, article_text in enumerate(articles):
        article_text = article_text.lower()

        # Cheap substring test before running the regexes
        if "explor" not in article_text or kw_lower not in article_text:
            continue

        # Start offsets of every match in the lower-cased text
        explor_indices = [match.start() for match in pattern_explor.finditer(article_text)]
        kw_indices = [match.start() for match in pattern_kw.finditer(article_text)]

        # Every pair that lies within the allowed distance
        close_pairs = [
            (explor_index, kw_index)
            for explor_index in explor_indices
            for kw_index in kw_indices
            if abs(explor_index - kw_index) <= kw_distance
        ]
        if not close_pairs:
            continue

        # Fetch the row once per article rather than once per output field
        record = year_dataset[article_n]
        for explor_index, kw_index in close_pairs:
            articles_containing_two_words.append({
                'article_year': dataset_year,
                'row_number': article_n,
                'article_ID': record["article_id"],
                'newspaper_name': record["newspaper_name"],
                'edition': record["edition"],
                'date': record["date"],
                'page': record["page"],
                'headline': record["headline"],
                'byline': record["byline"],
                'article': record["article"],
                'keyword_hit': explor_kw,
                'kw_index': kw_index,
                'explor_index': explor_index,
            })

    # Convert the list of dictionaries to a DataFrame
    df_of_articles_containing_two_words = pd.DataFrame(articles_containing_two_words)
    return df_of_articles_containing_two_words


In [None]:
# Main search: for every year in [start_year, end_year) load that year's data,
# pick the keywords that apply to it, search, and add the hits to `results`.
# `results` is grown one year at a time so that, if the loop is interrupted,
# the save-progress cell can still write out what has been found so far.
results = pd.DataFrame()

for loop_year in range(start_year, end_year):
    # The dataset is keyed by the year as a string
    dataset_year_str = str(loop_year)
    try:
        # Load dataset for current loop_year
        dataset_article_level = load_text_dataset(dataset_year_str)

        # Get valid keyword filters for current year
        # (the integer year is passed here; get_kw converts it with int())
        kw_df_filter = get_kw(loop_year)
        # print ("Processing year", dataset_year_str)

        # Process keyword filters and dataset for current loop year to get hits
        year_search_result = process_kw(dataset_year_str,kw_df_filter, dataset_article_level)

        # if not year_search_result.empty:
        #   print("Found: ", year_search_result)
        # else:
        #   print("No results found for ", dataset_year_str)

        # Concatenate the single search result onto the results DataFrame
        results = pd.concat([results, year_search_result])

    except ValueError:
        # NOTE(review): this catches a ValueError raised by ANY of the calls
        # above, not only by loading. kw_pair_search raises ValueError for an
        # invalid kw_type, and that case is reported with the same
        # "Dataset empty" message and the year is skipped.
        print(f"Dataset empty for {dataset_year_str}. Moving to the next year.")
        continue

# Renumber the rows 0..n-1, write everything to CSV and hand the file to
# files.download so it can be saved locally
results = results.reset_index(drop=True)
results.to_csv('AS_Explor_KW_Hits_Prox_SW_Part_3_In Fxn.csv', index=False)
files.download("AS_Explor_KW_Hits_Prox_SW_Part_3_In Fxn.csv")
print("Finished")


In [None]:
# Run this cell if you have to interrupt the above loop to save progress.
#  Update file names as appropriate.
# It renumbers the rows of whatever has accumulated in `results`, writes them
# to CSV (without the index column) and hands the file to files.download.


results = results.reset_index(drop=True)
results.to_csv('AS_Explor_KW_Hits_Prox_SW_Part_3.csv', index=False)
files.download("AS_Explor_KW_Hits_Prox_SW_Part_3.csv")



In [None]:
# Pause point: remind the user to save the downloaded CSV, then block until
# Enter is pressed.
print ("Now save output file to local disk")
input("Press Enter to continue...")


# **SOLVENT FRONT**

In [None]:
# Show a concise summary of the results DataFrame.
# info is a method, so it has to be called; it writes the summary itself and
# returns None, which is why it is not wrapped in print().
results.info()


In [None]:
# let's start with deciding which years we want data for
scan_level_desired_years = ["1900",]

# now let's load our data, we have to specify the huggingface location of our
# data, the fact that we want to have a subset of years, and our desired years.
# trust_remote_code=True is passed to match load_text_dataset, which loads the
# same dataset with that flag.
dataset_scan_level=load_dataset("dell-research-harvard/AmericanStories",
                                "subset_years_content_regions",
                                year_list=scan_level_desired_years,
                                trust_remote_code=True
                                )

In [None]:
# Scratch check: print the text of a single article, addressed by year key and row number.
# NOTE(review): dataset_article_level is rebound on every pass of the main
# search loop, so it presumably holds only the last year loaded; the '1900'
# key will be missing unless that year was 1900 - confirm before running.
print(dataset_article_level['1900'][377385]["article"])