<a href="https://colab.research.google.com/github/grfaith/Dissertation/blob/master/Finding_explore_kw_hits_in_AS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
###Installs

!pip install datasets
!pip install ipympl




This notebook scans a range of years in the American Stories dataset, finds articles that contain the string 'explor' together with at least one keyword from an uploaded keyword list, and saves the metadata of those articles to disk.

In [2]:
#Imports
import json
import pandas as pd
from datasets import load_dataset
import tqdm as tq
from google.colab import files

start_year = 1820  # First year to search (inclusive)
end_year = 1827  # Exclusive upper bound: Python's range(start, end) stops at end - 1, so the last year searched is 1826

In [18]:
### Cell for loading files from local drive
# files.upload() opens the Colab file picker; the returned mapping is used below
# as {file name: file contents}.
kw_file = files.upload()

# Column labels applied to the keyword CSV (keyword, year attached to that keyword)
column_names = ["kword", "kwyear"]

# Parse every uploaded file as a header-less CSV. kw_df is rebound on each pass,
# so when several files are uploaded only the last one read is kept.
for uploaded_name in kw_file:
    kw_df = pd.read_csv(uploaded_name, header=None, names=column_names)

# Report the name and size of each uploaded file
upload_report = 'User uploaded file "{name}" with length {length} bytes'
for uploaded_name, payload in kw_file.items():
    print(upload_report.format(name=uploaded_name, length=len(payload)))



In [4]:
# Defining function to load dataset

def load_text_dataset(dataset_year_str):
    """
    Pull the American Stories data for a single year from the HuggingFace Hub.

    Parameters:
        dataset_year_str (str or int): The year to download. The value is
            converted with str() before being passed on, so 1820 and "1820"
            are treated the same.

    Returns:
        The object returned by load_dataset for the "subset_years"
        configuration restricted to that one year. The rest of the notebook
        indexes it by the year string (e.g. result["1820"]).
    """
    # Normalise once so callers may pass either an int or a str year.
    year_key = str(dataset_year_str)

    # "subset_years" + year_list limits the download to the requested year.
    # trust_remote_code=True is passed because the dataset is loaded with
    # that flag throughout this notebook.
    dataset_article_level = load_dataset(
        "dell-research-harvard/AmericanStories",
        "subset_years",
        year_list=[year_key],
        trust_remote_code=True,
    )

    return dataset_article_level

In [5]:
### Function to filter kw data in use based on years of discovery in kw file.

def get_kw(dataset_year_str):
    """
    Select the keywords that apply to a given dataset year.

    Reads the notebook-level DataFrame ``kw_df`` (created in the upload cell)
    and keeps only the rows whose 'kwyear' value is not later than the
    requested year.

    Parameters:
        dataset_year_str (str or int): Year to filter against; converted
            with int() before the comparison.

    Returns:
        pandas.DataFrame: The rows of kw_df with kwyear <= the requested year.
    """
    cutoff_year = int(dataset_year_str)

    # Boolean mask: True where the keyword's year is at or before the cutoff
    within_cutoff = kw_df['kwyear'].le(cutoff_year)

    return kw_df.loc[within_cutoff]



In [6]:

def process_kw(dataset_year_str, kw_df_filter, dataset_article_level):
    """
    Run the keyword search for every keyword that applies to one year.

    Parameters:
        dataset_year_str (str): Year being processed; forwarded to
            kw_pair_search as the key into dataset_article_level.
        kw_df_filter (pandas.DataFrame): Keywords to search for. The keyword
            is read from the first column of each row.
        dataset_article_level: Loaded dataset, forwarded unchanged to
            kw_pair_search.

    Returns:
        pandas.DataFrame: All hits for the year, one row per
        (keyword, article) match, with a fresh 0..n-1 index. An empty
        DataFrame is returned when there are no keywords or no hits.
    """
    # Nothing to search for (no rows, or no columns to read a keyword from).
    if kw_df_filter.empty:
        return pd.DataFrame()

    # Collect the per-keyword frames and concatenate once at the end;
    # concatenating inside the loop re-copies the accumulated rows each time.
    per_keyword_hits = []
    for explore_kw in kw_df_filter.iloc[:, 0]:
        result_df = kw_pair_search(dataset_article_level, dataset_year_str, explore_kw)
        # Skip empty results so pd.concat never sees column-less frames.
        if not result_df.empty:
            per_keyword_hits.append(result_df)

    if not per_keyword_hits:
        return pd.DataFrame()

    return pd.concat(per_keyword_hits, ignore_index=True)


In [7]:
def kw_pair_search(dataset_article, dataset_year, explor_kw):
    """
    Find the articles of one year that contain both "explor" and a keyword.

    Matching is a case-insensitive substring test: the article text and the
    keyword are both lower-cased before comparison.

    Parameters:
        dataset_article: Mapping from year key to that year's dataset. The
            year's dataset must support ['article'] (all article texts) and
            [row_number] (one record as a mapping of field name to value).
        dataset_year (str): Key used to look up the year in dataset_article;
            also stored verbatim in the 'article_year' column.
        explor_kw (str): The keyword to search for.

    Returns:
        pandas.DataFrame: One row per matching article with the columns
        article_year, keyword hit, row_number, article_ID, newspaper_name,
        edition, date, page, headline and byline. Empty (no columns) when
        nothing matches.
    """
    # Access the dataset for the specific year
    year_dataset = dataset_article[dataset_year]

    # Lower-case the keyword once; the article text is lower-cased below, so
    # without this a keyword containing capitals could never match.
    keyword = str(explor_kw).lower()

    matching_articles = []

    for article_n, article_text in enumerate(year_dataset['article']):
        # Records without text cannot match (and None has no .lower()).
        if not article_text:
            continue

        lowered_text = article_text.lower()
        if "explor" not in lowered_text or keyword not in lowered_text:
            continue

        # Fetch the record once instead of re-indexing the dataset per field.
        record = year_dataset[article_n]
        matching_articles.append({
            'article_year': dataset_year,
            'keyword hit': explor_kw,
            'row_number': article_n,
            'article_ID': record["article_id"],
            'newspaper_name': record["newspaper_name"],
            'edition': record["edition"],
            'date': record["date"],
            'page': record["page"],
            'headline': record["headline"],
            'byline': record["byline"],
        })

    # Convert the list of dictionaries to a DataFrame
    return pd.DataFrame(matching_articles)

In [12]:
# Search every year in [start_year, end_year) and collect the hits.
results = pd.DataFrame()
yearly_hits = []  # one DataFrame per year that produced hits

for loop_year in range(start_year, end_year):
    dataset_year_str = str(loop_year)
    try:
        # Load dataset for current loop_year
        dataset_article_level = load_text_dataset(dataset_year_str)

        # Get valid keyword filters for current year
        kw_df_filter = get_kw(dataset_year_str)

        # Process keyword filters and dataset for current loop year to get hits
        year_search_result = process_kw(dataset_year_str, kw_df_filter, dataset_article_level)

        # Keep only non-empty results so the final concat sees real rows only
        if not year_search_result.empty:
            yearly_hits.append(year_search_result)

    except ValueError:
        # A ValueError anywhere in this year's processing is reported as an
        # empty dataset and the year is skipped.
        print(f"Dataset empty for {dataset_year_str}. Moving to the next year.")
        continue

# Concatenate once, after the loop, with a fresh 0..n-1 index
if yearly_hits:
    results = pd.concat(yearly_hits, ignore_index=True)

results.to_csv('AS_Explor_KW_Hits_Raw.csv', index=False)
files.download("AS_Explor_KW_Hits_Raw.csv")
print("Finished")


Finished


In [10]:
# Renumber the rows of `results` 0..n-1, discarding the old index.
# The search cell performs the same reset before writing the CSV.
results = results.reset_index(drop=True)

In [13]:
# Plain-text dump of the collected hits (pandas wraps wide frames across several blocks)
print (results)


   article_year keyword hit  row_number  \
0          1821        mars         182   
1          1821        star          58   
2          1822    astronom         224   
3          1822    herschel         224   
4          1822       space         224   
5          1823      heaven         513   
6          1823        mars        1044   
7          1823      planet         513   
8          1823       space         513   
9          1823        star         176   
10         1823        star        1034   
11         1824      heaven         482   
12         1824        mars         325   
13         1824        moon         909   
14         1824        moon        1018   
15         1824        star         482   
16         1825    astronom         190   
17         1825       comet        1951   
18         1825      heaven        1951   
19         1825      heaven        3614   
20         1825       space        1951   
21         1825        star        2431   
22         

In [None]:
# Ad-hoc check: load the 1827 data on its own. `foo` is not referenced by any other visible cell.
foo=load_text_dataset("1827")

In [17]:
# Re-trigger the browser download of the CSV written by the search cell.
# `files` is imported again here so this cell can run without the imports cell.
from google.colab import files
files.download("AS_Explor_KW_Hits_Raw.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# ***SOLVENT FRONT***

***Code above this line has been rechecked against AmStories and works properly.***

In [None]:
#Defining Functions

# Def Load Text Dataset
# Def Process = Search on two keywords
# AmStory light cleanup
# GPT OCR Cleanup
# GPT Explore space
# Evaluate explore space (for multiple runs)
# Sort by vote counts into six pops (5Y, 0N - 4/1, 3/2, 2/3, 1/4, 0/5)

Defining Functions

*1.  Def Load Dataset*

*2.  Def Process = Search on two keywords*
3.  GPT OCR Cleanup
4.  GPT Explore space
5.  Evaluate explore space (for multiple runs)
6.  Sort by vote counts into six pops (5Y, 0N - 4/1, 3/2, 2/3, 1/4, 0/5)
7.  Match with coded results
8.  Validate

In [None]:
# Build a DataFrame from the feature description of the most recently loaded year.
# NOTE(review): this cell depends on `dataset_article_level` and `dataset_year_str`
# left over from the search loop, so it only works after that loop has run.

# Assumes dataset_article_level[<year>].features behaves like a dictionary — TODO confirm
features_dict = dataset_article_level[dataset_year_str].features

# Wrapping the dict in a list yields a ONE-row frame (one column per key),
# not an empty frame.
current_year_df = pd.DataFrame([features_dict])


In [None]:
#Loop

  #For each year in range
     #Load dataset, and then process
        #Process loop on keywords
        #Save to file

#Save to file (checkpoint for raw explor corpus)

# Mechanics for next pending size
# Run light AmStory cleanup
# Run GPT OCR cleanup
# Run 5x explore space decision via GPT
# Split out results based on similar votes
# Add in links to existing coding responses
# Validate decisions (maybe sample of yes/no and some larger portion of mixed votes to get a feel for data)

# Analysis?  Decide next steps?  Consider Hathi Dictionary run?


In [None]:
# Display the first few rows of the DataFrame to inspect the data
print("First few rows of the DataFrame:")
print(results.head())

# Get the shape of the DataFrame
print("\nShape of the DataFrame:")
print(results.shape)

# Get the data types of each column in the DataFrame
print("\nData types of each column:")
print(results.dtypes)

# Concise summary of the DataFrame.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
print("\nSummary of the DataFrame:")
results.info()