In [None]:
### This notebook searches cleaned, OCR-recognized article text for the string 'explor' (which matches 'explore', 'exploration', etc.) occurring near a keyword.
# It returns one row for each hit, uniquely identified by the article_ID and the location of the hit in the text.

In [1]:
import pandas as pd
import os
import re

# Current working directory with a trailing path separator, so file names can be
# appended by plain string concatenation. os.path.join(..., "") adds the separator
# that is correct for the running platform instead of a hard-coded backslash.
working_dir = os.path.join(os.getcwd(), "")


In [13]:
### Load the two input tables from disk

# Both inputs are looked up in the working directory:
# the article table (hits plus cleaned text) and the keyword list.
hit_file_path = working_dir + 'AM_Prox_SW_OCR_Concat.csv'
kw_file_path = working_dir + 'AS_Explore_KW_String_and_Word.csv'

# Article table: the first row of the file supplies the column names.
hit_df = pd.read_csv(hit_file_path)

# Keyword table: the file is read without a header row, so the three
# column names are supplied here.
kw_df = pd.read_csv(
    kw_file_path,
    header=None,
    names=['keyword', 'kw_year', 'kw_type'],
)

In [14]:
### Collapse duplicate article rows

# Remove the 'keyword_hit' column (in place) when it is present; otherwise only warn.
if 'keyword_hit' not in hit_df.columns:
    print("Warning: 'keyword_hit' column not found, skipping drop operation.")
else:
    hit_df.drop(columns=['keyword_hit'], inplace=True)

# Discard rows that are exact duplicates, then renumber the index from 0.
hit_df = hit_df.drop_duplicates().reset_index(drop=True)


In [4]:
### Generates filtered KW df based on a year

def get_filter_kw(search_year, kw_df):
    """
    Select the keywords whose 'kw_year' is no later than a given year.

    Parameters:
    - search_year: Cut-off year; any value accepted by int() (e.g. 1950 or '1950').
    - kw_df (pandas.DataFrame): Keyword table containing a 'kw_year' column.

    Returns:
    - pandas.DataFrame: The rows of kw_df where 'kw_year' <= int(search_year),
      with their original index labels.
    """
    cutoff_year = int(search_year)

    # Boolean mask marking the keywords at or before the cut-off year
    not_after_cutoff = kw_df['kw_year'] <= cutoff_year

    return kw_df.loc[not_after_cutoff]



In [16]:
### Function to find cases where 'explor' is within kw_dist of a keyword

def _find_keyword_positions(keyword, kw_type, text):
    """Return the start offset of every match of `keyword` in `text`, matched according to `kw_type`."""
    escaped = re.escape(keyword)
    if kw_type == 'string':
        # Plain substring match: the keyword may sit inside a longer word.
        pattern = escaped
    elif kw_type == 'word':
        # Whole-word match: require a regex word boundary on both sides.
        pattern = r'\b{}\b'.format(escaped)
    else:
        raise ValueError(f"Invalid kw_type '{kw_type}' for keyword '{keyword}'. Must be 'string' or 'word'.")
    return [m.start() for m in re.finditer(pattern, text)]


def find_explor_near_keywords(hit_df, kw_df, kw_dist=150):
    """
    Find occurrences of the string 'explor' close to keywords in each article.

    For every article, the keywords applicable to the article's year are selected
    with get_filter_kw. Each keyword is searched case-insensitively, either as a
    substring (kw_type 'string') or as a whole word (kw_type 'word'). For every
    keyword match at offset p, all occurrences of 'explor' lying entirely inside
    the window [p - kw_dist, p + kw_dist) of the lowercased text are reported.

    NOTE: kw_dist is measured in CHARACTERS, not words, because the window is a
    slice of the text by character offset.

    Parameters:
    - hit_df (pandas.DataFrame): Article table with columns 'article_ID',
      'article_year' and 'article' (the text). Rows whose 'article' value is not
      a string (e.g. NaN for missing text) are skipped.
    - kw_df (pandas.DataFrame): Keyword table with columns 'keyword', 'kw_type'
      and 'kw_year'.
    - kw_dist (int): Half-width of the search window, in characters. Default is 150.

    Returns:
    - pandas.DataFrame: One row per (keyword match, nearby 'explor' occurrence) with
      columns 'article_ID', 'article_year', 'article', 'keyword' and
      'explor_position' (offset of 'explor' in the lowercased article text).
      The same (article, keyword, position) combination can appear more than once
      when a keyword matches several times near the same occurrence; callers that
      need unique rows should drop duplicates. Empty (with the same columns) when
      nothing is found.

    Raises:
    - ValueError: If an applicable keyword has a kw_type other than 'string' or 'word'.
    """
    result_columns = ['article_ID', 'article_year', 'article', 'keyword', 'explor_position']
    results = []

    for _, row in hit_df.iterrows():
        article_text = row['article']

        # Missing text is loaded by pandas as NaN (a float). There is nothing to
        # search in such a row, so skip it instead of failing on .lower().
        if not isinstance(article_text, str):
            continue

        article_year = row['article_year']

        # Only keywords whose kw_year is not after the article's year apply
        filtered_kw_df = get_filter_kw(article_year, kw_df)

        # Lowercase once per article for case-insensitive matching
        article_lower = article_text.lower()
        text_length = len(article_lower)

        for _, kw_row in filtered_kw_df.iterrows():
            keyword = kw_row['keyword'].lower()

            for kw_position in _find_keyword_positions(keyword, kw_row['kw_type'], article_lower):
                # Character window around the start of the keyword match
                window_start = max(0, kw_position - kw_dist)
                window_end = min(text_length, kw_position + kw_dist)

                for explor_match in re.finditer(r'explor', article_lower[window_start:window_end]):
                    results.append({
                        'article_ID': row['article_ID'],
                        'article_year': article_year,
                        'article': article_text,
                        'keyword': kw_row['keyword'],
                        # Shift the in-window offset back to an offset in the full text
                        'explor_position': window_start + explor_match.start(),
                    })

    # Build the frame once; an empty result still carries the expected columns
    return pd.DataFrame(results, columns=result_columns)


In [30]:
# Run the proximity search over every article, using a search distance of 150.
result_df = find_explor_near_keywords(
    hit_df=hit_df,
    kw_df=kw_df,
    kw_dist=150,
)

In [32]:
# result_df comes from find_explor_near_keywords.
# Discard exact duplicate hit rows and renumber the index from 0.
result_df = result_df.drop_duplicates().reset_index(drop=True)


In [71]:
import pandas as pd

# result_df is expected to be populated and already de-duplicated.
# Merge all hits that refer to the same 'explor' occurrence: one output row per
# (article_ID, explor_position), with every keyword found near that occurrence
# joined into a single comma-separated string.
combined_df = result_df.groupby(['article_ID', 'explor_position'], as_index=False).agg({
    'keyword': lambda x: ', '.join(x),
    'article_year': 'first',  # Assuming article_year is the same for grouped rows
    'article': 'first'  # Assuming article text is the same for grouped rows
})

# Order by year. A stable sort keeps rows of the same year in the groupby order
# (article_ID, then explor_position), so the row numbering written out later is
# reproducible between runs. Then renumber the index from 0.
combined_df = (
    combined_df
    .sort_values(by='article_year', kind='mergesort')
    .reset_index(drop=True)
)


In [74]:
# Save the combined hits as CSV (relative path); the row index is written as the first column.
combined_df.to_csv(path_or_buf='ASEx_Unique_Hits.csv', index=True)