In [1]:
import re
from io import StringIO

import pandas as pd
import requests



In [2]:
def read_articles_csv(file_path):
    """
    Load the article table stored in a CSV file.

    Args:
        file_path (str): Location of the CSV file; passed straight to
            ``pd.read_csv``.

    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(file_path)


In [3]:
def get_unique_keywords(df_articles):
    """
    Collect the distinct values found in the 'keyword' column.

    Args:
        df_articles (pd.DataFrame): Article data; must have a 'keyword' column.

    Returns:
        list: Each distinct keyword once, in order of first appearance.
    """
    keyword_column = df_articles['keyword']
    distinct_values = keyword_column.unique()
    return list(distinct_values)

In [4]:
def fetch_newspapers_data(url='https://chroniclingamerica.loc.gov/newspapers.txt', timeout=30):
    """
    Fetches newspaper data from the Chronicling America website and returns a DataFrame.

    Args:
        url (str): Address of the pipe-delimited newspaper listing. Defaults to
            the Chronicling America listing.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        df_newspapers (pd.DataFrame): A DataFrame containing the newspaper data,
            with surrounding whitespace removed from the column names.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page as data.
    response.raise_for_status()

    # The listing is pipe-delimited ('|'), not tab-separated.
    df_newspapers = pd.read_csv(StringIO(response.text), delimiter='|')

    # Remove leading and trailing whitespace from the column names.
    df_newspapers.columns = [name.strip() for name in df_newspapers.columns]

    return df_newspapers

In [5]:
def get_newspapers_with_keywords(df_newspapers, keywords):
    """
    Filters a DataFrame of newspapers to keep only the rows where the 'Title' column
    contains any of the given keywords.

    Matching is a case-insensitive, literal substring search: both the titles
    and the keywords are lower-cased, and regex metacharacters in a keyword are
    escaped. Rows with a missing title never match.

    Args:
        df_newspapers (pd.DataFrame): The DataFrame containing newspaper data.
        keywords (list): A list of keywords to search for in the 'Title' column.
            Non-string and empty entries are ignored.

    Returns:
        newspapers_with_keywords (pd.DataFrame): A DataFrame containing the filtered
            newspaper data. Empty (same columns) when no usable keyword is given.
    """
    # Lower-case to agree with the lower-cased titles, then escape so each
    # keyword is matched literally rather than interpreted as a regex.
    terms = [re.escape(word.lower()) for word in keywords if isinstance(word, str) and word]
    if not terms:
        # An empty pattern would match every title; no keywords means no hits.
        return df_newspapers.iloc[0:0]

    # na=False keeps the mask strictly boolean when a title is missing.
    mask = df_newspapers['Title'].str.lower().str.contains('|'.join(terms), na=False)
    newspapers_with_keywords = df_newspapers.loc[mask]
    return newspapers_with_keywords

In [6]:
def get_keyword_LCCN(newspapers_with_keywords):
    """
    Return the 'LCCN' values of a newspaper table with surrounding whitespace removed.

    Args:
        newspapers_with_keywords (pd.DataFrame): Newspaper rows; must have an
            'LCCN' column of strings.

    Returns:
        list: One stripped 'LCCN' string per row, in row order.
    """
    raw_values = newspapers_with_keywords['LCCN'].tolist()
    return [lccn.strip() for lccn in raw_values]

In [7]:
def add_LCCN_column(df_articles):
    """
    Derive an 'LCCN' column from 'hit_id' and attach it to the articles table.

    The value is the part of each 'hit_id' that precedes the first "/".
    The column is written onto the DataFrame that was passed in.

    Args:
        df_articles (pd.DataFrame): Article data; must have a 'hit_id' column.

    Returns:
        df_articles (pd.DataFrame): The same DataFrame object, now carrying 'LCCN'.
    """
    id_segments = df_articles['hit_id'].str.split("/")
    leading_segment = id_segments.str.get(0)
    df_articles['LCCN'] = leading_segment
    return df_articles

In [8]:
def filter_articles_with_keywords_titles(df_articles, keyword_LCCN):
    """
    Filters a DataFrame of articles to keep only the rows where the 'LCCN' column
    contains any of the given 'LCCN' values.

    Matching is a case-insensitive, literal substring search. Rows with a
    missing 'LCCN' never match.

    Args:
        df_articles (pd.DataFrame): The DataFrame containing article data.
        keyword_LCCN (list): A list of 'LCCN' values to search for in the 'LCCN' column.
            Non-string and empty entries are ignored.

    Returns:
        articles_with_keywords_titles (pd.DataFrame): A DataFrame containing the filtered
            article data. Empty (same columns) when no usable 'LCCN' value is given.
    """
    # Lower-case to agree with the lower-cased column, then escape so each
    # value is matched literally rather than interpreted as a regex.
    terms = [re.escape(lccn.lower()) for lccn in keyword_LCCN if isinstance(lccn, str) and lccn]
    if not terms:
        # An empty pattern would match every row; nothing to look for means no hits.
        return df_articles.iloc[0:0]

    # na=False keeps the mask strictly boolean when an 'LCCN' is missing.
    mask = df_articles['LCCN'].str.lower().str.contains('|'.join(terms), na=False)
    articles_with_keywords_titles = df_articles.loc[mask]
    return articles_with_keywords_titles

In [9]:
def exclude_newspapers(df_articles, keyword_LCCN):
    """
    Filters a DataFrame of articles to exclude rows where the 'LCCN' column
    contains any of the given 'LCCN' values.

    Matching is a case-insensitive, literal substring search. Rows with a
    missing 'LCCN' cannot match an excluded value and are therefore kept.

    Args:
        df_articles (pd.DataFrame): The DataFrame containing article data.
        keyword_LCCN (list): A list of 'LCCN' values to exclude from the 'LCCN' column.
            Non-string and empty entries are ignored.

    Returns:
        excluding_those_newspapers (pd.DataFrame): A DataFrame containing the filtered
            article data. All rows are returned when there is nothing to exclude.
    """
    # Lower-case to agree with the lower-cased column, then escape so each
    # value is matched literally rather than interpreted as a regex.
    terms = [re.escape(lccn.lower()) for lccn in keyword_LCCN if isinstance(lccn, str) and lccn]
    if not terms:
        # An empty pattern matches every row, which would discard all articles;
        # with nothing to exclude, every article must be kept.
        return df_articles.copy()

    # na=False keeps the mask strictly boolean so it can be negated safely.
    excluded = df_articles['LCCN'].str.lower().str.contains('|'.join(terms), na=False)
    excluding_those_newspapers = df_articles.loc[~excluded]
    return excluding_those_newspapers

In [10]:
def filter_more_than_one_keyword(df_articles):
    """
    Keep only the articles whose 'hit_id' appears on more than one row.

    Args:
        df_articles (pd.DataFrame): Article data; must have a 'hit_id' column.

    Returns:
        pd.DataFrame: The rows of ``df_articles`` whose 'hit_id' is repeated,
            in their original order and with their original index.
    """
    occurrences = df_articles['hit_id'].value_counts()
    repeated_ids = occurrences.loc[occurrences.gt(1)].index
    is_repeated = df_articles['hit_id'].isin(repeated_ids)
    return df_articles[is_repeated]

In [11]:
def _strip_lccn_keys(frame):
    """Return a copy of ``frame`` whose string 'LCCN' values have no surrounding whitespace."""
    cleaned = frame.copy()
    cleaned['LCCN'] = cleaned['LCCN'].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
    return cleaned


def merge_with_title(df_articles, df_newspapers):
    """
    Merges two DataFrames, 'df_articles' and 'df_newspapers', on the 'LCCN' column.

    The join is a left join, so every article row is kept. 'LCCN' values are
    compared after stripping surrounding whitespace: elsewhere in this file the
    newspapers' 'LCCN' values have to be stripped before use, and joining on
    the raw values would leave every article without a match. Neither input
    DataFrame is modified.

    Args:
        df_articles (pd.DataFrame): The DataFrame containing article data.
        df_newspapers (pd.DataFrame): The DataFrame containing newspaper data.

    Returns:
        merged_df (pd.DataFrame): The merged DataFrame.
    """
    articles = _strip_lccn_keys(df_articles)
    newspapers = _strip_lccn_keys(df_newspapers)
    merged_df = articles.merge(newspapers, on='LCCN', how='left')
    return merged_df

In [12]:
def group_by_title_and_year(merged_df):
    """
    Count the non-missing values of every other column per ('Title', 'year') pair.

    Args:
        merged_df (pd.DataFrame): Data with 'Title' and 'year' columns.

    Returns:
        pd.DataFrame: One row per ('Title', 'year') group, with the group keys
            as ordinary columns followed by the per-column counts.
    """
    group_keys = ["Title", 'year']
    counts_per_group = merged_df.groupby(group_keys).count()
    return counts_per_group.reset_index()

In [13]:
# Main execution: load the articles, drop those from newspapers whose title
# contains a keyword, keep the hits that occur more than once, attach the
# newspaper metadata, and write the result to CSV.
# NOTE(review): the "..." prefixes in both file paths look like elided
# placeholders — confirm the real locations before running.
if __name__ == "__main__":
    # Load the articles and collect the distinct values of their 'keyword' column.
    df_articles = read_articles_csv("...data_input/total_result_1.csv")
    keywords = get_unique_keywords(df_articles)
    # Download the newspaper listing and find the titles that contain a keyword.
    df_newspapers = fetch_newspapers_data()
    newspapers_with_keywords = get_newspapers_with_keywords(df_newspapers, keywords)
    keyword_LCCN = get_keyword_LCCN(newspapers_with_keywords)
    # Derive each article's LCCN from its 'hit_id' so it can be compared to the list above.
    df_articles = add_LCCN_column(df_articles)
    # Articles from the keyword-titled newspapers; not used again in the visible code.
    articles_with_keywords_titles = filter_articles_with_keywords_titles(df_articles, keyword_LCCN)
    # Remaining articles, restricted to 'hit_id' values that appear on more than one row.
    excluding_those_newspapers = exclude_newspapers(df_articles, keyword_LCCN)
    more_than_one_keyword = filter_more_than_one_keyword(excluding_those_newspapers)
    # Attach the newspaper columns and save the result.
    with_title = merge_with_title(more_than_one_keyword, df_newspapers)
    with_title.to_csv(".../data_output/with_title.csv")