In [26]:
import pandas as pd
import numpy as np
# Get the parquet file from the current data source, an S3 bucket:
# sneakedreferences/processable-references/run_2024_08_19/2024_8_19T15_18_19.parquet
# The parquet file is downloaded to the local machine.
# Default input: name of the locally stored parquet file.
FILENAME = "2024_8_19T15_18_19.parquet"


def read_data(parquet_filename = FILENAME):
    """Reads parquet file and returns a dataframe"""
    # NOTE: the docstring above doubles as the progress message printed
    # below, so its wording is part of the program output.
    #
    # parquet_filename: path handed straight to ``pd.read_parquet``.
    # Returns the DataFrame produced by pandas, unmodified.
    label = read_data.__name__
    print(f"{label}: {read_data.__doc__}")
    return pd.read_parquet(parquet_filename)

def rearrange(issn):
    """Flatten raw ISSN records into a list of 2-tuples of strings.

    For every record in ``issn`` the result holds
    ``(str(record[0][1]), str(record[1][1]))``, i.e. the second item of the
    record's first and second entries.

    NOTE(review): presumably each record looks like
    ``[("type", "print"), ("value", "1234-5678")]`` -- confirm against the
    parquet schema.

    Args:
        issn: iterable of records as described above, or ``None`` when the
            row carries no ISSN information.

    Returns:
        A list of ``(str, str)`` tuples; empty when ``issn`` is ``None`` or
        contains no records.
    """
    # A missing ISSN value is treated as "no ISSNs" instead of failing with
    # a TypeError when iterating over None.
    if issn is None:
        return []
    return [(f"{record[0][1]}", f"{record[1][1]}") for record in issn]

def split_issn(issn):
    """Pick the print and electronic ISSN out of a list of entries.

    Every entry is tested with ``in`` for the markers "print" and
    "electronic" (print is checked first); the value stored is the entry's
    item at index 1. When several entries carry the same marker the last
    one wins.

    Returns a ``(print_issn, electronic_issn)`` tuple; a slot is ``None``
    when no entry carried the corresponding marker.
    """
    found = {"print": None, "electronic": None}
    for entry in issn:
        for marker in ("print", "electronic"):
            if marker in entry:
                found[marker] = entry[1]
                # An entry counts for at most one marker, print first.
                break
    return found["print"], found["electronic"]

def fix_issn(row):
    """Derive the print and electronic ISSN for one dataframe row.

    Reads ``row['issn']``, flattens it with ``rearrange`` and hands the
    result to ``split_issn``.

    Returns a ``(print_issn, electronic_issn)`` tuple.
    """
    pairs = rearrange(row['issn'])
    print_issn, electronic_issn = split_issn(pairs)
    return print_issn, electronic_issn

def separate_container_title(data):
    """Turn a numpy array of titles into one comma-separated string.

    Values that are not ``np.ndarray`` instances are returned unchanged;
    arrays are converted to a list and joined with ", ", which gives a
    plain string that is easier to group on.
    """
    if not isinstance(data, np.ndarray):
        return data
    return ", ".join(data.tolist())

def prepare_data_frame(filename = FILENAME):
    """generates dataframe, processes ISSNs, adds counts"""
    # NOTE: the docstring above doubles as the progress message printed
    # below, so its wording is part of the program output.
    #
    # filename: parquet file passed on to ``read_data``.
    # Returns the loaded dataframe with the derived columns added and the
    # raw 'issn' column removed.
    print(f"{prepare_data_frame.__name__}: {prepare_data_frame.__doc__}")
    frame = read_data(filename)

    # One (print, electronic) pair per row, expanded into two columns.
    issn_columns = frame.apply(fix_issn, axis=1, result_type='expand')
    frame[['print_issn', 'electronic_issn']] = issn_columns

    # Sorted, comma-joined tokens give a stable string to group on.
    frame['separated_tokens'] = frame['token_vocabulary'].apply(
        lambda tokens: ", ".join(sorted(tokens))
    )
    frame['container_title'] = frame['container_title'].apply(separate_container_title)

    # Ratio of cleaned references to all references, computed row by row.
    frame['ref_pge'] = frame.apply(
        lambda record: record['cleaned_references_length'] / record['total_reference_length'],
        axis=1,
    )
    frame.drop(columns=['issn'], inplace=True)

    # group counts: number of DOIs per (token string, ISSN) combination
    for kind in ('print', 'electronic'):
        grouped = frame.groupby(['separated_tokens', f'{kind}_issn'])
        frame[f'token_counts_by_{kind}_issn'] = grouped['DOI'].transform('count')
    return frame

def in_title(row):
    """Report whether any vocabulary token occurs in ``row['title']``.

    The comparison is a case-insensitive substring test of each token
    against the title.

    Args:
        row: mapping-like object (e.g. a dataframe row) providing the keys
            'title' and 'token_vocabulary'.

    Returns:
        "Yes" if at least one token is contained in the title, otherwise
        "No". "No" is also returned when 'token_vocabulary' is not a numpy
        array, or when the title is empty, missing or not a string (for
        example NaN).
    """
    title = row['title']
    tokens = row['token_vocabulary']
    # Only numpy-array vocabularies are inspected; a non-string title cannot
    # contain a token, so it is answered with "No" instead of failing on
    # ``.lower()``.
    if not isinstance(tokens, np.ndarray) or not isinstance(title, str) or not title:
        return "No"
    # Lower-case the title once rather than once per token.
    lowered_title = title.lower()
    # any() stops at the first matching token.
    if any(token.lower() in lowered_title for token in tokens):
        return "Yes"
    return "No"
    
def in_container_title(row):
    """Report whether any vocabulary token occurs in ``row['container_title']``.

    The comparison is a case-insensitive substring test of each token
    against the container title.

    Args:
        row: mapping-like object (e.g. a dataframe row) providing the keys
            'container_title' and 'token_vocabulary'.

    Returns:
        "Yes" if at least one token is contained in the container title,
        otherwise "No". "No" is also returned when 'token_vocabulary' is not
        a numpy array, or when the container title is empty, missing or not
        a string (for example NaN).
    """
    container_title = row['container_title']
    tokens = row['token_vocabulary']
    # Only numpy-array vocabularies are inspected; a non-string title cannot
    # contain a token, so it is answered with "No" instead of failing on
    # ``.lower()``.
    if (
        not isinstance(tokens, np.ndarray)
        or not isinstance(container_title, str)
        or not container_title
    ):
        return "No"
    # Lower-case the title once rather than once per token.
    lowered_title = container_title.lower()
    # any() stops at the first matching token.
    if any(token.lower() in lowered_title for token in tokens):
        return "Yes"
    return "No"

def get_book_chapter(df):
    """Returns rows that are only book chapters"""
    # NOTE: the docstring above doubles as the progress message printed
    # below, so its wording is part of the program output.
    #
    # df: dataframe with a 'work_type' column plus the columns read by
    #     ``in_container_title`` and ``in_title``.
    # Returns a copy of the 'book-chapter' rows with two added "Yes"/"No"
    # columns: 'container_title_flag' and 'title_flag'.
    print(f"{get_book_chapter.__name__}: {get_book_chapter.__doc__}")
    is_chapter = df['work_type'] == 'book-chapter'
    chapters = df[is_chapter].copy()
    flag_checks = (
        ('container_title_flag', in_container_title),
        ('title_flag', in_title),
    )
    for column, check in flag_checks:
        chapters[column] = chapters.apply(check, axis=1)
    return chapters

def prepare_output_df(bc, filename = 'book_chapters.csv'):
    """processes dataframe column headings for better readability, outputs dataframe as a csv file"""
    # NOTE: the docstring above doubles as the progress message printed
    # below, so its wording is part of the program output.
    #
    # bc:       dataframe containing at least the columns listed in
    #           ``column_order``; it is not modified.
    # filename: destination path of the CSV file.
    # Returns None. A failed write is reported on stdout and not re-raised.
    print(f"{prepare_output_df.__name__}: {prepare_output_df.__doc__}")
    # re-ordering columns for readability (columns not listed are dropped)
    column_order = [
        'DOI', 'separated_tokens', 'token_frac_refs', 'author', 'title',
        'container_title', 'flag', 'title_flag', 'container_title_flag',
        'member', 'ref_pge', 'total_reference_length',
    ]
    # renaming columns to descriptive headings for the CSV
    rename_cols = {
        'separated_tokens': "most occuring token counted over all processed references",
        'token_frac_refs': "Percentage of references in which the token(s) appears",
        'flag': "author flag",
        'ref_pge': "Percentage of references that are processed compared to the total number of references in the article",
        'total_reference_length': "Total no. of references",
    }
    report = bc[column_order].rename(columns=rename_cols)
    # outputting file
    try:
        report.to_csv(filename, index=False)
    except Exception as e:
        # Best effort: report the problem and carry on, as before.
        print("ERROR: ", e)
    else:
        # Only announce the file location when the write actually succeeded.
        print(f"CSV file located here: {filename}")

In [27]:
# Run the pipeline end to end with the default file names.
# Load the parquet file and add the derived ISSN, token and count columns.
df = prepare_data_frame()
# Keep only the 'book-chapter' rows and add the title / container-title flags.
bc = get_book_chapter(df)
# Re-order and rename the columns, then write the result to 'book_chapters.csv'.
prepare_output_df(bc)

prepare_data_frame: generates dataframe, processes ISSNs, adds counts
read_data: Reads parquet file and returns a dataframe
get_book_chapter: Returns rows that are only book chapters
prepare_output_df: processes dataframe column headings for better readability, outputs dataframe as a csv file
CSV file located here: book_chapters.csv
