#### Run all cells to generate a CSV file
#### This notebook generates a CSV file of books for which the most common token is present in 50 - 100% of the references found in the book's metadata

In [16]:
import pandas as pd
import numpy as np
# Data source: the parquet file currently lives in an S3 bucket at
# sneakedreferences/processable-references/run_2024_08_19/2024_8_19T15_18_19.parquet
# Download it to the local machine before running this notebook; FILENAME is
# the local path the file is read from by default.
FILENAME = "2024_8_19T15_18_19.parquet"
def read_data(parquet_filename = FILENAME):
    """Reads parquet file and returns a dataframe"""
    # NOTE: the docstring above is echoed verbatim by the progress message
    # below, so editing it changes what the notebook prints.
    #
    # parquet_filename: path handed straight to pandas.read_parquet.
    # Returns the DataFrame produced by pandas.read_parquet.
    progress_message = f"{read_data.__name__}: {read_data.__doc__}"
    print(progress_message)
    return pd.read_parquet(parquet_filename)

def rearrange(issn):
    """Flatten every ISSN record into a pair of strings.

    For each entry of ``issn``, the element at index 1 of the entry's first
    item and the element at index 1 of its second item are rendered as
    strings and returned together as a 2-tuple.

    Returns a list with one ``(str, str)`` tuple per entry, in input order.
    """
    return [(f"{entry[0][1]}", f"{entry[1][1]}") for entry in issn]

def split_issn(issn):
    """Pick the print and the electronic ISSN out of a sequence of entries.

    Every entry is tested for containing "print" first and "electronic"
    second; for the first kind that matches, the entry's element at index 1
    is recorded. A later matching entry overwrites an earlier one, and a
    kind that never matches stays ``None``.

    Returns a ``(print_issn, electronic_issn)`` tuple.
    """
    found = {"print": None, "electronic": None}
    for entry in issn:
        for kind in ("print", "electronic"):
            if kind in entry:
                found[kind] = entry[1]
                # An entry is attributed to at most one kind, print first.
                break
    return found["print"], found["electronic"]

def fix_issn(row):
    """Extract the print and electronic ISSN from a row's 'issn' field.

    The raw ``row['issn']`` value is flattened with ``rearrange`` and the
    resulting pairs are handed to ``split_issn``.

    Returns a ``(print_issn, electronic_issn)`` tuple; either element is
    ``None`` when ``split_issn`` finds no entry of that kind.
    """
    flattened = rearrange(row['issn'])
    return split_issn(flattened)

def separate_container_title(data):
    """Collapse an array-valued title into one comma-separated string.

    A ``numpy.ndarray`` is converted to a list and its items joined with
    ", ". Any other value is handed back untouched.
    """
    if not isinstance(data, np.ndarray):
        return data
    return ", ".join(data.tolist())

def prepare_data_frame(filename = FILENAME):
    """generates dataframe, processes ISSNs, adds counts"""
    # NOTE: the docstring above is echoed verbatim by the progress message
    # below, so editing it changes what the notebook prints.
    #
    # filename: parquet path forwarded to read_data.
    # Returns the loaded DataFrame with these columns added: print_issn,
    # electronic_issn, separated_tokens, ref_pge, token_counts_by_print_issn
    # and token_counts_by_electronic_issn. container_title is rewritten in
    # place and the raw issn column is dropped.
    print(f"{prepare_data_frame.__name__}: {prepare_data_frame.__doc__}")
    df = read_data(filename)
    # fix_issn yields a (print, electronic) pair per row; result_type='expand'
    # spreads that pair over the two new columns.
    df[['print_issn', 'electronic_issn']] = df.apply(fix_issn, axis=1, result_type='expand')
    # Sorting before joining gives every token set one canonical string, so
    # it can be used as a grouping key below.
    df['separated_tokens'] = df.token_vocabulary.apply(lambda x: ", ".join(sorted(x)))
    df['container_title'] = df.container_title.apply(separate_container_title)
    # Fraction of references that were processed. Plain column arithmetic
    # replaces the former row-by-row apply: same element-wise division,
    # without a Python-level call per row.
    df['ref_pge'] = df['cleaned_references_length'] / df['total_reference_length']
    df.drop(columns=['issn'], inplace=True)
    # Group counts: number of DOIs sharing the same token string and ISSN.
    df['token_counts_by_print_issn'] = df.groupby(['separated_tokens', 'print_issn'])['DOI'].transform('count')
    df['token_counts_by_electronic_issn'] = df.groupby(['separated_tokens', 'electronic_issn'])['DOI'].transform('count')
    return df

def get_books(df):
    """Returns rows that are only books"""
    # NOTE: the docstring above is echoed verbatim by the progress message
    # below, so editing it changes what the notebook prints.
    #
    # Keeps the rows whose work_type equals 'book', orders them by
    # token_frac_refs from highest to lowest, and returns an independent copy.
    print(f"{get_books.__name__}: {get_books.__doc__}")
    is_book = df['work_type'] == 'book'
    ranked = df.loc[is_book].sort_values(by='token_frac_refs', ascending=False)
    return ranked.copy()

def prepare_output_df(books, filename = 'books.csv'):
    """processes dataframe column headings for better readability, outputs dataframe as a csv file"""
    # NOTE: the docstring above is echoed verbatim by the progress message
    # below, so editing it changes what the notebook prints.
    #
    # books:    DataFrame containing at least the columns selected below.
    # filename: destination path of the CSV file.
    # Returns the reordered and renamed DataFrame (a copy; the input frame is
    # not modified), so callers can keep working with what was exported.
    # Write failures are reported on stdout rather than raised.
    print(f"{prepare_output_df.__name__}: {prepare_output_df.__doc__}")
    # re-ordering columns for readability; columns not listed are dropped
    books = books[['DOI', 'separated_tokens', 'token_frac_refs', 'author', 'flag', 'title',  'member', 'ref_pge', 'total_reference_length']].copy()
    # renaming columns to human-readable headings
    rename_cols = {'separated_tokens': "most occuring token counted over all processed references", 
    'token_frac_refs': "Percentage of references in which the token(s) appears", 
    'flag' :"author flag", 
    'ref_pge': "Percentage of references that are processed compared to the total number of references in the article",
    'total_reference_length': "Total no. of references"}
    books.rename(columns=rename_cols, inplace=True)
    # outputting file
    try:
        books.to_csv(filename, index=False)
    except Exception as e:
        print("ERROR: ", e)
    else:
        # Only announce the file when the write actually succeeded.
        print(f"CSV file located here: {filename}")
    return books

In [17]:
# Load and enrich the data, keep only the books, then export them to CSV.
df = prepare_data_frame()
books = get_books(df)
# The export is called for its side effect (writing the CSV). Its result is
# deliberately not assigned back to `books`, so `books` keeps referring to the
# filtered DataFrame after this cell has run.
prepare_output_df(books)

prepare_data_frame: generates dataframe, processes ISSNs, adds counts
read_data: Reads parquet file and returns a dataframe
get_books: Returns rows that are only books
prepare_output_df: processes dataframe column headings for better readability, outputs dataframe as a csv file
CSV file located here: books.csv
