### Run all cells to generate a CSV file
### This notebook generates a dataframe of journal articles where the most common token is not one of the authors

In [222]:
import pandas as pd
import numpy as np
# Input data: a parquet file that must be obtained before running this notebook.
# Current data source in a S3 bucket: sneakedreferences/processable-references/run_2024_08_19/2024_8_19T15_18_19.parquet
# NOTE(review): the original comment here was cut off after "down" - presumably "download the file".
# FILENAME is a bare relative name, so the file is looked up relative to the working directory.
FILENAME = "2024_8_19T15_18_19.parquet"
def read_data(parquet_filename = FILENAME):
    """Reads parquet file and returns a dataframe"""
    # The one-line docstring above is echoed verbatim as a progress message,
    # so its text is part of this function's printed output.
    #
    # parquet_filename: path handed straight to pandas.read_parquet.
    # Returns: the DataFrame produced by pandas.read_parquet.
    progress_message = f"{read_data.__name__}: {read_data.__doc__}"
    print(progress_message)
    return pd.read_parquet(parquet_filename)

def rearrange(issn):
    """Flatten each ISSN entry into a pair of strings.

    For every entry, the second element of its first item and the second
    element of its second item are stringified and returned as a 2-tuple.
    Presumably each entry is a pair of (key, value) items describing one
    ISSN - TODO confirm against the parquet schema.

    Returns a list with one 2-tuple per entry, in input order.
    """
    return [(f"{entry[0][1]}", f"{entry[1][1]}") for entry in issn]

def split_issn(issn):
    """Pick the print and electronic ISSN out of a sequence of entries.

    Each entry is tested for containing "print" or "electronic" ("print"
    is checked first); the entry's element at index 1 is recorded for the
    matching kind. When several entries match the same kind, the last one
    wins. A kind that never matches stays None.

    Returns a (print_issn, electronic_issn) tuple.
    """
    found = {"print": None, "electronic": None}
    for entry in issn:
        if "print" in entry:
            found["print"] = entry[1]
        elif "electronic" in entry:
            found["electronic"] = entry[1]
    return found["print"], found["electronic"]

def fix_issn(row):
    """Extract the print and electronic ISSN from a row's 'issn' field.

    The raw 'issn' value is first flattened by rearrange() and the result
    is then split by split_issn().

    Returns a (print_issn, electronic_issn) tuple.
    """
    return split_issn(rearrange(row['issn']))

def separate_container_title(data):
    """Return a container title in a form that is easy to group on.

    A NumPy array is collapsed into a single string with its elements
    joined by ", "; any other value is returned unchanged.
    """
    if not isinstance(data, np.ndarray):
        return data
    return ", ".join(data.tolist())

def prepare_data_frame(filename = FILENAME):
    """generates dataframe, processes ISSNs, adds counts"""
    # The one-line docstring above is echoed verbatim as a progress message,
    # so its text is part of this function's printed output.
    #
    # filename: parquet file forwarded to read_data().
    # Returns: the loaded frame with the 'issn' column replaced by
    # 'print_issn' / 'electronic_issn', plus 'separated_tokens', 'ref_pge'
    # and four per-group DOI count columns.
    print(f"{prepare_data_frame.__name__}: {prepare_data_frame.__doc__}")

    def _join_sorted_tokens(vocabulary):
        # Sorting first makes the joined string independent of token order.
        return ", ".join(sorted(vocabulary))

    def _processed_fraction(row):
        # Share of the article's references that were cleaned/processed.
        return row['cleaned_references_length']/row['total_reference_length']

    frame = read_data(filename)

    issn_pairs = frame.apply(fix_issn, axis=1, result_type='expand')
    frame[['print_issn', 'electronic_issn']] = issn_pairs
    frame['separated_tokens'] = frame['token_vocabulary'].apply(_join_sorted_tokens)
    frame['container_title'] = frame['container_title'].apply(separate_container_title)
    frame['ref_pge'] = frame.apply(_processed_fraction, axis=1)
    frame = frame.drop(columns=['issn'])

    # group counts: number of DOIs sharing each combination of grouping keys
    count_specs = (
        ('token_counts_by_container_title', ['separated_tokens', 'container_title']),
        ('token_counts_by_print_issn', ['separated_tokens', 'print_issn']),
        ('token_counts_by_electronic_issn', ['separated_tokens', 'electronic_issn']),
        ('container_title_work_type_counts', ['container_title', 'work_type']),
    )
    for count_column, group_keys in count_specs:
        frame[count_column] = frame.groupby(group_keys)['DOI'].transform('count')
    return frame

def get_unflagged_journal_articles(df):
    """generates dataframe that has journal articles where the max token is not one of the authors"""
    # The one-line docstring above is echoed verbatim as a progress message,
    # so its text is part of this function's printed output.
    #
    # df: frame carrying 'work_type' plus every column in selected_columns.
    # Returns: journal-article rows whose flag is "No" and whose token count
    # is at least 10 for the electronic ISSN, print ISSN or container title,
    # restricted to selected_columns and sorted in descending order.
    print(f"{get_unflagged_journal_articles.__name__}: {get_unflagged_journal_articles.__doc__}")
    selected_columns = [
        'DOI', 'separated_tokens', 'token_frac_refs', 'author', 'flag', 'title', 'container_title',
        'print_issn', 'electronic_issn', 'ref_pge', 'total_reference_length',
        'token_counts_by_electronic_issn', 'token_counts_by_print_issn',
        'token_counts_by_container_title', 'member',
    ]
    initial_sort_keys = ['token_frac_refs', 'token_counts_by_electronic_issn', 'token_counts_by_print_issn']
    final_sort_keys = ['token_counts_by_electronic_issn', 'token_counts_by_container_title',
                       'token_counts_by_print_issn', 'token_frac_refs']

    # Step 1: keep journal articles only, ordered and trimmed to the columns of interest.
    only_articles = df[df.work_type == 'journal-article']
    journal_articles_df = only_articles.sort_values(initial_sort_keys, ascending=False)[selected_columns]

    # Step 2: keep unflagged rows whose token is repeated at least 10 times within
    # any one of the groupings (electronic ISSN, print ISSN, container title).
    not_flagged = journal_articles_df.flag == "No"
    repeated_enough = (
        (journal_articles_df.token_counts_by_electronic_issn >= 10)
        | (journal_articles_df.token_counts_by_print_issn >= 10)
        | (journal_articles_df.token_counts_by_container_title >= 10)
    )
    return journal_articles_df[not_flagged & repeated_enough].sort_values(final_sort_keys, ascending=False)

def get_most_common_tokens(unflagged_journal_articles):
    """gets the most common tokens"""
    # The one-line docstring above is echoed verbatim as a progress message,
    # so its text is part of this function's printed output.
    #
    # For each of the three count columns, collects the 'separated_tokens'
    # values of the rows holding that column's maximum.
    # Returns: the de-duplicated tokens as a list (order is not meaningful).
    print(f"{get_most_common_tokens.__name__}: {get_most_common_tokens.__doc__}")
    count_columns = (
        "token_counts_by_electronic_issn",
        "token_counts_by_print_issn",
        "token_counts_by_container_title",
    )
    collected = []
    for column in count_columns:
        counts = unflagged_journal_articles[column]
        peak_rows = unflagged_journal_articles[counts == counts.max()]
        collected.extend(set(peak_rows.separated_tokens))
    return list(set(collected))

def prepare_output_df(unflagged_journal_articles, filename = 'unflagged_journal_articles.csv'):
    """processes dataframe column headings for better readability, outputs dataframe as a csv file"""
    # The one-line docstring above is echoed verbatim as a progress message,
    # so its text is part of this function's printed output.
    #
    # unflagged_journal_articles: frame to export; it is NOT modified, so the
    #     caller can keep using it (or call this function again) afterwards.
    # filename: destination path of the CSV file.
    # Returns: None. A failure while writing is reported on stdout instead of
    #     being raised, and the "located here" message is printed only when
    #     the file was actually written.
    print(f"{prepare_output_df.__name__}: {prepare_output_df.__doc__}")
    # renaming columns and removing unnecessary columns
    rename_cols = {'separated_tokens': "most occuring token counted over all processed references",
    'token_frac_refs': "Percentage of references in which the token(s) appears",
    'flag' :"author flag",
    'ref_pge': "Percentage of references that are processed compared to the total number of references in the article",
    'total_reference_length': "Total no. of references",
    'token_counts_by_electronic_issn': "token_counts_by_electronic_issn",
    'token_counts_by_print_issn': "token_counts_by_print_issn"}
    # Build a new frame rather than dropping/renaming in place: the argument is
    # typically a filtered slice of a larger frame, and mutating it would change
    # the caller's data and make a second call fail on the missing column.
    output_df = (
        unflagged_journal_articles
        .drop(columns=['token_counts_by_container_title'])
        .rename(columns=rename_cols)
    )
    # outputting file
    try:
        output_df.to_csv(filename, index=False)
    except Exception as e:
        # Best-effort export: report the problem without aborting the notebook.
        print("ERROR: ", e)
    else:
        print(f"CSV file located here: {filename}")


### Prepare dataframe

In [223]:
# Load and enrich the data, then narrow it down to the unflagged journal articles.
df = prepare_data_frame()
unflagged_journal_articles = get_unflagged_journal_articles(df)

most_common_tokens = get_most_common_tokens(unflagged_journal_articles)
# currently, the output of most_common_tokens is
# ['Simos', 'Medicine, Science, Sports']

# removing tokens that are 'Medicine, Science, Sports' as this is not a person
# (the result is de-duplicated through a set, so its order is not meaningful)
all_separated_tokens = list(set(
    unflagged_journal_articles
    .loc[unflagged_journal_articles.separated_tokens != 'Medicine, Science, Sports']
    .sort_values('token_frac_refs', ascending=False)
    .separated_tokens
    .to_list()
))


prepare_data_frame: generates dataframe, processes ISSNs, adds counts
read_data: Reads parquet file and returns a dataframe
get_unflagged_journal_articles: generates dataframe that has journal articles where the max token is not one of the authors
get_most_common_tokens: gets the most common tokens


### Process separated tokens to determine which tokens might be a person

In [224]:
# This list was obtained by querying ChatGPT (OpenAI's GPT-4o mini model) directly through the chat prompt.
# Prompts used:
#   "Identify the people, and output the answer as a python list from this list: all_separated_tokens"
#   "Identify the people from this list from the variables all_separated_tokens, and output the answer as a python list."
# I also ran a few experiments using Stanza's NER models and narrowed the tokens down to this list.
# This needs to get better: it is currently an extremely manual process, and further experiments need to be done to make it less manual.
# One way would be fine-tuning a model with author names in Crossref to narrow down the possibilities of who might be a "person".
# Response:
# Tokens that were identified as people; one entry per line.
people = [
    'محمد',
    'ANZ, Rashed',
    'Neur',
    'CW, Shu',
    'Yamaguchi',
    'Merz',
    'Wehner',
    'Yousof',
    'Rashed',
    'Perrotta',
    'Jacques',
    'Grzegorz, Michalski',
    'Aithal',
    'Terziev',
    'Sheikholeslami',
    'Huang',
    'Hua',
    'Hashim',
    'Bondur',
    'Dziewonski',
    'Mamatov',
    'Tsvetkov',
    'Abu',
    'Ulenikov',
    'Latash',
    'Stević',
    'Carraher',
    'Tezduyar',
    'Inoue',
    'Gayda',
    'BC, Yang',
    'Degadwala',
    'Hipel',
    'Ghassan',
    'Mahmoud',
    'Fischer',
    'Dietz',
    'Samhita',
    'Wyrok',
    'Sathish',
    'Okubo',
    'Karthikeyan',
    'Guz',
    'Kim',
    'Wang',
    'Simos',
    'Medvedev',
    'Mohanavel',
    'Pivinskii',
    'Bobtelsky',
    'Arian',
    'Ruggeri',
    'Bohlmann',
    'Makarov',
    'Ding',
    'Roukos',
    'Jefferson',
]

### Get unflagged journal articles rows that only contain tokens that have been identified as people, prepare dataframe for output, and output as csv

In [225]:
# Keep only the rows whose token string appears in the `people` list,
# then rename the columns for readability and write the result to CSV.
unflagged_journal_articles = unflagged_journal_articles.loc[
    unflagged_journal_articles.separated_tokens.isin(people)
]
prepare_output_df(unflagged_journal_articles)

prepare_output_df: processes dataframe column headings for better readability, outputs dataframe as a csv file
CSV file located here: unflagged_journal_articles.csv
