#### Run all cells to generate the outputs listed below

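#### This notebook generates two CSV files:
1. `journal_articles.csv`: all journal articles
2. `duplicate_journal_titles.csv`: journal articles whose article title is shared with at least one other journal article (generic titles such as "Editorial" or "Preface" are excluded)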

In [39]:
import pandas as pd
import numpy as np
# Data source: the parquet file currently lives in an S3 bucket at
#   sneakedreferences/processable-references/run_2024_08_19/2024_8_19T15_18_19.parquet
# Download it to the local machine before running this notebook.
# FILENAME is a relative path, so the file is looked up in the working directory.
FILENAME = "2024_8_19T15_18_19.parquet"
def read_data(parquet_filename = FILENAME):
    """Reads parquet file and returns a dataframe

    The summary line above is echoed to stdout as a progress message each
    time the function runs, so keep it a one-line description.

    Parameters
    ----------
    parquet_filename
        Path of the parquet file to load. Defaults to the module-level
        ``FILENAME``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.read_parquet`` for that file.
    """
    summary = read_data.__doc__.splitlines()[0]
    print(f"{read_data.__name__}: {summary}")
    return pd.read_parquet(parquet_filename)

def rearrange(issn):
    """Flatten each ISSN entry into a ``(str, str)`` pair.

    For every entry the second item of its first element and the second item
    of its second element are picked out and converted to strings.
    Presumably these are the ISSN type ("print"/"electronic") and the ISSN
    value respectively -- confirm against the parquet schema.

    Parameters
    ----------
    issn
        Iterable of entries, each indexable as ``entry[0][1]`` and
        ``entry[1][1]``.

    Returns
    -------
    list of tuple
        One ``(str, str)`` pair per entry, in input order.
    """
    pairs = []
    for entry in issn:
        label = f"{entry[0][1]}"
        value = f"{entry[1][1]}"
        pairs.append((label, value))
    return pairs

def split_issn(issn):
    """Pick the print and electronic ISSN out of a list of pairs.

    Each pair is tested for membership of "print" first and "electronic"
    second; the pair's second item is stored under the first label that
    matches. When a label matches several pairs the last one wins.

    Parameters
    ----------
    issn
        Iterable of pairs such as those produced by ``rearrange``.

    Returns
    -------
    tuple
        ``(print_issn, electronic_issn)``; either is ``None`` when no pair
        carried that label.
    """
    found = {"print": None, "electronic": None}
    for pair in issn:
        # Order matters: "print" takes precedence when both labels match.
        for medium in ("print", "electronic"):
            if medium in pair:
                found[medium] = pair[1]
                break
    return found["print"], found["electronic"]

def fix_issn(row):
    """Extract the print and electronic ISSN from one dataframe row.

    Reads ``row['issn']``, flattens it with ``rearrange`` and splits the
    result with ``split_issn``.

    Parameters
    ----------
    row
        Mapping-like object (e.g. a dataframe row) with an ``'issn'`` key.

    Returns
    -------
    tuple
        ``(print_issn, electronic_issn)`` as returned by ``split_issn``.
    """
    return split_issn(rearrange(row['issn']))

def separate_container_title(data):
    """Turn an array of container titles into one comma-separated string.

    A plain string is hashable and therefore usable as a groupby key, which
    a numpy array is not.

    Parameters
    ----------
    data
        Any value; only ``numpy.ndarray`` instances are converted.

    Returns
    -------
    The array's elements joined with ``", "`` when ``data`` is an ndarray,
    otherwise ``data`` unchanged.
    """
    if not isinstance(data, np.ndarray):
        return data
    return ", ".join(data.tolist())

def prepare_data_frame(filename = FILENAME):
    """generates dataframe, processes ISSNs, adds counts

    The summary line above is echoed to stdout as a progress message.

    Steps:
      1. load the parquet file with ``read_data``;
      2. split the ``issn`` column into ``print_issn`` / ``electronic_issn``;
      3. join the sorted ``token_vocabulary`` into ``separated_tokens``;
      4. flatten array-valued ``container_title`` entries into strings;
      5. compute ``ref_pge`` = cleaned_references_length / total_reference_length;
      6. drop the raw ``issn`` column;
      7. add per-group DOI counts.

    Parameters
    ----------
    filename
        Parquet file to read. Defaults to the module-level ``FILENAME``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the derived columns added and ``issn`` removed.
    """
    print(f"{prepare_data_frame.__name__}: {prepare_data_frame.__doc__.splitlines()[0]}")
    df = read_data(filename)

    df[['print_issn', 'electronic_issn']] = df.apply(fix_issn, axis = 1, result_type='expand')
    df['separated_tokens'] = df.token_vocabulary.apply(lambda x: ", ".join(sorted(x)))
    df['container_title'] = df.container_title.apply(separate_container_title)
    # Column-wise division: same values as a per-row lambda, without the
    # Python-level loop over every row.
    df['ref_pge'] = df['cleaned_references_length'] / df['total_reference_length']
    df = df.drop(columns=['issn'])

    # Number of DOIs sharing each key combination, broadcast back to every row.
    # NOTE(review): groupby's default dropna=True leaves rows with a missing
    # key (e.g. no print ISSN) out of the groups -- confirm that is intended.
    group_counts = {
        'token_counts_by_container_title': ['separated_tokens', 'container_title'],
        'token_counts_by_print_issn': ['separated_tokens', 'print_issn'],
        'token_counts_by_electronic_issn': ['separated_tokens', 'electronic_issn'],
        'container_title_work_type_counts': ['container_title', 'work_type'],
    }
    for count_column, keys in group_counts.items():
        df[count_column] = df.groupby(keys)['DOI'].transform('count')
    return df

def get_journal_articles(df):
    """generates dataframe that has journal articles where the max token is one of the authors

    The summary line above is echoed to stdout as a progress message.

    NOTE(review): this function itself only filters on
    ``work_type == 'journal-article'``; any "max token is an author"
    restriction must already hold for the input -- confirm upstream.

    Parameters
    ----------
    df
        Frame produced by ``prepare_data_frame``.

    Returns
    -------
    pandas.DataFrame
        Journal-article rows, sorted in descending order by
        ``token_frac_refs``, ``token_counts_by_electronic_issn`` and
        ``token_counts_by_print_issn``, restricted to the report columns.
    """
    print(f"{get_journal_articles.__name__}: {get_journal_articles.__doc__.splitlines()[0]}")

    sort_keys = [
        'token_frac_refs',
        'token_counts_by_electronic_issn',
        'token_counts_by_print_issn',
    ]
    report_columns = [
        'DOI', 'separated_tokens', 'token_frac_refs', 'author', 'flag',
        'title', 'container_title', 'print_issn', 'electronic_issn',
        'ref_pge', 'total_reference_length',
        'token_counts_by_electronic_issn', 'token_counts_by_print_issn',
        'token_counts_by_container_title', 'member',
    ]

    is_journal_article = df.work_type == 'journal-article'
    ranked = df[is_journal_article].sort_values(sort_keys, ascending=False)
    return ranked[report_columns]

def output_df(df, filename):
    """Write ``df`` to ``filename`` as CSV (index omitted) and report the outcome.

    Writing is best-effort: a failure is reported on stdout instead of being
    raised, so the surrounding pipeline keeps running. The "CSV file located
    here" message is printed only when the write succeeded.

    Parameters
    ----------
    df
        Object exposing ``to_csv(filename, index=False)`` (a DataFrame).
    filename
        Destination path of the CSV file.

    Returns
    -------
    None
    """
    try:
        df.to_csv(filename, index=False)
    except Exception as e:
        print("ERROR: ", e)
    else:
        # Only announce the file when it was actually written.
        print(f"CSV file located here: {filename}")

def prepare_output_df(df, filename):
    """processes dataframe column headings for better readability, outputs dataframe as a csv file

    The summary line above is echoed to stdout as a progress message.

    The input frame is left untouched: the column drop and the renames are
    applied to a new frame, so calling this function again on the same input
    works and the caller's frame keeps its original column names.

    Parameters
    ----------
    df
        Frame containing ``token_counts_by_container_title`` (dropped) and the
        columns listed in the rename table below.
    filename
        Path of the CSV file to write via ``output_df``.

    Returns
    -------
    pandas.DataFrame
        The renamed frame that was written out.

    Raises
    ------
    KeyError
        If ``token_counts_by_container_title`` is not a column of ``df``.
    """
    print(f"{prepare_output_df.__name__}: {prepare_output_df.__doc__.splitlines()[0]}")
    # Human-readable headings. These strings are column names that downstream
    # code selects by exact match, so they must not be reworded here alone.
    # The two ISSN count columns keep their original names.
    rename_cols = {
        'separated_tokens': "most occuring token counted over all processed references",
        'token_frac_refs': "Percentage of references in which the token(s) appears",
        'flag': "author flag",
        'ref_pge': "Percentage of references that are processed compared to the total number of references in the article",
        'total_reference_length': "Total no. of references",
    }
    readable_df = (
        df.drop(columns=['token_counts_by_container_title'])
          .rename(columns=rename_cols)
    )
    output_df(readable_df, filename)
    return readable_df

def get_duplicate_titles(journal_articles, filename = "duplicate_journal_titles.csv"):
    """Find journal articles whose title occurs more than once and write them to CSV.

    Rows with a null or empty title, or whose title exactly matches one of
    the generic stop-word titles below, are ignored. Among the remaining
    rows every row that shares its title with another row is kept, sorted by
    title and then container title.

    Parameters
    ----------
    journal_articles
        Frame with the renamed headings produced by ``prepare_output_df``
        (it must contain the "author flag" and "most occuring token ..."
        columns).
    filename
        Path of the CSV file to write via ``output_df``.

    Returns
    -------
    pandas.DataFrame
        The duplicate-title rows restricted to the report columns.
    """
    # Generic section headings that legitimately repeat across journals.
    # Matching is exact, hence the explicit case variants.
    domain_stopwords = [
        'Preface', 'Introduction', 'Introduction to the special issue',
        'Obituary', 'OBITUARY', 'Obituaries', 'OBITUARIES',
        'In Memoriam', 'In memoriam', 'IN MEMORIAM',
        'Bibliographie', 'Bibliography',
        'Selected bibliography', 'Selected Bibliography',
        'Editorial', 'Dedication', 'Foreword', 'Prologue', 'Tribute',
        'Social administration digest', 'Social Administration Digest',
        'Environmental digest', 'General Assembly',
        'Book review', 'Book Review', 'Back Matter', 'References',
        'Autobiography', 'News', 'Recommended practices',
        'Présentation', 'Bericht über Patente', 'Documentation',
        'Results on top physics by CMS', 'Curriculum Vitae',
        'NOTES', 'Notes', 'Notes and Comments', 'Discussion', 'Literatur',
    ]
    report_columns = [
        'DOI', 'title', 'container_title', "print_issn", "electronic_issn",
        "most occuring token counted over all processed references",
        "author flag", "member",
    ]

    titles = journal_articles.title
    has_specific_title = titles.notnull() & (titles != '') & ~titles.isin(domain_stopwords)
    candidates = journal_articles[has_specific_title].copy()

    # keep=False flags every member of a duplicate group, not just the repeats.
    shares_title = candidates.duplicated(subset=['title'], keep=False)
    ordered = candidates[shares_title].sort_values(['title', 'container_title'])
    dup_ja = ordered[report_columns].copy()

    output_df(dup_ja, filename)
    return dup_ja

### Prepare dataframe and get a dataframe of all journal articles

In [32]:
# Load the parquet file (default FILENAME) and add the derived ISSN/token/count columns.
df = prepare_data_frame()
# Keep only journal-article rows, sorted by the token statistics.
journal_articles = get_journal_articles(df)
# Rename headings for readability and write journal_articles.csv.
# The name is rebound to the renamed frame; the duplicate-title cell below
# selects columns by those renamed headings.
journal_articles = prepare_output_df(journal_articles, "journal_articles.csv")

prepare_data_frame: generates dataframe, processes ISSNs, adds counts
read_data: Reads parquet file and returns a dataframe
get_journal_articles: generates dataframe that has journal articles where the max token is one of the authors
prepare_output_df: processes dataframe column headings for better readability, outputs dataframe as a csv file
CSV file located here: journal_articles.csv


#### Get journal articles with duplicate titles

In [40]:
# Needs the renamed frame returned by prepare_output_df: get_duplicate_titles selects
# the "author flag" and "most occuring token ..." columns by name.
# Writes duplicate_journal_titles.csv (the function's default filename).
dj = get_duplicate_titles(journal_articles)

CSV file located here: duplicate_journal_titles.csv
