In [1]:
import pandas as pd
import numpy as np
# Input data: a parquet file, currently stored in an S3 bucket at
# sneakedreferences/processable-references/run_2024_08_19/2024_8_19T15_18_19.parquet
# Download it locally before running this notebook.
def read_data(parquet_filename):
    """Reads parquet file and returns a dataframe"""
    # The docstring above is echoed verbatim in the progress message, so it
    # doubles as runtime output.
    progress_message = f"{read_data.__name__}: {read_data.__doc__}"
    print(progress_message)
    # Hand the freshly loaded frame straight back to the caller.
    return pd.read_parquet(parquet_filename)

def rearrange(issn):
    """Flatten each ISSN record into a 2-tuple of strings.

    For every record in ``issn`` the result holds element ``[1]`` of the
    record's first item followed by element ``[1]`` of its second item, both
    formatted as strings.
    """
    pairs = []
    for record in issn:
        first_item = record[0]
        second_item = record[1]
        pairs.append((f"{first_item[1]}", f"{second_item[1]}"))
    return pairs

def split_issn(issn):
    """Pick the print and electronic ISSN out of a sequence of entries.

    An entry containing "print" supplies the print ISSN from its element
    ``[1]``; otherwise an entry containing "electronic" supplies the
    electronic ISSN. When several entries match, the last one wins.

    Returns a ``(print_issn, electronic_issn)`` tuple; a slot stays ``None``
    when no entry matched.
    """
    found = {"print": None, "electronic": None}
    for entry in issn:
        if "print" in entry:
            found["print"] = entry[1]
        elif "electronic" in entry:
            found["electronic"] = entry[1]
    return found["print"], found["electronic"]

def fix_issn(row):
    """Return ``(print_issn, electronic_issn)`` for one dataframe row.

    The row's 'issn' value is flattened with ``rearrange`` and the result is
    handed to ``split_issn``.
    """
    return split_issn(rearrange(row['issn']))

def separate_container_title(data):
    """Collapse a numpy array of titles into one comma-separated string.

    Anything that is not a ``numpy.ndarray`` is returned unchanged.
    """
    if not isinstance(data, np.ndarray):
        return data
    return ", ".join(data.tolist())

def prepare_data_frame(df):
    """processes ISSNs, adds counts"""
    # The docstring above is echoed verbatim in the progress message below.
    # The frame is modified in place (columns added, 'issn' dropped) and the
    # same object is returned.
    print(f"{prepare_data_frame.__name__}: {prepare_data_frame.__doc__}")

    # Split the raw 'issn' structure into separate print / electronic columns.
    issn_columns = ['print_issn', 'electronic_issn']
    df[issn_columns] = df.apply(fix_issn, axis = 1, result_type='expand')

    # Sorted, comma-joined token vocabulary, used as a grouping key below.
    df['separated_tokens'] = df['token_vocabulary'].apply(
        lambda vocabulary: ", ".join(sorted(vocabulary))
    )
    df['container_title'] = df['container_title'].apply(separate_container_title)

    # Row-wise ratio of cleaned reference length to total reference length.
    df['ref_pge'] = df.apply(
        lambda record: record['cleaned_references_length']/record['total_reference_length'],
        axis=1,
    )
    df.drop(columns=['issn'], inplace=True)

    # group counts: (new column, grouping keys); each new column holds the
    # count of non-null DOIs in the row's group.
    grouped_count_specs = (
        ('token_counts_by_container_title', ['separated_tokens', 'container_title']),
        ('token_counts_by_print_issn', ['separated_tokens', 'print_issn']),
        ('token_counts_by_electronic_issn', ['separated_tokens', 'electronic_issn']),
        ('container_title_work_type_counts', ['container_title', 'work_type']),
    )
    for count_column, group_keys in grouped_count_specs:
        df[count_column] = df.groupby(group_keys)['DOI'].transform('count')
    return df

In [2]:
# Read the main dataframe from the parquet file (path is relative to the
# working directory), then add the derived ISSN, token and count columns.
df = read_data("2024_8_19T15_18_19.parquet")
df = prepare_data_frame(df)

read_data: Reads parquet file and returns a dataframe
prepare_data_frame: processes ISSNs, adds counts


### The following methods get DOI counts per journal from the above dataframe

In [5]:
import requests
import pandas as pd

# Base URL of the Crossref journals route; get_journal_info appends an ISSN to it.
api = "https://api.crossref.org/journals/"
# Extra HTTP headers passed to requests.get by get_journal_info.
# NOTE(review): Crossref's documented "polite pool" contact mechanisms are a
# `mailto` query parameter or a `mailto:` address inside the User-Agent header;
# confirm that a bare `mailto` header is actually honored.
header = {"mailto": "edatta@crossref.org"}

def process_journal_info_df(df, top_n=60):
    """Gets journal articles from dataframe and adds additional information"""
    # The one-line docstring above is echoed by the print below, so the
    # detailed contract is documented here instead.
    #
    # Parameters
    #   df:    dataframe with at least the columns work_type, DOI,
    #          container_title, print_issn, electronic_issn,
    #          container_title_work_type_counts and member. It is not modified.
    #   top_n: how many rows to take from each of the two rankings
    #          (electronic-ISSN first and print-ISSN first). Defaults to 60,
    #          the cutoff that used to be hard-coded.
    #
    # Returns
    #   One row per container title (DOI column removed), sorted descending by
    #   electronic_issn_counts, container_title_work_type_counts, print_issn.
    print(f"{process_journal_info_df.__name__}: {process_journal_info_df.__doc__}")

    selected_columns = ['DOI','container_title', 'print_issn', 'electronic_issn', 'container_title_work_type_counts', 'member']
    # Explicit copy: the count columns below are added to an independent frame
    # instead of a chained selection of `df` (avoids SettingWithCopyWarning).
    journal_article_df = df.loc[df.work_type == 'journal-article', selected_columns].copy()

    # Number of journal-article DOIs sharing each print / electronic ISSN.
    journal_article_df['print_issn_counts'] = journal_article_df.groupby('print_issn')['DOI'].transform('count')
    journal_article_df['electronic_issn_counts'] = journal_article_df.groupby('electronic_issn')['DOI'].transform('count')

    # Keep the highest-ranked row per container title, then drop the DOI column.
    journal_article_df = (
        journal_article_df
        .sort_values(['electronic_issn_counts','print_issn_counts','container_title_work_type_counts'], ascending=False)
        .drop_duplicates('container_title')
        .drop(columns=['DOI'])
    )

    # Take the top rows under both rankings and merge them.
    e_issn_sort = journal_article_df.sort_values(['electronic_issn_counts', 'print_issn_counts', 'container_title_work_type_counts'], ascending=False).head(top_n)
    p_issn_sort = journal_article_df.sort_values(['print_issn_counts', 'electronic_issn_counts','container_title_work_type_counts'], ascending=False).head(top_n)
    journal_article_df = pd.concat([e_issn_sort,p_issn_sort])

    # A title can appear in both rankings; keep its first occurrence only.
    journal_info = journal_article_df.drop_duplicates('container_title').sort_values(['electronic_issn_counts', 'container_title_work_type_counts', 'print_issn'], ascending=False)
    return journal_info
    
def process_issns(journal_info):
    """Gets all sorts of ISSN info"""
    # The docstring above is echoed verbatim in the progress message below.
    # Returns a dict of row-index lists:
    #   "print_issn"                rows sharing a print ISSN but not an electronic one
    #   "electronic_issn"           rows sharing a (non-null) electronic ISSN
    #   "equal_e_issn_title_counts" rows whose title count equals the e-ISSN count
    #   "equal_p_issn_actual"       rows whose title count equals the print-ISSN
    #                               count but not the e-ISSN count
    print(f"{process_issns.__name__}: {process_issns.__doc__}")

    title_counts = journal_info.container_title_work_type_counts
    matches_print_count = title_counts == journal_info.print_issn_counts
    matches_electronic_count = title_counts == journal_info.electronic_issn_counts
    equal_p_issn_title_counts = list(journal_info[matches_print_count].index)
    equal_e_issn_title_counts = list(journal_info[matches_electronic_count].index)

    # Rows whose ISSN value occurs more than once; null ISSNs are excluded.
    shares_e_issn = (journal_info.duplicated(subset=['electronic_issn'], keep=False)) & (journal_info.electronic_issn.notnull())
    shares_p_issn = (journal_info.duplicated(subset=['print_issn'], keep=False)) & (journal_info.print_issn.notnull())
    e_issn_dups = journal_info[shares_e_issn].index
    p_issn_dups = journal_info[shares_p_issn].index

    return {
        "print_issn": list(set(p_issn_dups) - set(e_issn_dups)),
        "electronic_issn": list(e_issn_dups),
        "equal_e_issn_title_counts": equal_e_issn_title_counts,
        "equal_p_issn_actual": list(set(equal_p_issn_title_counts) - set(equal_e_issn_title_counts)),
    }

def get_real_counts(row, issn_info):
    """Processing ISSNs to get true counts per ISSN"""
    # The docstring above is echoed verbatim in the progress message below.
    print(f"{get_real_counts.__name__}: {get_real_counts.__doc__}")

    # (index collection, count column), checked in priority order. All four
    # collections are looked up before any membership test.
    lookup_order = (
        (issn_info["print_issn"], 'print_issn_counts'),
        (issn_info["electronic_issn"], 'electronic_issn_counts'),
        (issn_info["equal_e_issn_title_counts"], 'electronic_issn_counts'),
        (issn_info["equal_p_issn_actual"], 'print_issn_counts'),
    )
    row_label = row.name
    for index_collection, count_column in lookup_order:
        if row_label in index_collection:
            return row[count_column]
    # The row's index is in none of the collections.
    return 0

def remove_dup_issns(journal_info):
    """Determining which rows to keep for unique journals"""
    # The docstring above is echoed verbatim in the progress message below.
    print(f"{remove_dup_issns.__name__}: {remove_dup_issns.__doc__}")

    has_e_issn = journal_info.electronic_issn.notnull()
    # First row for every distinct, non-null electronic ISSN.
    unique_e_issns = journal_info[has_e_issn].electronic_issn.drop_duplicates()
    print_issns = journal_info[(journal_info.print_issn.notnull())].print_issn

    # Rows that have a print ISSN but were not picked via their electronic ISSN.
    print_only_indices = list(set(print_issns.index) - set(unique_e_issns.index))

    # Among those rows, the ones that still carry an electronic ISSN.
    leftover_e_issns = journal_info[(journal_info.index.isin(print_only_indices)) & has_e_issn].electronic_issn
    shared_issns = list(set(leftover_e_issns.tolist()).intersection(set(unique_e_issns.tolist())))

    # For each electronic ISSN that is already represented, drop the first
    # leftover row carrying it.
    for shared_issn in shared_issns:
        first_match = leftover_e_issns[leftover_e_issns == shared_issn].index[0]
        print_only_indices.remove(first_match)

    return list(unique_e_issns.index) + print_only_indices

def get_journal_info(row, timeout=30):
    """Get journal info from Crossref journals route"""
    # The one-line docstring above is echoed by the print below, so the
    # detailed contract is documented here instead.
    #
    # Parameters
    #   row:     mapping/Series with 'electronic_issn' and 'print_issn'. The
    #            electronic ISSN is preferred; the print ISSN is the fallback.
    #   timeout: seconds to wait for the HTTP response (passed to requests.get)
    #            so a stalled connection cannot hang the whole run.
    #
    # Returns
    #   (title, total_dois) from the journals route, or (None, None) when no
    #   ISSN is available, the request fails or the response is not OK.
    print(f"{get_journal_info.__name__}: {get_journal_info.__doc__}")
    title = None
    total_dois = None

    # Pick the first usable ISSN. NaN is truthy, so test for missing values
    # explicitly as well as for empty strings.
    issn = None
    for column in ('electronic_issn', 'print_issn'):
        candidate = row[column]
        if pd.notna(candidate) and candidate:
            issn = candidate
            break
    if issn is None:
        # Nothing to look up; previously this fell through to `api + None`
        # and raised TypeError.
        print("ERROR: No ISSN available for journals route lookup")
        return title, total_dois

    url = api + issn
    try:
        response = requests.get(url, headers=header, timeout=timeout)
        if response.ok:
            rsp = response.json()['message']
            title = rsp['title']
            total_dois = rsp['counts']['total-dois']
        else:
            print(f"ERROR: For issn {issn}: {response.reason}")
    except Exception as e:
        # Deliberately best-effort: one failed lookup must not abort the run.
        print(f"{issn}: {e}")
    return title, total_dois

def null_value(val):
    """For the final output, adding a string for null values"""
    # The docstring above is echoed verbatim in the progress message below.
    print(f"{null_value.__name__}: {null_value.__doc__}")
    # Missing values (as judged by pd.isna) become a readable placeholder;
    # everything else passes through untouched.
    return "None found" if pd.isna(val) else val

def get_final_output(journal_info):
    """Re-arranging and re-naming columns"""
    # The docstring above is echoed verbatim in the progress message below.
    print(f"{get_final_output.__name__}: {get_final_output.__doc__}")

    # Replace missing values with a placeholder string in these columns
    # (done on the frame that was passed in).
    for nullable_column in ('journal_title', 'total_dois', 'doi_pge'):
        journal_info[nullable_column] = journal_info[nullable_column].apply(null_value)

    # Source column -> output header; the key order is the output column order.
    output_headers = {
        'actual_counts': 'DOI counts',
        'journal_title': 'Journal Title in journals route',
        'container_title': 'Container Title from DOI',
        'print_issn': 'Print ISSN',
        'electronic_issn': 'Electronic ISSN',
        'member': 'Member',
        'total_dois': 'Total No. of DOIs',
        'doi_pge': 'Percentage of DOI counts over total number of DOIs',
    }
    return journal_info[list(output_headers)].rename(columns=output_headers)

def generate_journal_info(df, filename = "journal_info_high_counts.csv"):
    """Build the per-journal summary, write it to CSV and return it.

    Wrapper that chains the helper functions: selects the top journals,
    works out the DOI count per journal, removes rows that describe the same
    journal, looks each journal up on the Crossref journals route and
    computes the share of the journal's DOIs that appear in ``df``.

    Args:
        df: prepared dataframe; must provide the columns that
            ``process_journal_info_df`` reads.
        filename: path of the CSV file to write.

    Returns:
        The final summary dataframe (with the renamed output columns). It is
        returned even when writing the CSV fails; the error is printed.
    """
    print("Generating dataframe to show journal information")
    journal_info = process_journal_info_df(df)
    issn_info = process_issns(journal_info)
    journal_info['actual_counts'] = journal_info.apply(lambda x: get_real_counts(x, issn_info), axis=1)
    keep_indices = remove_dup_issns(journal_info)

    # The helper count columns are no longer needed once actual_counts exists.
    journal_info = journal_info.drop(columns=['container_title_work_type_counts', 'print_issn_counts', 'electronic_issn_counts'])
    # .copy() so the columns added below go onto an independent frame.
    journal_info = journal_info[journal_info.index.isin(keep_indices)].sort_values('actual_counts', ascending=False).copy()

    # One journals-route request per remaining row.
    journal_info[['journal_title', 'total_dois']] = journal_info.apply(get_journal_info, axis=1, result_type = 'expand')

    def doi_percentage(r):
        # A failed lookup (missing total) or a zero total has no meaningful
        # percentage; report it as missing instead of raising.
        total = r['total_dois']
        if pd.isna(total) or not total:
            return float('nan')
        return round((r['actual_counts']/total * 100),2)

    journal_info['doi_pge'] = journal_info.apply(doi_percentage, axis=1)

    # Drop rows repeating an already-seen journal title, but keep every row
    # whose lookup failed: missing titles compare equal to each other, so a
    # plain drop_duplicates would collapse all failed lookups into one row.
    repeated_title = journal_info['journal_title'].notna() & journal_info.duplicated(subset=['journal_title'])
    journal_info = journal_info[~repeated_title].copy()

    journal_info = get_final_output(journal_info)
    try:
        journal_info.to_csv(filename, index=False)
    except Exception as e:
        print("ERROR: ", e)
    else:
        # Only announce the file when it was actually written.
        print(f"CSV file located here: {filename}")
    return journal_info

In [45]:
# Build the journal summary from the prepared dataframe; the CSV is written to
# the function's default filename, journal_info_high_counts.csv.
journal_info = generate_journal_info(df)

ERROR: For issn 1572-8943: Not Found
ERROR: For issn 1572-8943: Not Found


In [None]:
# Export every journal article, ranked by token fraction and then by the
# per-ISSN token counts (all descending).
journal_articles_df = (
    df.loc[df['work_type'] == 'journal-article']
    .sort_values(
        ['token_frac_refs','token_counts_by_electronic_issn', 'token_counts_by_print_issn'],
        ascending=False,
    )[
        [
            'DOI', 'separated_tokens', 'token_frac_refs', 'author', 'flag', 'title',
            'container_title','print_issn', 'electronic_issn','ref_pge',
            'token_counts_by_electronic_issn','token_counts_by_print_issn',
            'token_counts_by_container_title','member',
        ]
    ]
)
journal_articles_df.to_csv("journal_articles.csv", index=False)

# Flagged journal articles: flag is "Yes" and at least one of the three token
# counts reaches 10.
flagged_articles = journal_articles_df[
    (journal_articles_df['flag'] == "Yes")
    & (
        (journal_articles_df['token_counts_by_electronic_issn'] >= 10)
        | (journal_articles_df['token_counts_by_print_issn'] >= 10)
        | (journal_articles_df['token_counts_by_container_title'] >= 10)
    )
].sort_values(
    ['token_counts_by_electronic_issn','token_counts_by_container_title','token_counts_by_print_issn','token_frac_refs'],
    ascending=False,
)

flagged_articles.to_csv("flagged_ja_articles.csv", index=False)