In [4]:
import os, sys
import pandas as pd
import json
import numpy as np

In [6]:
# Pandas configuration to show Dataframes
pd.set_option("display.max_columns", None)     # Show all columns

## Analyzing ELIXIR corpus

In [2]:
ea_papers = '../data/enriched_aggregated_papers.csv'

In [5]:
ea_papers_df = pd.read_table(ea_papers, encoding='utf-8')

In [13]:
# Alphabetically ordered list of all article titles
ordered_articles_ea = ea_papers_df['Article Title'].sort_values().tolist()

In [11]:
ea_papers_df.columns

Index(['Article Title', 'source', 'pmid', 'pmcid', 'doi_df1', 'title',
       'authorString', 'first_author_firstName', 'first_author_lastName',
       'last_author_firstName', 'last_author_lastName', 'abstractText',
       'printPublicationDate', 'journal_title_df1', 'affiliation', 'keywords',
       'meshHeadings', 'mod_title', 'Year', 'journal_title_df2', 'doi_df2',
       'Source', 'empty', 'final_doi'],
      dtype='object')

In [12]:
# Sanity check: article titles should be neither missing nor repeated
article_titles = ea_papers_df['Article Title']
print(f"Amount of NaN article titles: {article_titles.isna().sum()}")
print(f"Amount of duplicated article titles: {article_titles.duplicated().sum()}")

Amount of NaN article titles: 0
Amount of duplicated article titles: 0


In [13]:
# Count the articles without a PMCID (after checking the code of GenderTracker)
missing_pmcid_count = ea_papers_df['pmcid'].isna().sum()
print(f"Amount of articles with unknown PMCIDs: {missing_pmcid_count}")

Amount of articles with unknown PMCIDs: 651


That's not very encouraging: the GenderTracker script uses the PMCID as the main key to identify each article, and 651 articles do not have one. Instead, we will use the article title as the key (titles are never missing or duplicated, as checked above), so that we do not lose so much data in our gender predictions.

In [14]:
# Build "<first name> <last name>" columns for the first and the last author.
# Rows where either name part is missing end up with a missing full name.
for author_role in ('first_author', 'last_author'):
    ea_papers_df[author_role] = (
        ea_papers_df[f'{author_role}_firstName'] + ' ' + ea_papers_df[f'{author_role}_lastName']
    )

In [15]:
# Presence flags for each of the four author-name columns (True where a value exists)
has_first_fname = ea_papers_df['first_author_firstName'].notna()
has_first_lname = ea_papers_df['first_author_lastName'].notna()
has_last_fname = ea_papers_df['last_author_firstName'].notna()
has_last_lname = ea_papers_df['last_author_lastName'].notna()

# Both authors fully known
articles_full_authors = ea_papers_df[has_first_fname & has_first_lname & has_last_fname & has_last_lname]

# Only first-author information available (full name / first name only / surname only)
articles_first_authors_all = ea_papers_df[has_first_fname & has_first_lname & ~has_last_fname & ~has_last_lname]
articles_first_authors_name = ea_papers_df[has_first_fname & ~has_first_lname & ~has_last_fname & ~has_last_lname]
articles_first_authors_surname = ea_papers_df[~has_first_fname & has_first_lname & ~has_last_fname & ~has_last_lname]

# Only last-author information available (full name / first name only / surname only)
articles_last_authors_all = ea_papers_df[~has_first_fname & ~has_first_lname & has_last_fname & has_last_lname]
articles_last_authors_name = ea_papers_df[~has_first_fname & ~has_first_lname & has_last_fname & ~has_last_lname]
articles_last_authors_surname = ea_papers_df[~has_first_fname & ~has_first_lname & ~has_last_fname & has_last_lname]

# No author information at all
# NOTE: these eight groups cover only 8 of the 16 possible presence combinations,
# so their sizes need not add up to the total number of papers.
articles_none_authors = ea_papers_df[~has_first_fname & ~has_first_lname & ~has_last_fname & ~has_last_lname]

In [46]:
# Report how many papers fall into each author-information category
papers_by_author_info = {
    "Papers with full author info": articles_full_authors,
    "Papers with full first author info": articles_first_authors_all,
    "Papers with name only first author info": articles_first_authors_name,
    "Papers with surname only first author info": articles_first_authors_surname,
    "Papers with full last author info": articles_last_authors_all,
    "Papers with name only last author info": articles_last_authors_name,
    "Papers with surname only last author info": articles_last_authors_surname,
    "Papers with none author info": articles_none_authors,
}
for category_label, category_frame in papers_by_author_info.items():
    print(f"{category_label}: {len(category_frame)}")

Papers with full author info: 3054
Papers with full first author info: 109
Papers with name only first author info: 0
Papers with surname only first author info: 1
Papers with full last author info: 2
Papers with name only last author info: 0
Papers with surname only last author info: 0
Papers with none author info: 225


Most papers have full information about both the first and the last author. Let's first use these papers as input to **GenderTracker** and see what it predicts.

1) The first step is to convert the CSV to JSON format required by GenderTracker:

In [19]:
def csv_to_json_format(input_table, output_json: str, id_column: str = 'Article Title') -> None:
    """Write the first/last author names of each paper to a JSON file.

    The output is a JSON list with one object per input row:
    ``{"pmcid": <identifier>, "authors": <JSON-encoded list of six strings>}``.
    The six strings are full name, first name and last name of the first author,
    followed by the same three values for the last author. When the first name or
    the last name of an author is missing, all three of that author's entries are
    the literal string ``"NaN"``.

    Parameters
    ----------
    input_table : str, os.PathLike or pandas.DataFrame
        Path of a CSV file (read with ``pd.read_csv``) or an already loaded table.
        It must contain ``id_column`` plus the columns ``first_author_firstName``,
        ``first_author_lastName``, ``last_author_firstName`` and
        ``last_author_lastName``.
    output_json : str
        Path of the JSON file to write (UTF-8, non-ASCII characters kept as-is).
    id_column : str, default 'Article Title'
        Column whose value is stored under the ``"pmcid"`` key of each record.

    Raises
    ------
    TypeError
        If ``input_table`` is neither a path nor a DataFrame.
    """
    # Accept either a path to a CSV file or an in-memory DataFrame
    if isinstance(input_table, (str, os.PathLike)):
        df = pd.read_csv(input_table)
    elif isinstance(input_table, pd.DataFrame):
        df = input_table
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise TypeError(
            f"input_table must be a file path or a pandas DataFrame, got {type(input_table).__name__}"
        )

    def _author_entries(first_name, last_name):
        # [full name, first name, last name], or three "NaN" placeholders if any part is missing
        if pd.notna(first_name) and pd.notna(last_name):
            return [f"{first_name} {last_name}", first_name, last_name]
        return ["NaN", "NaN", "NaN"]

    # Build one record per row in the structure described in the docstring
    json_data = []
    for _, row in df.iterrows():
        authors = (
            _author_entries(row['first_author_firstName'], row['first_author_lastName'])
            + _author_entries(row['last_author_firstName'], row['last_author_lastName'])
        )
        json_data.append({
            "pmcid": row[id_column],
            # The authors list is deliberately stored as a JSON string inside the record
            "authors": json.dumps(authors)
        })

    # Write to output JSON file with proper encoding
    with open(output_json, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, ensure_ascii=False, indent=4)

In [37]:
ea_papers_df

Unnamed: 0,Article Title,source,pmid,pmcid,doi_df1,title,authorString,first_author_firstName,first_author_lastName,last_author_firstName,last_author_lastName,abstractText,printPublicationDate,journal_title_df1,affiliation,keywords,meshHeadings,mod_title,Year,journal_title_df2,doi_df2,Source,empty,final_doi,first_author,last_author
0,0s and 1s in marine molecular research: a regi...,MED,34405237.0,PMC8371273,10.1093/gigascience/giab053,0s and 1s in marine molecular research: a regi...,"Zafeiropoulos H, Gioti A, Ninidakis S, Potirak...",Haris,Zafeiropoulos,Evangelos,Pafilis,High-performance computing (HPC) systems have ...,2021-08-01,GigaScience,"Hellenic Centre for Marine Research, Institute...","Biodiversity, Biotechnology, Aquaculture, High...","Marine Biology, Biotechnology, Aquaculture, Co...",0SAND1SINMARINEMOLECULARRESEARCH:AREGIONALHPCP...,2021,GigaScience,10.1093/gigascience/giab053,MED,,10.1093/gigascience/giab053,Haris Zafeiropoulos,Evangelos Pafilis
1,"1,4-Disubstituted 1H-1,2,3-Triazole Containing...",MED,30972322.0,PMC6443886,10.3389/fchem.2019.00155,"1,4-Disubstituted 1H-1,2,3-Triazole Containing...","Schröder DC, Kracker O, Fröhr T, Góra J, Jewgi...",David C,Schröder,Norbert,Sewald,Peptidotriazolamers are hybrid foldamers with ...,2019-01-01,Frontiers in chemistry,"Organic and Bioorganic Chemistry, Department o...","1, 2, Foldamer, Molecular Dynamic Simulations,...",,"1,4-DISUBSTITUTED1H-1,2,3-TRIAZOLECONTAININGPE...",2019,Frontiers in chemistry,,MED,,10.3389/fchem.2019.00155,David C Schröder,Norbert Sewald
2,10th Anniversary of the European Association f...,,,,,,,,,,,,,,,,,10THANNIVERSARYOFTHEEUROPEANASSOCIATIONFORPRED...,2020,The EPMA journal,10.1007/s13167-020-00206-1,MED,,10.1007/s13167-020-00206-1,,
3,10th European Conference on Rare Diseases & Or...,MED,33168048.0,PMC7649705,10.1186/s13023-020-01550-1,10th European Conference on Rare Diseases & Or...,,,,,,,2020-11-01,Orphanet journal of rare diseases,,,"Humans, Rare Diseases, Drug Approval, Orphan D...",10THEUROPEANCONFERENCEONRAREDISEASES&ORPHANPRO...,2020,Orphanet journal of rare diseases,10.1186/s13023-020-01550-1,MED,,10.1186/s13023-020-01550-1,,
4,"1H, 13C and 15N assignment of the human mitoch...",,,,,,,,,,,,,,,,,"1H,13CAND15NASSIGNMENTOFTHEHUMANMITOCHONDRIALP...",2022,Biomolecular NMR assignments,10.1007/s12104-022-10113-3,MED,,10.1007/s12104-022-10113-3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3392,The Dutch Techcentre for Life Sciences: Enabli...,PPR,,,10.12688/f1000research.6009.2,The Dutch Techcentre for Life Sciences: Enabli...,"Eijssen L, Evelo CT, Kok R, Mons B, Hooft RW, ...",Lars,Eijssen,,,We describe the Data programme of the Dutch Te...,,,,,,THEDUTCHTECHCENTREFORLIFESCIENCES:ENABLINGDATA...,2015 | 2016,,10.12688/f1000research.6009.2,PPR,,10.12688/f1000research.6009.2,Lars Eijssen,
3393,The ELIXIR Biodiversity Community: Understandi...,PPR,,,10.12688/f1000research.133724.2,The ELIXIR Biodiversity Community: Understandi...,"Waterhouse RM, Adam-Blondon A, Balech B, Barta...",Robert,Waterhouse,Toni,Gabaldón,Biodiversity loss is now recognised as one of ...,,,,,,THEELIXIRBIODIVERSITYCOMMUNITY:UNDERSTANDINGSH...,2024 | 2023,,10.12688/f1000research.133724.2 | 10.12688/f10...,PPR,,10.12688/f1000research.133724.2,Robert Waterhouse,Toni Gabaldón
3394,The ELIXIR channel in F1000Research,PPR,,,10.12688/f1000research.7587.2,The ELIXIR channel in F1000Research,"Blomberg N, Oliveira A, Mons B, Persson B, Jon...",Niklas,Blomberg,Inge,Jonassen,"ELIXIR, the European life science infrastructu...",,,,,,THEELIXIRCHANNELINF1000RESEARCH,2015 | 2016,,10.12688/f1000research.7587.2,PPR,,10.12688/f1000research.7587.2,Niklas Blomberg,Inge Jonassen
3395,The need for standardisation in life science r...,MED | PPR,33604028.0,PMC7863991,10.12688/f1000research.27500.2,The need for standardisation in life science r...,"Hollmann S, Kremer A, Baebler Š, Trefois C, Gr...",Susanne,Hollmann,Domenica,D'Elia,"Today, academic researchers benefit from the c...",2020-01-01,F1000Research,"Faculty of Science, University of Potsdam, Pot...","Education, Open Access, Quality Management, St...","Trust, International Cooperation, Quality of L...",THENEEDFORSTANDARDISATIONINLIFESCIENCERESEARCH...,2020,F1000Research,10.12688/f1000research.27500.1,MED,,10.12688/f1000research.27500.2,Susanne Hollmann,Domenica D'Elia


In [41]:
csv_to_json_format(input_table = articles_full_authors, output_json = 'ea_papers_author_data.json')

In [42]:
with open('ea_papers_author_data.json', 'r', encoding='utf-8') as jf:
    ea_papers_author_data = json.load(jf)

In [43]:
len(ea_papers_author_data) # matches the length of our previous filters

3054

In [137]:
# Distinct full names appearing as first author, as last author, and in either role
unique_fauthors = set(articles_full_authors['first_author'])
unique_lauthors = set(articles_full_authors['last_author'])

unique_authors = unique_fauthors | unique_lauthors

In [138]:
print(f"Amount of unique first authors: {len(unique_fauthors)}")
print(f"Amount of unique last authors: {len(unique_lauthors)}")
print(f"Amount of unique authors: {len(unique_authors)}")

Amount of unique first authors: 2494
Amount of unique last authors: 1997
Amount of unique authors: 4289


In [139]:
# Distinct first names appearing for first authors, for last authors, and in either role
unique_fauthors_fnames = set(articles_full_authors['first_author_firstName'])
unique_lauthors_fnames = set(articles_full_authors['last_author_firstName'])

unique_authors_fnames = unique_fauthors_fnames | unique_lauthors_fnames

In [140]:
print(f"Amount of unique first authors' first names: {len(unique_fauthors_fnames)}")
print(f"Amount of unique last authors' first names: {len(unique_lauthors_fnames)}")
print(f"Amount of unique authors' first names: {len(unique_authors_fnames)}")

Amount of unique first authors' first names: 1822
Amount of unique last authors' first names: 1457
Amount of unique authors' first names: 2800


So far, we have gathered the papers that have complete first- and last-author information and made a first round of predictions.

Lastly, we include the papers that have complete name information for only one of the two authors (either the first or the last author):

In [47]:
# Stack the "first author only" and "last author only" papers into one table.
# ignore_index=True renumbers the rows 0..n-1 directly, replacing the former
# reset_index() + drop(columns='index') round trip.
articles_some_authors_names = pd.concat(
    [articles_first_authors_all, articles_last_authors_all],
    ignore_index=True,
)

In [64]:
articles_some_authors_names

Unnamed: 0,Article Title,source,pmid,pmcid,doi_df1,title,authorString,first_author_firstName,first_author_lastName,last_author_firstName,last_author_lastName,abstractText,printPublicationDate,journal_title_df1,affiliation,keywords,meshHeadings,mod_title,Year,journal_title_df2,doi_df2,Source,empty,final_doi,first_author,last_author
0,3Rs toxicity testing and disease modeling proj...,MED,32636731.0,PMC7332811,10.17179/excli2020-1463,3Rs toxicity testing and disease modeling proj...,Vinken M.,Mathieu,Vinken,,,"The 3Rs concept, calling for replacement, redu...",2020-01-01,EXCLI journal,Department of In Vitro Toxicology and Dermato-...,"In vitro, In silico, Europe, 3Rs, Horizon 2020...",,3RSTOXICITYTESTINGANDDISEASEMODELINGPROJECTSIN...,2020,EXCLI journal,10.17179/excli2020-1463,MED,,10.17179/excli2020-1463,Mathieu Vinken,
1,A benchmark of optimization solvers for genome...,MED,38251879.0,PMC10878033,10.1128/msystems.00833-23,A benchmark of optimization solvers for genome...,Machado D.,Daniel,Machado,,,Genome-scale metabolic modeling is a powerful ...,2024-02-01,mSystems,"Department of Biotechnology and Food Science, ...","Metabolism, Genome-scale Modeling, Optimizatio...","Humans, Ecosystem, Biochemical Phenomena, Geno...",ABENCHMARKOFOPTIMIZATIONSOLVERSFORGENOME-SCALE...,2024,mSystems,10.1128/msystems.00833-23,MED,,10.1128/msystems.00833-23,Daniel Machado,
2,A recent origin of Orf3a from M protein across...,MED,33363705.0,PMC7749296,10.1016/j.csbj.2020.11.047,A recent origin of Orf3a from M protein across...,Ouzounis CA.,Christos A,Ouzounis,,,"The genome of SARS-CoV-2, the coronavirus resp...",2020-01-01,Computational and structural biotechnology jou...,Biological Computation & Process Laboratory (B...,"Protein superfamily, Structure prediction, M p...",,ARECENTORIGINOFORF3AFROMMPROTEINACROSSTHECORON...,2020,Computational and structural biotechnology jou...,10.1016/j.csbj.2020.11.047,MED,,10.1016/j.csbj.2020.11.047,Christos A Ouzounis,
3,A survey into the contribution of regional/nat...,PPR,,,10.12688/f1000research.142165.1,A survey into the contribution of regional/nat...,"Neves A, Willassen NP, Hjerde E, Cuesta I, Mar...",Aitana,Neves,,,Background: Regional/national SARS-CoV-2 geno...,,,,,,ASURVEYINTOTHECONTRIBUTIONOFREGIONAL/NATIONALP...,2023,,10.12688/f1000research.142165.1,PPR,,10.12688/f1000research.142165.1,Aitana Neves,
4,A versatile and interoperable computational fr...,PPR,,,10.1101/2022.12.17.520865,A versatile and interoperable computational fr...,"Niarakis A, Ostaszewski M, Mazein A, Kuperstei...",Anna,Niarakis,,,The COVID-19 Disease Map project is a large-sc...,,,,,,AVERSATILEANDINTEROPERABLECOMPUTATIONALFRAMEWO...,2022,,10.1101/2022.12.17.520865,PPR,,10.1101/2022.12.17.520865,Anna Niarakis,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,htsget: a protocol for securely streaming geno...,MED,29931085.0,PMC6298043,10.1093/bioinformatics/bty492,htsget: a protocol for securely streaming geno...,"Kelleher J, Lin M, Albach CH, Birney E, Davies...",Jerome,Kelleher,,,SummaryStandardized interfaces for efficiently...,2019-01-01,"Bioinformatics (Oxford, England)","Big Data Institute, Li Ka Shing Centre for Hea...",,"Computational Biology, Genomics, Genome, Softw...",HTSGET:APROTOCOLFORSECURELYSTREAMINGGENOMICDATA.,2019,"Bioinformatics (Oxford, England)",,MED,,10.1093/bioinformatics/bty492,Jerome Kelleher,
107,Mental health impact of the first wave of COVI...,MED,34127211.0 | 33309957.0,PMC10068024 | PMC7726524,10.1016/j.rpsmen.2021.05.003 | 10.1016/j.rpsm....,Mental health impact of the first wave of COVI...,"Alonso J, Vilagut G, Mortier P, Ferrer M, Alay...",Jordi,Alonso,,,IntroductionHealthcare workers are vulnerable ...,2021-04-01,Revista de psiquiatria y salud mental,"Health Services Research Unit, IMIM-Institut H...","Mental disorders, Disability, Healthcare Worke...","Humans, Occupational Diseases, Prevalence, Cro...",MENTALHEALTHIMPACTOFTHEFIRSTWAVEOFCOVID-19PAND...,2021 | 2020,Revista de psiquiatria y salud mental,10.1016/j.rpsmen.2021.05.003 | 10.1016/j.rpsm....,MED,,10.1016/j.rpsmen.2021.05.003 | 10.1016/j.rpsm....,Jordi Alonso,
108,The Dutch Techcentre for Life Sciences: Enabli...,PPR,,,10.12688/f1000research.6009.2,The Dutch Techcentre for Life Sciences: Enabli...,"Eijssen L, Evelo CT, Kok R, Mons B, Hooft RW, ...",Lars,Eijssen,,,We describe the Data programme of the Dutch Te...,,,,,,THEDUTCHTECHCENTREFORLIFESCIENCES:ENABLINGDATA...,2015 | 2016,,10.12688/f1000research.6009.2,PPR,,10.12688/f1000research.6009.2,Lars Eijssen,
109,Capturing variation impact on molecular intera...,MED,30602777.0,PMC6315030,10.1038/s41467-018-07709-6,Capturing variation impact on molecular intera...,"IMEx Consortium Curators, Del-Toro N, Duesbury...",,,P,Porras,The current wealth of genomic variation data i...,2019-01-01,Nature communications,,,"Animals, Humans, Disease, Amino Acid Substitut...",CAPTURINGVARIATIONIMPACTONMOLECULARINTERACTION...,2019,Nature communications,10.1038/s41467-018-07709-6,MED,,10.1038/s41467-018-07709-6,,P Porras


In [48]:
csv_to_json_format(input_table = articles_some_authors_names, output_json = 'ea_papers_author_names_data.json')

In [49]:
with open('ea_papers_author_names_data.json', 'r', encoding='utf-8') as jf:
    ea_papers_author_names_data = json.load(jf)

In [51]:
len(ea_papers_author_names_data)

111

genderTracker is run as:

```bash
python -m genderTracker -j <your_data.json> -od <output_folder> -v True
```

## Analyzing PubMed corpus

In [26]:
papers_5k = pd.read_table('1312.csv')

In [27]:
# Convert to string and handle NaN values by replacing them with an empty string
papers_5k['authors'] = papers_5k['authors'].fillna('').astype(str)

# The author list is comma-separated: the first author is the text before the first
# comma, the last author is the text after the last comma (leading spaces removed).
# An empty author list yields '' for both.
author_pieces = papers_5k['authors'].str.split(',')
papers_5k['first_author'] = author_pieces.str[0]
papers_5k['last_author'] = author_pieces.str[-1].str.lstrip(' ')

# Keep only the papers where both authors are known. The explicit .copy() makes the
# result an independent DataFrame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
has_both_authors = (papers_5k['first_author'] != '') & (papers_5k['last_author'] != '')
papers_5k = papers_5k[has_both_authors].copy()

In [31]:
papers_5k

Unnamed: 0,category,entry_id,openalex_id,doi,publication_year,authors,first_author,last_author,first_author_firstName,first_author_lastName,last_author_firstName,last_author_lastName
0,subfields,https://openalex.org/subfields/1312,https://openalex.org/W1775749144,https://doi.org/10.1016/s0021-9258(19)52451-6,1951,"OliverH. Lowry, NiraJ. Rosebrough, A. Farr, Ro...",OliverH. Lowry,RoseJ. Randall,Oliver,Lowry,Rose,Randall
1,subfields,https://openalex.org/subfields/1312,https://openalex.org/W2100837269,https://doi.org/10.1038/227680a0,1970,Ulrich K. Laemmli,Ulrich K. Laemmli,Ulrich K. Laemmli,Ulrich,Laemmli,Ulrich,Laemmli
2,subfields,https://openalex.org/subfields/1312,https://openalex.org/W2128635872,https://doi.org/10.1006/abio.1976.9999,1976,Mark A. Bradford,Mark A. Bradford,Mark A. Bradford,Mark,Bradford,Mark,Bradford
3,subfields,https://openalex.org/subfields/1312,https://openalex.org/W2107277218,https://doi.org/10.1006/meth.2001.1262,2001,"Kenneth J. Livak, Thomas D. Schmittgen",Kenneth J. Livak,Thomas D. Schmittgen,Kenneth,Livak,Thomas,Schmittgen
4,subfields,https://openalex.org/subfields/1312,https://openalex.org/W2144634347,,2001,"Joseph Sambrook, Elisabeth Fritsch, Tom Maniatis",Joseph Sambrook,Tom Maniatis,Joseph,Sambrook,Tom,Maniatis
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,subfields,https://openalex.org/subfields/1312,https://openalex.org/W2895451238,https://doi.org/10.1007/springerreference_39918,2011,"Anita B. Roberts, Michael B. Sporn",Anita B. Roberts,Michael B. Sporn,Anita,Roberts,Michael,Sporn
4996,subfields,https://openalex.org/subfields/1312,https://openalex.org/W1849819493,https://doi.org/10.1016/s1387-2656(05)11004-7,2005,"Michael V. Berridge, Patries M. Herst, An S. Tan",Michael V. Berridge,An S. Tan,Michael,Berridge,An,Tan
4997,subfields,https://openalex.org/subfields/1312,https://openalex.org/W1870980090,https://doi.org/10.1016/0022-510x(88)90132-3,1988,"Jan Lexell, Charles Taylor, Michael Sjöstróm",Jan Lexell,Michael Sjöstróm,Jan,Lexell,Michael,Sjöstróm
4998,subfields,https://openalex.org/subfields/1312,https://openalex.org/W1999463736,https://doi.org/10.1073/pnas.1732912100,2003,"Christos Sotiriou, Soek-Ying Neo, Lisa M. McSh...",Christos Sotiriou,Edison T. Liu,Christos,Sotiriou,Edison,Liu


In [28]:
import re
def get_first_name(name):
    """Return the first name from a name token, dropping a glued trailing initial.

    Some tokens have a middle initial attached directly to the first name
    (e.g. ``"OliverH."``). If ``name`` starts with a run of letters that ends in an
    uppercase letter immediately followed by a dot, the letters before that initial
    are returned (``"Oliver"``). Otherwise the text before the first space is
    returned unchanged.

    Letters are matched Unicode-aware, so accented names such as ``"JoséM."`` are
    handled too; results for plain ASCII input are the same as with an
    ``[A-Za-z]``-only pattern.
    """
    # [^\W\d_] = word characters that are neither decimal digits nor underscore, i.e. letters.
    # The lookahead keeps the initial itself (letter + '.') out of the captured group.
    match = re.match(r'([^\W\d_]+?)(?=[^\W\d_]\.)', name)
    # Only strip when the glued letter is an uppercase initial (so "Jr." stays intact)
    if match and name[match.end()].isupper():
        return match.group(1)
    return name.split(' ')[0]

In [29]:
# For each author role, take the first space-separated token as the first name
# (with any glued initial removed by get_first_name) and the last token as the surname.
for author_role in ('first_author', 'last_author'):
    name_tokens = papers_5k[author_role].apply(lambda full_name: full_name.split(' '))
    papers_5k[f'{author_role}_firstName'] = name_tokens.apply(lambda tokens: get_first_name(tokens[0]))
    papers_5k[f'{author_role}_lastName'] = name_tokens.apply(lambda tokens: tokens[-1])

In [30]:
papers_5k.to_csv('1312_splitnames.csv', index=False)

In [32]:
papers_5k[papers_5k['openalex_id'].isna()].shape

(0, 12)

In [50]:
# Split the DataFrame into 4 consecutive parts of (nearly) equal size.
# np.array_split is applied to the row positions instead of to the DataFrame itself:
# passing the DataFrame directly produced a warning in this cell's output
# (presumably pandas' DataFrame.swapaxes deprecation — TODO confirm), while
# splitting a plain integer array and selecting with .iloc gives the same partition.
N_PARTS = 4
row_position_chunks = np.array_split(np.arange(len(papers_5k)), N_PARTS)
papers_5k_parts = [papers_5k.iloc[chunk] for chunk in row_position_chunks]

# Access each part by name
part1, part2, part3, part4 = papers_5k_parts

  return bound(*args, **kwds)


In [45]:
papers_5k

Unnamed: 0,category,entry_id,openalex_id,doi,publication_year,authors,first_author,last_author,first_author_firstName,first_author_lastName,last_author_firstName,last_author_lastName
0,subfields,https://openalex.org/subfields/1312,https://openalex.org/W1775749144,https://doi.org/10.1016/s0021-9258(19)52451-6,1951,"OliverH. Lowry, NiraJ. Rosebrough, A. Farr, Ro...",OliverH. Lowry,RoseJ. Randall,Oliver,Lowry,Rose,Randall
1,subfields,https://openalex.org/subfields/1312,https://openalex.org/W2100837269,https://doi.org/10.1038/227680a0,1970,Ulrich K. Laemmli,Ulrich K. Laemmli,Ulrich K. Laemmli,Ulrich,Laemmli,Ulrich,Laemmli
2,subfields,https://openalex.org/subfields/1312,https://openalex.org/W2128635872,https://doi.org/10.1006/abio.1976.9999,1976,Mark A. Bradford,Mark A. Bradford,Mark A. Bradford,Mark,Bradford,Mark,Bradford
3,subfields,https://openalex.org/subfields/1312,https://openalex.org/W2107277218,https://doi.org/10.1006/meth.2001.1262,2001,"Kenneth J. Livak, Thomas D. Schmittgen",Kenneth J. Livak,Thomas D. Schmittgen,Kenneth,Livak,Thomas,Schmittgen
4,subfields,https://openalex.org/subfields/1312,https://openalex.org/W2144634347,,2001,"Joseph Sambrook, Elisabeth Fritsch, Tom Maniatis",Joseph Sambrook,Tom Maniatis,Joseph,Sambrook,Tom,Maniatis
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,subfields,https://openalex.org/subfields/1312,https://openalex.org/W2895451238,https://doi.org/10.1007/springerreference_39918,2011,"Anita B. Roberts, Michael B. Sporn",Anita B. Roberts,Michael B. Sporn,Anita,Roberts,Michael,Sporn
4996,subfields,https://openalex.org/subfields/1312,https://openalex.org/W1849819493,https://doi.org/10.1016/s1387-2656(05)11004-7,2005,"Michael V. Berridge, Patries M. Herst, An S. Tan",Michael V. Berridge,An S. Tan,Michael,Berridge,An,Tan
4997,subfields,https://openalex.org/subfields/1312,https://openalex.org/W1870980090,https://doi.org/10.1016/0022-510x(88)90132-3,1988,"Jan Lexell, Charles Taylor, Michael Sjöstróm",Jan Lexell,Michael Sjöstróm,Jan,Lexell,Michael,Sjöstróm
4998,subfields,https://openalex.org/subfields/1312,https://openalex.org/W1999463736,https://doi.org/10.1073/pnas.1732912100,2003,"Christos Sotiriou, Soek-Ying Neo, Lisa M. McSh...",Christos Sotiriou,Edison T. Liu,Christos,Sotiriou,Edison,Liu


In [46]:
def csv_to_json_format(input_table, output_json: str, id_column: str = 'openalex_id') -> None:
    """Write the first/last author names of each paper to a JSON file.

    The output is a JSON list with one object per input row:
    ``{"pmcid": <identifier>, "authors": <JSON-encoded list of six strings>}``.
    The six strings are full name, first name and last name of the first author,
    followed by the same three values for the last author. When the first name or
    the last name of an author is missing, all three of that author's entries are
    the literal string ``"NaN"``.

    Parameters
    ----------
    input_table : str, os.PathLike or pandas.DataFrame
        Path of a CSV file (read with ``pd.read_csv``) or an already loaded table.
        It must contain ``id_column`` plus the columns ``first_author_firstName``,
        ``first_author_lastName``, ``last_author_firstName`` and
        ``last_author_lastName``.
    output_json : str
        Path of the JSON file to write (UTF-8, non-ASCII characters kept as-is).
    id_column : str, default 'openalex_id'
        Column whose value is stored under the ``"pmcid"`` key of each record.

    Raises
    ------
    TypeError
        If ``input_table`` is neither a path nor a DataFrame.
    """
    # Accept either a path to a CSV file or an in-memory DataFrame
    if isinstance(input_table, (str, os.PathLike)):
        df = pd.read_csv(input_table)
    elif isinstance(input_table, pd.DataFrame):
        df = input_table
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise TypeError(
            f"input_table must be a file path or a pandas DataFrame, got {type(input_table).__name__}"
        )

    def _author_entries(first_name, last_name):
        # [full name, first name, last name], or three "NaN" placeholders if any part is missing
        if pd.notna(first_name) and pd.notna(last_name):
            return [f"{first_name} {last_name}", first_name, last_name]
        return ["NaN", "NaN", "NaN"]

    # Build one record per row in the structure described in the docstring
    json_data = []
    for _, row in df.iterrows():
        authors = (
            _author_entries(row['first_author_firstName'], row['first_author_lastName'])
            + _author_entries(row['last_author_firstName'], row['last_author_lastName'])
        )
        json_data.append({
            "pmcid": row[id_column],
            # The authors list is deliberately stored as a JSON string inside the record
            "authors": json.dumps(authors)
        })

    # Write to output JSON file with proper encoding
    with open(output_json, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, ensure_ascii=False, indent=4)

In [58]:
# Export each of the four parts to its own JSON file
part_output_files = ('1312_data1.json', '1312_data2.json', '1312_data3.json', '1312_data4.json')
for part_frame, part_file in zip((part1, part2, part3, part4), part_output_files):
    csv_to_json_format(input_table=part_frame, output_json=part_file)

In [9]:
# Load the author records of the 5k-paper corpus back from JSON.
# NOTE(review): 'data/1312_data.json' is not written by any cell visible in this
# notebook (the export cell writes '1312_data1.json' ... '1312_data4.json' to the
# working directory) — confirm where this file comes from.
with open('data/1312_data.json', 'r', encoding='utf-8') as jf:
    papers_5k_data = json.load(jf)

In [10]:
papers_5k_data[:5]

[{'pmcid': 'https://openalex.org/W1775749144',
  'authors': '["Oliver Lowry", "Oliver", "Lowry", "Rose Randall", "Rose", "Randall"]'},
 {'pmcid': 'https://openalex.org/W2100837269',
  'authors': '["Ulrich Laemmli", "Ulrich", "Laemmli", "Ulrich Laemmli", "Ulrich", "Laemmli"]'},
 {'pmcid': 'https://openalex.org/W2128635872',
  'authors': '["Mark Bradford", "Mark", "Bradford", "Mark Bradford", "Mark", "Bradford"]'},
 {'pmcid': 'https://openalex.org/W2107277218',
  'authors': '["Kenneth Livak", "Kenneth", "Livak", "Thomas Schmittgen", "Thomas", "Schmittgen"]'},
 {'pmcid': 'https://openalex.org/W2144634347',
  'authors': '["Joseph Sambrook", "Joseph", "Sambrook", "Tom Maniatis", "Tom", "Maniatis"]'}]