# Parsing Medline file

In [4]:
from Bio import Medline
import pandas as pd

In [5]:
# Read the Medline export and write one tab-separated line per record to
# table1.csv with the fields: journal, title, authors, year, abstract.
# No header line is written.
with open("table1.csv", "w") as file, \
        open("sp_nov_2001_2018_pubmed_16012019.medline") as handle:
    for record in Medline.parse(handle):
        title = record["TI"]
        journal = record["JT"]
        # The FAU value is stored as its string representation with the
        # commas stripped out (a missing FAU field becomes the text "None").
        authors = str(record.get("FAU")).replace(',', '')
        # Keep only the first space-separated token of the DP field.
        year = record["DP"].split(" ")[0]
        # Records lacking an AB field get a placeholder text; a default on
        # .get() replaces the former bare `except:`, which would also have
        # hidden unrelated errors.
        abstract = record.get("AB", "no abstract")
        file.write("\t".join([journal, title, authors, year, abstract]) + "\n")




In [6]:
# Load the tab-separated table1.csv and label its five columns.
# NOTE(review): the cell that writes table1.csv emits no header line, and
# read_csv is called here without header=None, so the first line of the file
# is consumed as a header and then replaced by the assignment below; that
# first record therefore never becomes a data row. Switching to header=None
# would shift every row label by one, and later cells select rows through
# hard-coded label lists -- re-check those lists before changing this.
column_names = ["Journal", "Title", "Authors", "Year", "Abstract"]
df = pd.read_csv("table1.csv", sep="\t")
df.columns = column_names
df

Unnamed: 0,Journal,Title,Authors,Year,Abstract
0,International journal of systematic and evolut...,"Leucothrix arctica sp. nov., isolated from Arc...",['Baek Kiwoon' 'Choi Ahyoung' 'Lee Yung Mi' 'L...,2018,"A Gram-stain-negative, non-motile, oxidase- an..."
1,International journal of systematic and evolut...,"Hyphobacterium indicum sp. nov., isolated from...",['Ruan Chu-Jin' 'Zheng Xiao-Wei' 'Wang Jian' '...,2018,"A novel aerobic, Gram-stain-negative bacterium..."
2,"Journal of microbiology (Seoul, Korea)","Gramella fulva sp. nov., isolated from a dry s...",['Hwang Sae Hyun' 'Hwang Woon Mo' 'Kang Keunso...,2019,"A novel Gram-stain-negative, aerobic, motile b..."
3,"Journal of microbiology (Seoul, Korea)","Flavisolibacter aluminii sp. nov., a novel mem...",['Lee Hyosun' 'Kim Dong-Uk' 'Lee Suyeon' 'Kim ...,2019,"A Gram-stain-negative, aerobic, non-motile, ro..."
4,International journal of systematic and evolut...,Sphingopyxis lindanitolerans sp. nov. strain W...,['Kaminski Michal A' 'Sobczak Adam' 'Spolnik G...,2018,"An aerobic, Gram-stain-negative, rod-shaped, n..."
5,International journal of systematic and evolut...,"Mesosutterella multiformis gen. nov., sp. nov....",['Sakamoto Mitsuo' 'Ikeyama Nao' 'Kunihiro Tad...,2018,"Two novel, obligately anaerobic, Gram-stain-ne..."
6,International journal of systematic and evolut...,"Acinetobacter sichuanensis sp. nov., recovered...",['Qin Jiayuan' 'Hu Yiyi' 'Feng Yu' 'Lv Xiaoju'...,2018,"A novel Acinetobacter strain, WCHAc060041(T), ..."
7,International journal of systematic and evolut...,"Enterobacter sichuanensis sp. nov., recovered ...",['Wu Wenjing' 'Feng Yu' 'Zong Zhiyong'],2018,"An Enterobacter strain, WCHECL1597(T), was rec..."
8,International journal of systematic and evolut...,"Kribbella monticola sp. nov., a novel actinomy...",['Song Wei' 'Duan Liping' 'Zhao Junwei' 'Jiang...,2018,"A novel actinobacterium, designated strain NEA..."
9,International journal of systematic and evolut...,"Halomonas litopenaei sp. nov., a moderately ha...",['Xue Ming' 'Wen Chong-Qing' 'Liu Lan' 'Fang B...,2018,"Two Gram-stain negative, moderately halophilic..."


# Screening

In [7]:
# Electronic publication dates and publication dates sometimes diverge, which
# left some records with the year 2019; those records are removed here.
table_1_not_2019 = df.loc[df["Year"].ne(2019)]
table_1_not_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9751 entries, 0 to 9752
Data columns (total 5 columns):
Journal     9751 non-null object
Title       9751 non-null object
Authors     9751 non-null object
Year        9751 non-null int64
Abstract    9751 non-null object
dtypes: int64(1), object(4)
memory usage: 457.1+ KB


In [8]:
# Display the records whose title contains "Erratum to", "Comments on" or
# "Response to".
table_1_not_2019.loc[
    lambda table: table["Title"].str.contains("Erratum to|Comments on|Response to")
]

Unnamed: 0,Journal,Title,Authors,Year,Abstract
2722,Systematic and applied microbiology,"Response to: ""Comments on: ""A polyphasic appro...",['Gaget Virginie' 'Welker Martin' 'Rippka Rosm...,2015,no abstract
2764,Systematic and applied microbiology,"Comments on: ""A polyphasic approach leading to...",['Oren Aharon'],2015,no abstract
4223,Antonie van Leeuwenhoek,Erratum to: Two novel species Enterococcus lem...,['Cotta Michael A' 'Whitehead Terence R' 'Fals...,2013,A polyphasic taxonomic study using morphologic...


In [9]:
# Remove errata, comments and responses: keep only the rows whose title does
# not contain "Erratum to", "Comments on" or "Response to".
table_1_not_ecr = table_1_not_2019.loc[
    lambda table: ~table["Title"].str.contains("Erratum to|Comments on|Response to")
]
table_1_not_ecr.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 9748 entries, 0 to 9752
Data columns (total 5 columns):
Journal     9748 non-null object
Title       9748 non-null object
Authors     9748 non-null object
Year        9748 non-null int64
Abstract    9748 non-null object
dtypes: int64(1), object(4)
memory usage: 456.9+ KB


In [10]:
# Display the records whose abstract is the "no abstract" placeholder.
table_1_not_ecr.loc[table_1_not_ecr["Abstract"].eq("no abstract")]

Unnamed: 0,Journal,Title,Authors,Year,Abstract
3847,The Journal of general and applied microbiology,"Idiomarina piscisalsi sp. nov., from fermented...",['Sitdhipol Jaruwan' 'Visessanguan Wonnop' 'Be...,2013,no abstract
3848,The Journal of general and applied microbiology,"Swingsia samuiensis gen. nov., sp. nov., an os...",['Malimas Taweesak' 'Chaipitakchonlatarn Winai...,2013,no abstract
3975,The Journal of general and applied microbiology,"Sporomusa intestinalis sp. nov., a homoacetoge...",['Hattori Satoshi' 'Hongoh Yuichi' 'Itoh Takas...,2013,no abstract
3976,The Journal of general and applied microbiology,"Gemmatirosa kalamazoonesis gen. nov., sp. nov....",['DeBruyn Jennifer M' 'Fawaz Mariam N' 'Peacoc...,2013,no abstract
4057,The Journal of general and applied microbiology,Porphyrobacter colymbi sp. nov. isolated from ...,['Furuhata Katsunori' 'Edagawa Akiko' 'Miyamot...,2013,no abstract
4100,Mikrobiologiia,[Spirosoma xylofaga sp. nov. - oligotrophic pl...,['Zaichikova M V' 'Berestovskaia Iu Iu' 'Kuzne...,2013,no abstract
4117,The Journal of general and applied microbiology,"Nguyenibacter vanlangensis gen. nov., sp. nov....",['Thi Lan Vu Huong' 'Yukphan Pattaraporn' 'Cha...,2013,no abstract
4625,The Journal of general and applied microbiology,"Comamonas terrae sp. nov., an arsenite-oxidizi...",['Chipirom Kitja' 'Tanasupawat Somboon' 'Akara...,2012,no abstract
5002,Mikrobiologiia,"[Magnetospirillum aberrantis sp. nov., a new f...",['Gorlenko V M' 'Dziuba M V' 'Maleeva A N' 'Pa...,2011,no abstract
5035,The Journal of general and applied microbiology,"Gluconobacter uchimurae sp. nov., an acetic ac...",['Tanasupawat Somboon' 'Kommanee Jintana' 'Yuk...,2011,no abstract


In [11]:
# Keep only the records that have a real abstract (anything other than the
# "no abstract" placeholder), save them to CSV and summarise the result.
table_1_with_abstracts = table_1_not_ecr.loc[
    table_1_not_ecr["Abstract"].ne("no abstract")
]
table_1_with_abstracts.to_csv("table_1_with_abstracts.csv")
table_1_with_abstracts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9716 entries, 0 to 9752
Data columns (total 5 columns):
Journal     9716 non-null object
Title       9716 non-null object
Authors     9716 non-null object
Year        9716 non-null int64
Abstract    9716 non-null object
dtypes: int64(1), object(4)
memory usage: 455.4+ KB


In [12]:
# Display the records published in the journal "Protist".
table_1_with_abstracts.loc[lambda table: table["Journal"].eq("Protist")]

Unnamed: 0,Journal,Title,Authors,Year,Abstract
2173,Protist,"Paulinella longichromatophora sp. nov., a New ...",['Kim Sunju' 'Park Myung Gil'],2016,The freshwater testate filose amoeba Paulinell...


In [13]:
# Remove the records published in the journal "Protist", save the remainder
# to CSV and summarise the result.
# NOTE(review): the file name "table1_screening_final.csv" is written again
# by the cell that removes the Candidatus records, so this intermediate
# output is overwritten later in the notebook -- confirm whether a separate
# file name was intended here.
table_1_no_protist = table_1_with_abstracts.loc[
    table_1_with_abstracts["Journal"] != "Protist"
]
table_1_no_protist.to_csv("table1_screening_final.csv")
table_1_no_protist.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9715 entries, 0 to 9752
Data columns (total 5 columns):
Journal     9715 non-null object
Title       9715 non-null object
Authors     9715 non-null object
Year        9715 non-null int64
Abstract    9715 non-null object
dtypes: int64(1), object(4)
memory usage: 455.4+ KB


In [16]:
# Display the records whose title contains the word "Candidatus".
table_1_no_protist.loc[
    lambda table: table["Title"].str.contains("Candidatus")
]


Unnamed: 0,Journal,Title,Authors,Year,Abstract
783,Journal of fish diseases,Candidatus Actinochlamydia pangasiae sp. nov. ...,['Sood N' 'Pradhan P K' 'Verma D K' 'Yadav M K...,2018,Chlamydial infections are recognised as causat...
1220,Systematic and applied microbiology,'Candidatus Dichloromethanomonas elyunquensis'...,['Kleindienst Sara' 'Higgins Steven A' 'Tsemen...,2017,Taxonomic assignments of anaerobic dichloromet...
2554,Mikrobiologiia,"[Candidatus ""Jettenia moscovienalis"" sp. nov.,...",['Nikolaev A' 'Kozlov M N' 'Kevbrina M V' 'Dor...,2015,A new species of bacteria oxidizing ammonium w...
3070,Archives of microbiology,"A new intracellular bacterium, Candidatus Simi...",['Steigen Andreas' 'Karlsbakk Egil' 'Plarre He...,2015,Certain wrasse species (Labridae) are used as ...
4002,PloS one,"'Candidatus Megaira polyxenophila' gen. nov., ...",['Schrallhammer Martina' 'Ferrantini Filippo' ...,2013,Neglected Rickettsiaceae (i.e. those harboured...
4758,Extremophiles : life under extreme conditions,Amino acid-assimilating phototrophic heliobact...,['Asao Marie' 'Takaichi Shinichi' 'Madigan Mic...,2012,"Two novel taxa of heliobacteria, Heliorestis a..."
4872,Systematic and applied microbiology,"Candidatus ""Thiodictyon syntrophicum"", sp. nov...",['Peduzzi Sandro' 'Storelli Nicola' 'Welsh All...,2012,Strain Cad16(T) is a small-celled purple sulfu...
5564,Environmental microbiology,'Candidatus Liberibacter europaeus' sp. nov. t...,['Raddadi Noura' 'Gonella Elena' 'Camerota Cat...,2011,'Candidatus Liberibacter spp.' cause serious p...
5914,European journal of protistology,"Saccamoeba lacustris, sp. nov. (Amoebozoa: Lob...",['Corsaro Daniele' 'Michel Rolf' 'Walochnik Ju...,2010,"An amoeba isolated from an aquatic biotope, id..."
8037,International journal of systematic and evolut...,Isolates of 'Candidatus Nostocoida limicola' B...,['McKenzie C M' 'Seviour E M' 'Schumann P' 'Ma...,2006,"Despite differences in their morphologies, com..."


In [17]:
# Remove the records whose title contains "Candidatus"; the remaining table
# is the final screening result, which is saved to CSV and summarised.
table_1_screening_final = table_1_no_protist.loc[
    lambda table: ~table["Title"].str.contains("Candidatus")
]
table_1_screening_final.to_csv("table1_screening_final.csv")
table_1_screening_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9695 entries, 0 to 9752
Data columns (total 5 columns):
Journal     9695 non-null object
Title       9695 non-null object
Authors     9695 non-null object
Year        9695 non-null int64
Abstract    9695 non-null object
dtypes: int64(1), object(4)
memory usage: 454.5+ KB


# Eligibility

In [18]:
# Number of records per journal, most frequent first; show the first ten.
table_1_screening_final["Journal"].value_counts().iloc[:10]

International journal of systematic and evolutionary microbiology    7733
Antonie van Leeuwenhoek                                               682
Systematic and applied microbiology                                   292
Journal of microbiology (Seoul, Korea)                                168
Current microbiology                                                  161
Archives of microbiology                                              145
The Journal of general and applied microbiology                        69
Extremophiles : life under extreme conditions                          63
Journal of clinical microbiology                                       59
Applied and environmental microbiology                                 46
Name: Journal, dtype: int64

In [21]:
# Select the records published outside the six journals that contribute the
# most records (the first six entries of the journal counts listing).
main_journals = [
    "International journal of systematic and evolutionary microbiology",
    "Antonie van Leeuwenhoek",
    "Systematic and applied microbiology",
    "Journal of microbiology (Seoul, Korea)",
    "Current microbiology",
    "Archives of microbiology",
]
not_main_journals = table_1_screening_final[
    ~table_1_screening_final["Journal"].isin(main_journals)
]


In [29]:
# Preview the first five records from the less frequent journals.
not_main_journals.iloc[:5]

Unnamed: 0,Journal,Title,Authors,Year,Abstract
90,Microbes and environments,Presence of Cu-Type (NirK) and cd1-Type (NirS)...,['Jang Jeonghwan' 'Ashida Naoaki' 'Kai Ayaaki'...,2018,Nitrite reductase is a key enzyme for denitrif...
238,PloS one,Metabolic and taxonomic insights into the Gram...,['Sharma Vikas' 'Siedenburg Gabriele' 'Birke J...,2018,"The pathway of rubber (poly [cis-1,4-isoprene]..."
266,MicrobiologyOpen,Characterization of a novel Gram-stain-positiv...,['Diop Khoudia' 'Diop Awa' 'Khelaifia Saber' '...,2018,"Strain Marseille-P2341(T) , a nonmotile, nonsp..."
278,Microbiological research,Phenotypic and genotypic characterisation of a...,['Mogany Trisha' 'Swalaha Feroz M' 'Allam Mush...,2018,A novel halotolerant species of cyanobacterium...
289,Microbiology and immunology,"Pseudopropionibacterium rubrum sp. nov., a nov...",['Saito Masanori' 'Shinozaki-Kuwahara Noriko' ...,2018,"In this study, sStrain SK-1(T) , a novel gram-..."


In [34]:
# Row labels of the records judged not eligible.
# NOTE(review): presumably picked by manual inspection of not_main_journals;
# the labels are only valid for the row numbering produced by the loading
# cell -- confirm if that cell changes.
not_eligible_labels = [
    989, 2974, 3161, 3811, 4497, 5293, 5827, 7121,
    7233, 7569, 8410, 8490, 8548, 9029, 9070, 9213,
]
not_eligible = not_main_journals.loc[not_eligible_labels]

In [36]:
# Drop the hand-listed non-eligible records from the screened table and
# summarise the final eligible set. The labels are looked up in
# not_main_journals first, so only rows present there are targeted.
labels_to_exclude = [
    989, 2974, 3161, 3811, 4497, 5293, 5827, 7121,
    7233, 7569, 8410, 8490, 8548, 9029, 9070, 9213,
]
rows_to_exclude = not_main_journals.loc[labels_to_exclude]
table_1_eligible_final = table_1_screening_final.drop(rows_to_exclude.index)
table_1_eligible_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9679 entries, 0 to 9752
Data columns (total 5 columns):
Journal     9679 non-null object
Title       9679 non-null object
Authors     9679 non-null object
Year        9679 non-null int64
Abstract    9679 non-null object
dtypes: int64(1), object(4)
memory usage: 453.7+ KB
