In [23]:
from Bio import Medline
import pandas as pd
from affiliation_parser import parse_affil

In [24]:
with open("table1.csv", "w") as file: #extrai info do medline e faz uma tabela com as seguintes colunas: journal, title, authors, country, year
    with open("sp_nov_2001_2018_pubmed.medline") as handle:
        records = Medline.parse(handle)
        for record in records:
            title = record["TI"]
            journal = record["JT"]
            authors = str(record.get("FAU")).replace(',', '')
            affiliations = str(record.get("AD")) #se fazer um split, perde vários países
            country_first_author = parse_affil(affiliations)["country"] 
            year = record["DP"].split(" ")[0]
            try:
                abstract = record["AB"]
            except:
                abstract = "no abstract"
            file.write(journal + "\t" + title + "\t" + authors + "\t" + country_first_author + "\t" 
                       + year + "\t" + abstract + "\n")




In [25]:
df = pd.read_csv("table1.csv", sep="\t")
df.columns = ["Journal", "Title", "Authors", "First author country", "Year", "Abstract"]
df

Unnamed: 0,Journal,Title,Authors,First author country,Year,Abstract
0,International journal of systematic and evolut...,"Reichenbachiella versicolor sp. nov., isolated...",['Shi Ming-Jing' 'Wang Chong' 'Liu Zuan-Yan' '...,china,2018,"A Gram-stain-negative, strictly aerobic, non-f..."
1,International journal of systematic and evolut...,"Lactobacillus paragasseri sp. nov., a sister t...",['Tanizawa Yasuhiro' 'Tada Ipputa' 'Kobayashi ...,japan,2018,"Three strains, JCM 5343(T), JCM 5344 and JCM 1..."
2,International journal of systematic and evolut...,"Geobacillus yumthangensis sp. nov., a thermoph...",['Najar Ishfaq Nabi' 'Sherpa Mingma Thundu' 'D...,india,2018,"A thermophilic, spore-forming, rod-shaped bact..."
3,International journal of systematic and evolut...,"Chryseolinea flava sp. nov., a new species of ...",['Wang Jing-Jing' 'Chen Qi' 'Li Yue-Zhong'],china,2018,"A pale yellow bacterial strain, designated SDU..."
4,International journal of systematic and evolut...,"Ottowia konkukae sp. nov., isolated from rotte...",['Yi Kwon Jung' 'Im Wan-Taek' 'Kim Dong-Woon' ...,south korea,2018,"A Gram-negative, aerobic, non-motile, non-spor..."
5,Systematic and applied microbiology,Caulobacter zeae sp. nov. and Caulobacter radi...,['Gao Jun-Lian' 'Sun Pengbo' 'Sun Xiao-Hong' '...,china,2018,"Four bacterial strains designated 410(T), 441,..."
6,Systematic and applied microbiology,Physiological and genomic properties of Thermu...,['Zhou En-Min' 'Xian Wen-Dong' 'Jiao Jian-Yu' ...,china,2018,"Two closely related, thermophilic bacteria, de..."
7,International journal of systematic and evolut...,"Blastococcus litoris sp. nov., isolated from s...",['Lee Dong Wan' 'Lee Hanbyul' 'Kwon Bong-Oh' '...,south korea,2018,"A novel Gram-strain-positive, non-spore-formin..."
8,International journal of systematic and evolut...,"Mycobacterium decipiens sp. nov., a new specie...",['Brown-Elliott Barbara A' 'Simmer Patricia J'...,italy,2018,Two mycobacterial strains with close similarit...
9,Applied and environmental microbiology,"Genome analysis, metabolic potential and preda...",['Livingstone Paul G' 'Morphew Russell M' 'Coo...,united kingdom,2018,"Herpetosiphon spp are ubiquitous, chemoheterot..."


In [26]:
df["Authors"] = df["Authors"].apply(lambda x: x.lower()).apply(lambda x: x.replace("['","")).apply(lambda x: x.replace('["',"")).apply(lambda x: x.replace("']","")).apply(lambda x: x.replace('"]',"")).apply(lambda x: x.replace('"',"'")).apply(lambda x: x.split("' '"))

In [27]:
df[df["First author country"].isna()]

Unnamed: 0,Journal,Title,Authors,First author country,Year,Abstract
13,Systematic and applied microbiology,Genome sequences and description of novel exop...,"[skraban jure, cleenwerck ilse, vandamme peter...",,2018,Strains T5K1 and AV446 isolated from apple cid...
1189,Microbes and environments,"Streptomyces cameroonensis sp. nov., a Geldana...","[boudjeko thaddee, tchinda romaric armel mouaf...",,2017,"The taxonomy of an actinobacterial strain, des..."
1251,International journal of systematic and evolut...,"Campylobacter ornithocola sp. nov., a novel me...","[caceres alberto, munoz ivo, iraola gregorio, ...",,2017,During a study on the prevalence and diversity...
1264,International journal of systematic and evolut...,"Psychromonas aquatilis sp. nov., isolated from...","[kampfer peter, irgang rute, poblete-morales m...",,2017,"A slightly beige-white pigmented, Gram-stainin..."
1415,International journal of systematic and evolut...,"Macrococcus canis sp. nov., a skin bacterium a...","[gobeli brawand stefanie, cotting kerstin, gom...",,2017,Gram-stain-positive cocci were isolated from m...
1671,International journal of systematic and evolut...,"Pseudoduganella danionis sp. nov., isolated fr...","[kampfer peter, irgang rute, busse hans-jurgen...",,2016,"One beige-pigmented, Gram-staining-negative, r..."
1702,International journal of systematic and evolut...,"Sphingobacterium jejuense sp. nov., with ginse...","[siddiqi muhammad zubair, muhammad shafi siddi...",,2016,"A Gram-stain-negative, strictly aerobic, non-m..."
1834,International journal of systematic and evolut...,Undibacterium danionis sp. nov. isolated from ...,"[kampfer peter, irgang rute, busse hans-jurgen...",,2016,"One beige-pigmented, Gram-stain-negative, rod-..."
1858,International journal of systematic and evolut...,Campylobacter geochelonis sp. nov. isolated fr...,"[piccirillo alessandra, niero giulia, calleros...",,2016,During a screening study to determine the pres...
1974,Archives of microbiology,Description of Hydrogenophaga laconesensis sp....,"[mantri soniya, chinthalagiri mohan rao, gundl...",,2016,"A light cream color colony was isolated, using..."


# Contagem autores

In [28]:
#plus, contagem dos autores
x = list(df["Authors"])
from itertools import chain
y = list(chain.from_iterable(x))

In [6]:
from collections import Counter

In [7]:
frequency_authors = Counter(y)

In [8]:
for key, count in frequency_authors.items():
    name = key
    print([name, count])

['shi ming-jing', 7]
['wang chong', 4]
['liu zuan-yan', 2]
['jiang lai-xiang', 2]
['du zong-jun', 73]
['tanizawa yasuhiro', 2]
['tada ipputa', 1]
['kobayashi hisami', 3]
['endo akihito', 13]
['maeno shintaro', 1]
['toyoda atsushi', 1]
['arita masanori', 2]
['nakamura yasukazu', 1]
['sakamoto mitsuo', 47]
['ohkuma moriya', 67]
['tohno masanori', 11]
['najar ishfaq nabi', 1]
['sherpa mingma thundu', 1]
['das sayak', 2]
['verma kamalesh', 1]
['dubey vikash kumar', 1]
['thakur nagendra', 2]
['wang jing-jing', 1]
['chen qi', 3]
['li yue-zhong', 2]
['yi kwon jung', 1]
['im wan-taek', 147]
['kim dong-woon', 4]
['kim soo-ki', 5]
['gao jun-lian', 14]
['sun pengbo', 8]
['sun xiao-hong', 1]
['tong shuai', 1]
['yan hui', 8]
['han mei-lin', 1]
['mao xiao-jie', 4]
['sun jian-guang', 14]
['zhou en-min', 37]
['xian wen-dong', 9]
['jiao jian-yu', 10]
['liu lan', 29]
['li meng-meng', 2]
['ding yi-ping', 2]
['yin yi-rui', 17]
['zhao jiao', 2]
['nimaichand salam', 13]
['xiao min', 41]
['li wen-jun', 384]


['kim sang bum', 1]
['jeong ha yeon', 1]
['park beom young', 1]
['srinivasan sathiyaraj', 53]
['an dong-shan', 24]
['kim kyoung-ho', 19]
['lee jae-bong', 9]
['hong sumin', 1]
['park su-jin', 15]
['chen yufei', 1]
['oh ji hye', 2]
['seo hyun-seok', 14]
['lee jung-hyun', 20]
['guo jian-wei', 13]
['wang yun', 23]
['bobodzhanova khursheda', 1]
['xiao tao', 1]
['tegtmeier dorothee', 2]
['belitz alexandra', 1]
['radek renate', 4]
['heimerl thomas', 1]
['brune andreas', 8]
['mohamad osama abdalla', 1]
['jiang chenying', 1]
['hilgarth maik', 1]
['fuertes sandra', 1]
['ehrmann matthias', 1]
['vogel rudi f', 8]
['kusada hiroyuki', 1]
['kameyama keishi', 1]
['kamagata yoichi', 38]
['xu guangtang', 1]
['lin caili', 1]
['wang xizhuo', 1]
['divyasree b', 9]
['suresh g', 3]
['yang xiao-deng', 2]
['he hai-lun', 3]
['ning daliang', 1]
['du zongjun', 2]
['kumar narender', 7]
['chaudhry vasvi', 2]
['sahni girish', 1]
['puche rafael', 1]
['ferres ignacio', 1]
['caraballo lizeth', 1]
['rangel yaritza', 1]


['armsrtong nicholas', 1]
['di pinto fabrizio', 3]
['selvapravin kumaran', 1]
['sundararaman aravind', 4]
['haas kelly nicole', 1]
['blanchard jeffrey l', 1]
['lin huina', 2]
['wang yong', 2]
['huang jiaomei', 1]
['kleindienst sara', 1]
['higgins steven a', 1]
['tsementzi despina', 1]
['chen gao', 1]
['konstantinidis konstantinos t', 3]
['mack e erin', 1]
['loffler frank e', 7]
['zhang liguo', 1]
['zhang xiaofei', 1]
['huang jinying', 1]
['zhao yu', 1]
['zhao yuanling', 1]
['liu jianxin', 1]
['huang cui', 1]
['wang jing', 3]
['hu yingying', 1]
['ren guoling', 1]
['xu xiuhong', 1]
['alou m t', 1]
['labas n', 4]
['couderc c', 4]
['diallo a', 3]
['fonkou m d m', 1]
['weber carolyn f', 1]
['boudjeko thaddee', 1]
['tchinda romaric armel mouafo', 1]
['zitouni mina', 1]
['nana joelle aimee vera tchatchou', 1]
['lerat sylvain', 1]
['beaulieu carole', 1]
['ay hilal', 1]
['selyanin vadim', 2]
['lukes martin', 2]
['dean jason', 2]
['kaftan david', 2]
['koblizek michal', 4]
['doi hiroyasu', 1]
['o

['lemenand olivier', 1]
['caillon jocelyne', 1]
['corvec stephane', 1]
['cheung candy l w', 1]
['huang yi', 4]
['yip eric k t', 1]
['ng kenneth h l', 1]
['que tak-lun', 1]
['matsumoto takehisa', 1]
['negishi tatsuya', 1]
['komaki hisayuki', 3]
['gonoi tohru', 3]
['yaguchi takashi', 2]
['songsumanus apakorn', 3]
['yao li', 4]
['zhang jun-jie', 1]
['yu lin-lu', 1]
['chen qin', 1]
['zhu jian-chun', 4]
['ding de-rong', 1]
['li chongping', 3]
['jeong sun hwan', 1]
['lee sang seob', 3]
['cabulong rhudith b', 1]
['hong soon-kwang', 2]
['filippini gianfranco', 3]
['pisi annamaria', 2]
['shamseldin abdelaal', 1]
['moawad hassan', 1]
['sadowsky michael j', 1]
['schuster julia', 1]
['bass david', 1]
['liang pan', 1]
['li hao', 5]
['liu minyuan', 1]
['xue zhaocheng', 1]
['flor-weiler lina', 2]
['kim wan-hoe', 2]
['kim do-hak', 1]
['min ui-gi', 4]
['hong heeji', 4]
['na jeong-geol', 3]
['nurkanto arif', 5]
['ratnakomala shanti', 8]
['lisdiyanti puspita', 10]
['jeong hye im', 4]
['chang yaoguang', 1

['lo naysim', 5]
['liu ping', 2]
['homonnay zalan g', 4]
['nemes-barnas katalin', 1]
['lucena-padros helena', 3]
['caballero-guerrero belen', 3]
['luis ruiz-barba jose', 1]
['maldonado-barragan antonio', 4]
['mcginnis jana m', 1]
['cole jocelyn a', 5]
['dickinson michelle c', 2]
['lapierre pascal', 1]
['al ruwaili jamal', 1]
['agsar dayanand', 1]
['chari abhishek', 1]
['oakeson kelly f', 1]
['enomoto shinichiro', 1]
['jackson d grant', 1]
['fisher mark a', 2]
['dale colin', 1]
['tejedor gil carmen', 1]
['palomo jose luis', 2]
['garcia benavides pablo', 1]
['fernandez pascual mercedes', 1]
['jang hani', 2]
['lee sang suk', 2]
['xia cong cong', 1]
['ding shu lan', 1]
['liu hong-ming', 3]
['zhang rong', 3]
['chen ding-bin', 2]
['robnett christie j', 5]
['gaget virginie', 1]
['welker martin', 1]
['rippka rosmarie', 1]
['de marsac nicole tandeau', 1]
['yao yao', 1]
['chen wen xn', 1]
['meng xiao-ling', 1]
['ward alan c', 9]
['pan xin-chi', 5]
['geng shuang', 6]
['lv xiang-lin', 6]
['mei ran

['hong seok-hyun', 2]
['hattori satoshi', 2]
['deevong pinsurang', 1]
['noparatnaraporn napavarn', 1]
['kudo toshiaki', 1]
['debruyn jennifer m', 1]
['fawaz mariam n', 1]
['peacock aaron d', 4]
['dunlap john r', 2]
['nixon lauren t', 1]
['cooper katherine e', 1]
['radosevich mark', 1]
['shao rui', 1]
['liu zhi-heng', 6]
['dai huan-qin', 4]
['song fu-hang', 2]
['jiang yang', 2]
['junicke helena', 1]
['kleerebezem robbert', 3]
['van loosdrecht mark c m', 1]
['kirk karen e', 1]
['hoffman jessica a', 1]
['smith katherine a', 1]
['strahan brittane l', 2]
['failor kevin c', 2]
['krebs jordan e', 1]
['gale andrew n', 1]
['do tri d', 1]
['sontag thomas c', 1]
['batties allison m', 2]
['mistiszyn kimberly', 1]
['van pham h t', 1]
['mishra ajay kumar', 1]
['matsunobu shun', 1]
['morifuku youji', 1]
['enokida yuya', 1]
['park seong chan', 28]
['kim eun mi', 13]
['seo dong-cheol', 1]
['cho ju-sik', 1]
['chandna piyush', 1]
['kuhad ramesh chander', 1]
['kim mi na', 11]
['bae taeok', 1]
['guerin-fau

['yu jun', 1]
['svetashev vasily i', 4]
['gonzalez-tirante maria', 1]
['sistek viridiana', 1]
['cantin philippe', 1]
['bergeron michel g', 1]
['baek sang-hoon', 13]
['kim ki-jeong', 1]
['scotta claudia', 1]
['bennasar a', 2]
['lalucat j', 1]
['gomila m', 1]
['briantseva i a', 3]
['turova t p', 19]
['flores gilberto e', 2]
['hunter ryan c', 1]
['gales g', 1]
['chehider n', 1]
['joulian c', 1]
['battaglia-brunet f', 1]
['borgomano j', 1]
['neria-gonzalez i', 1]
['lomans b p', 2]
['ollivier b', 5]
['trigui hana', 1]
['masmoudi salma', 1]
['dukan sam', 1]
['barcellos fernando gomes', 1]
['thompson fabiano lopes', 1]
['garcia ronald o', 2]
['gerth klaus', 1]
['irschik herbert', 1]
['fang caiyuan', 5]
['luo yuan-rong', 1]
['huang xu', 1]
['kwon kaekyoung', 1]
['zheng tian-ling', 1]
['hu huo', 2]
['xie xin-qiang', 2]
['scholz h c', 2]
['loncaric i', 1]
['whatmore a m', 1]
['heyrman jeroen', 11]
['oh yong-sik', 6]
['lee sang-ah', 1]
['cai yimin', 2]
['pang huili', 2]
['engene niclas', 1]
['rot

['hirsch peter', 7]
['xin yu hua', 3]
['cowan don a', 2]
['burton stephanie g', 1]
['merhej vicky', 2]
['straka j', 1]
['newcombe david', 1]
['dekas anne', 1]
['beaz hidalgo roxana', 1]
['chakravarthy s kalyan', 1]
['sucharitha k', 2]
['lee chang-soo', 1]
['kita-tsukamoto kumiko', 2]
['ikemoto eiko', 1]
['shi jin-xiao', 3]
['sasaya kinuyo', 1]
['ohiwa hitomi', 2]
['ikeno hironori', 2]
['ayame shohei', 2]
['kataoka naoaki', 2]
['miya akiko', 2]
['zhang li-min', 1]
['zhang xiu-min', 1]
['park so yeon', 1]
['takagaki akiko', 1]
['matsumoto kohei', 1]
['kato yuko', 6]
['goto keiichi', 12]
['sallam ahmed', 1]
['nelson david m', 1]
['glawe adam j', 1]
['cann isaac k o', 2]
['jimenez-pranteda maria l', 1]
['durban juan j', 1]
['russell nick j', 3]
['luo xueshong', 1]
['takadera takahide', 1]
['imamura nobutaka', 1]
['an kwang-deuk', 1]
['nagao tomokazu', 1]
['sano hiroshi', 1]
['braune annett', 1]
['gordon nathan s', 1]
['valenzuela alejandra', 1]
['adams sandra m', 1]
['ramsey philip w', 1]


['zhang tong', 1]
['fan xiaojun', 1]
['fang herbert h p', 1]
['haghdoost siamak', 1]
['stallwood bethan', 1]
['kimura sakurako', 1]
['tseng i-cheng', 2]
['cheng sheng-shung', 2]
['kinkel linda l', 1]
['samac deborah a', 1]
['jeong hack seong', 1]
['mortier stefanie', 1]
['dawyndt peter', 2]
['janssens danielle', 4]
['lee jun won', 1]
['leggiadro cindy', 1]
["o'neil david", 1]
['majumdar swati', 1]
['prabhagaran s r', 1]
["o'sullivan louise a", 2]
['rinna joachim', 2]
['humphreys gavin', 2]
['weightman andrew j', 2]
['fry john c', 3]
['berger pierre', 2]
['ma kai', 2]
['wartiainen ingvild', 1]
['hestnes anne grethe', 1]
['ghosh wriddhiman', 2]
['roy pradosh', 3]
['maruyama tomoko', 1]
['park ho-dong', 1]
['ozawa kazuhiko', 1]
['tanaka yoshinori', 1]
['sumino tatsuo', 1]
['kato kenji', 1]
['shin do-yun', 2]
['oh jong-won', 1]
['ju yoon jung', 2]
['nagy moria', 1]
['konstantinov sergey r', 2]
['poznanski elisa', 1]
['akkermans antoon d l', 3]
['silva luis r', 1]
['higashiguchi dennis t', 

['yuki norikatsu', 1]
['kado yukiko', 1]
['shimazaki tomoko', 1]
['yuyama teruhiko', 1]
['collins matthhew d', 1]
['jimenez francisco', 1]
['gilhaus helga', 1]
['beer w', 1]
['mietke henriette', 1]
['gelderblom h r', 1]
['burghardt barbel', 1]
['voigt w', 1]
['reissbrodt r', 1]
['denger karin', 1]
['muxi lucia', 1]
['bjorkroth k johanna', 1]
['geisen rolf', 1]
['korkeala hannu j', 1]
['daane l l', 1]
['harjono i', 1]
['barns s m', 1]
['launen l a', 1]
['palleron n j', 1]
['haggblom m m', 2]
['kim chul-joong', 1]
['poco sergio e jr', 1]
['sato michiko', 1]
['ikeda tetsuro', 1]
['kalfas sotos', 1]
['sundqvist goran', 1]
['hoshino etsuro', 1]
['reva oleg n', 1]
['smirnov valerie v', 1]
['pettersson bertil', 1]
['coram nicolette j', 1]
['rawlings douglas e', 1]
['hoyles l', 7]
['tornqvist e', 1]
['von essen r', 1]
['borodina elena', 1]
['ward-rainey naomi l', 1]
['garnova e s', 2]
['tee w', 1]
['midolo p', 1]
['kerr t', 1]
['dyall-smith m l', 1]
['riessen s', 1]
['antranikian g', 3]
['yaki

In [9]:
df_authors = pd.DataFrame.from_dict(frequency_authors, orient='index')

In [10]:
df_authors.columns = ["freq"]

In [11]:
df_authors.to_csv("authors_freq.txt")