## Goal: Use `langdetect` to filter out non-English articles and reduce noise in downstream NLP

In [47]:
import pickle
import re
import string

import numpy as np
import pandas as pd
from langdetect import detect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

In [29]:
# Load the pickled article table into `df`.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
with open("./data/clean_df.pkl", mode="rb") as pkl_file:
    df = pickle.load(pkl_file)

# Preview the first rows.
df.head()

Unnamed: 0,cord_uid,source_x,title,doi,abstract,publish_time,authors,journal,url,publish_year
0,gl6kg2gt,WHO,Progress Toward Poliovirus Containment Impleme...,,"Since 1988, when World Health Organization (WH...",2020-01-01,"Moffett, Daphne B; Llewellyn, Anna; Singh, Har...",MMWR Morb Mortal Wkly Rep,,2020
1,jjgqnzp6,WHO,Anti)social Monitoring: Law and (or) Expediency?,,The paper is focused on analysis of compliance...,2020-01-01,"Savelyev, Alexander I.",Zakon,,2020
2,s5hd55ch,WHO,Running of high patient volume radiation oncol...,,"Purpose: Due to COVID 19 pandemic, the treatme...",2020-01-01,"Gupta, Manoj; Ahuja, Rachit; Gupta, Sweety; Jo...",Radiat. Oncol. J.,,2020
3,zl9kf1ax,MedRxiv; WHO,Rapid Development of a De Novo Convalescent Pl...,10.1101/2020.10.23.20217901,BackgroundWith no vaccine or treatment for SAR...,2020-01-01,"Reik, Rita; Gammon, Richard; Carol, Nancy; Smi...",,http://medrxiv.org/cgi/content/short/2020.10.2...,2020
4,gzjfyitd,WHO,COVID – 19 and gravid mothers,,Human history is observing a very horrible and...,2020-01-01,"Sadavarte, S.; Chaudhari, B. V.",International Journal of Research in Pharmaceu...,,2020


In [45]:
def detectlang(s):
    """Return the langdetect language code for ``s``, or NaN when undetectable.

    Parameters
    ----------
    s : str
        Text to classify (in this notebook: an article title).

    Returns
    -------
    str or float
        The code returned by ``langdetect.detect`` (e.g. ``"en"``), or
        ``np.nan`` when ``s`` is not a string, is empty/whitespace-only, or
        langdetect raises ``LangDetectException`` for it.
    """
    # Missing values (None/NaN) and blank strings carry no language signal;
    # short-circuit so they never reach the detector.
    if not isinstance(s, str) or not s.strip():
        return np.nan
    try:
        return detect(s)
    except LangDetectException:
        # Only langdetect's own failure is treated as "unknown". Anything else
        # (KeyboardInterrupt, a broken import, ...) propagates instead of being
        # silently turned into NaN as a bare `except:` would do.
        return np.nan

In [46]:
# Sanity-check the helper on a single title (row label 12) before running it
# on the whole column: prints the detected code, then the title itself.
# NOTE(review): this title is wrapped in square brackets; in some bibliographic
# sources that marks a title translated from another language. If so,
# title-based detection labels the article 'en' even when its text is not
# English -- TODO confirm against the data source.
title = df.loc[12, "title"]
print(detectlang(title), title, sep="\n")

en
[Changes in adherence to the Mediterranean diet observed in a Spanish population during confinement for the SARS-CoV-2 pandemic]


In [50]:
# Fix langdetect's seed before the bulk run: its detection is randomised, so
# without a fixed seed short or ambiguous titles can receive different codes
# from one execution to the next.
DetectorFactory.seed = 0

# Detect directly on the title column. Titles are deliberately NOT cast with
# astype(str) first: that turns missing titles into the literal text "nan" and
# hands it to the detector as if it were a real title. Non-string values make
# detectlang return NaN instead, and the already-displayed `title` column is
# left unmodified.
df['lang'] = df['title'].map(detectlang)

In [51]:
# Eyeball ten randomly drawn rows together with their detected language.
df.sample(n=10)

Unnamed: 0,cord_uid,source_x,title,doi,abstract,publish_time,authors,journal,url,publish_year,lang
12606,k9s691pd,WHO,Preventing vitamin D deficiency during the COV...,,There is growing evidence linking vitamin D de...,2020-01-01,"Griffin, G.; Hewison, M.; Hopkin, J.; Kenny, R...",Clin Med (Lond),,2020,en
260171,wn85ilbh,BioRxiv; WHO,Rapid and Efficient Inactivation of SARS-CoV-2...,10.1101/2021.04.20.440654,Efforts are underway to develop countermeasure...,2021-04-21,"Dwivedi, Varun; Park, Jun-Gyu; Grenon, Stephen...",bioRxiv,https://doi.org/10.1101/2021.04.20.440654,2021,en
169463,sbkkkicf,PMC,COVID-19: Harbinger of a New Psychology of Rel...,10.1007/s42087-020-00167-x,"Religious practice, like every other human aff...",2020-11-24,"Akpan, James J.",Hu Arenas,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,2020,en
214224,y9adt4cc,Medline; PMC,Recruitment and Baseline Characteristics of Pa...,10.3390/ijerph18020408,"Targeting dementia prevention, first trials ad...",2021-01-07,"Röhr, Susanne; Zülke, Andrea; Luppa, Melanie; ...",Int J Environ Res Public Health,https://www.ncbi.nlm.nih.gov/pubmed/33430189/;...,2021,en
189523,vjmdi11d,WHO,Comparative phylogenetic analysis of SARS-CoV-...,,Coronavirus disease 2019 has developed into a ...,2021-01-01,"Ghorbani, Abozar; Samarfard, Samira; Eskandarz...",Brief. bioinform,,2021,en
18026,s5sequa5,WHO,Considerations on the restriction of assisted ...,,The rapid rise of novel coronavirus disease 20...,2020-01-01,"Gemmell, L. C.; Williams, Z.; Forman, E. J.",Seminars in Perinatology,,2020,en
54000,bau2frq5,Medline,Attenuated Renal and Hepatic Cells Apoptosis F...,10.2147/cia.s250321,Background/Objective One of the problems assoc...,2020-01-01,"Farzanegi, Parvin; Abbaszadeh, Hajar; Farokhi,...",Clinical interventions in aging,https://doi.org/10.2147/cia.s250321; https://w...,2020,en
209515,e15f9z7s,WHO,The impact of covid-19 pandemic on food suffic...,,Bantul is one of regencies in Special Region o...,2021-01-01,"Fitriana, L.; Susanto, S.; Ngadisih, Setyawan ...",IOP Conf. Ser. Earth Environ. Sci.,,2021,en
114810,08p122lw,Medline; PMC,Blocking IL-1 to prevent respiratory failure i...,10.1186/s13054-020-03166-0,COVID-19 is an emerging disease that can manif...,2020-07-18,"van de Veerdonk, Frank L.; Netea, Mihai G.",Crit Care,https://www.ncbi.nlm.nih.gov/pubmed/32682440/;...,2020,en
169585,89szwh99,Medline; PMC,Massive dissemination of a SARS-CoV-2 Spike Y8...,10.1080/22221751.2020.1844552,Genomic surveillance of SARS-CoV-2 was rapidly...,2020-11-25,"Borges, Vítor; Isidro, Joana; Cortes-Martins, ...",Emerging microbes & infections,https://www.ncbi.nlm.nih.gov/pubmed/33131453/;...,2020,en


In [52]:
# Rows per detected language, most frequent first.
# value_counts() leaves out NaN by default, so titles for which detection
# failed are not listed here.
df["lang"].value_counts()

en       258961
es         2683
de         1645
fr         1565
pt         1163
it          866
ca          360
ro          272
nl          222
af          105
da           86
tl           82
hu           82
id           65
no           61
tr           48
et           46
sv           33
pl           33
ru           30
cy           29
lt           25
sl           18
ko           16
zh-cn        16
hr           14
vi           13
so           11
fi           10
cs            9
fa            8
sk            6
bg            6
sw            5
uk            4
lv            4
ja            4
ar            3
sq            1
Name: lang, dtype: int64

In [56]:
# Keep only the rows whose title was detected as English, to reduce noise in NLP.
df_en = df.loc[df["lang"] == "en"]

In [57]:
# (rows, columns) of the English-only subset.
df_en.shape

(258961, 11)

In [59]:
# Write the English-only rows to a CSV file in the working directory,
# without the index column.
df_en.to_csv(path_or_buf="clean_df_eng.csv", index=False)