## Installing Libraries

In [2]:
!pip install sklearn

# !pip install Keras-Applications
# !pip install xgboost
# !pip install langdetect 
# !pip install requests
# !pip install beautifulsoup4
# !pip install scispacy
# !pip install pickle5
# !pip install langdetect 
# !pip install plotly
# !pip install pydotplus
# !pip install graphviz
# !pip install /Users/gwolfe/Downloads/cudf-0.6.1.post1.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz



In [5]:
# Attempt to create a "rapids" conda environment with cudf and verify the import.
# NOTE(review): the recorded output of this cell shows `conda` was not on PATH and
# that `import cudf` failed, so nothing below took effect on the machine it ran on.
# NOTE(review): each `!` line runs in its own shell process, so the `export` and
# `source activate` lines cannot influence the commands that follow them — this
# sequence would need to run in one shell (or a terminal) to work. Confirm before reuse.
!conda create -n rapids python=3.7.3
!conda install  -n rapids -c numba -c conda-forge -c nvidia -c rapidsai/label/cuda10.0 -c defaults cudf
!export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/cuda/targets/x86_64-linux/lib:/usr/local/cuda/compat
!source activate rapids
!python -c "import cudf;print(cudf.__version__)"

/bin/sh: conda: command not found
/bin/sh: conda: command not found
/bin/sh: activate: No such file or directory
Traceback (most recent call last):
  File "<string>", line 1, in <module>
ImportError: No module named cudf


In [1]:
from project_template_code import *
import datetime
import pandas as pd
import pickle5 as pickle

In [2]:
import os

# Dataset folder relative to the project root. Built with os.path.join so the
# separator is correct on every OS (a literal backslash is just part of the file
# name on Linux/macOS).
relative_path = os.path.join('data', 'CORD-19-research-challenge')
# os.getcwd() is the kernel's working directory (a directory, not a script file).
script_path = os.getcwd()
# The dataset lives next to the working directory, i.e. under its parent.
script_dir = os.path.dirname(script_path)
root_path = os.path.join(script_dir, relative_path)
print(root_path)

C:\Users\MV_Elnashar_PC\projects\kaggle\COVID-19\data\CORD-19-research-challenge


## Load MetaData
Load `metadata.csv`, which provides the schema and the per-paper metadata (DOI, title, authors, …) for the dataset.

In [None]:
# Read the metadata table, then pickle it.
# Identifier-like columns are read as strings so pandas does not turn them into numbers.
id_column_types = {
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str,
}
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype=id_column_types, low_memory=False)

# write_pickles (from project_template_code) is given the variable *name* —
# presumably it resolves the object itself; confirm in that module.
write_pickles('meta_df')

## Load JSON

In [None]:
import glob

# Recursively collect every JSON file below the dataset root.
# glob returns matches in arbitrary order, so sort them to make the file order
# (and therefore the row order of anything built from it) reproducible.
all_json = sorted(glob.glob(f'{root_path}/**/*.json', recursive=True))

# A bare `len(all_json)` in the middle of a cell is evaluated and thrown away;
# print it so the file count actually shows up in the cell output.
print(len(all_json))

# write_pickles (from project_template_code) is given the variable *name* —
# presumably it resolves the object itself; confirm in that module.
write_pickles('all_json')


## Build DataFrame from JSON

In [None]:
import json


class FileReader:
    """Load one paper JSON file and flatten its text sections.

    Attributes:
        paper_id: value of the top-level ``paper_id`` key.
        abstract: the ``text`` of every ``abstract`` entry joined with newlines;
            an empty string when the file has no ``abstract`` key.
        body_text: the ``text`` of every ``body_text`` entry joined with newlines.

    Raises:
        OSError: the file cannot be opened.
        ValueError: the file is not valid JSON / not valid UTF-8.
        KeyError: ``paper_id``, ``body_text`` or an entry's ``text`` is missing.
    """

    def __init__(self, file_path):
        # Be explicit about UTF-8: without it open() uses the platform's locale
        # encoding, which makes non-ASCII papers fail to load on e.g. Windows.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)

        self.paper_id = content['paper_id']
        # The abstract section is optional; the body is required.
        self.abstract = '\n'.join(
            entry['text'] for entry in content.get('abstract', [])
        )
        self.body_text = '\n'.join(entry['text'] for entry in content['body_text'])

    def __repr__(self):
        # Short preview: id plus the first 200 characters of each text section.
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

In [None]:
def get_breaks(content, length):
    """Re-join the space-separated words of ``content`` with ``<br>`` line breaks.

    Word lengths (spaces excluded) are accumulated; the word that pushes the
    running total above ``length`` is prefixed with ``<br>`` and the total is
    reset to zero. Every other word is prefixed with a single space, so the
    result starts with a space unless the first word already exceeds ``length``.
    """
    pieces = []
    running = 0
    for word in content.split(' '):
        running += len(word)
        if running > length:
            pieces.append("<br>" + word)
            running = 0
        else:
            pieces.append(" " + word)
    return "".join(pieces)

In [None]:
# Build df_covid: one row per paper whose JSON file parses and whose paper_id
# matches a `sha` in the metadata table.
dict_ = {'paper_id': [], 'doi':[], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'abstract_summary': []}

# Report progress roughly every 10% of the files. max(1, ...) keeps the modulo
# valid when there are fewer than 10 files (the step would otherwise be 0).
progress_step = max(1, len(all_json) // 10)
skipped_invalid = 0

for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')

    try:
        content = FileReader(entry)
    except Exception:
        # Invalid paper format: skip it, but keep count so the loss is visible.
        skipped_invalid += 1
        continue

    # Metadata rows for this paper (looked up once and reused below).
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue

    dict_['abstract'].append(content.abstract)
    dict_['paper_id'].append(content.paper_id)
    dict_['body_text'].append(content.body_text)

    # Summary of the abstract, with <br> breaks, to be used in a plot.
    abstract_words = content.abstract.split(' ')
    if len(content.abstract) == 0:
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(abstract_words) > 100:
        # too long for the plot: keep the first 100 words and append "..."
        summary = get_breaks(' '.join(abstract_words[:100]), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        dict_['abstract_summary'].append(get_breaks(content.abstract, 40))

    # Authors: a ';'-separated string in the metadata, or a non-string (missing) value.
    raw_authors = meta_data['authors'].values[0]
    try:
        authors = raw_authors.split(';')
        joined_authors = '. '.join(authors)
        # more than 2 authors will not fit on one plot line: insert html breaks
        authors_text = get_breaks(joined_authors, 40) if len(authors) > 2 else joined_authors
    except Exception:
        # value is not a string (e.g. missing): store it unchanged
        authors_text = raw_authors
    dict_['authors'].append(authors_text)

    # Title, with breaks added when needed.
    raw_title = meta_data['title'].values[0]
    try:
        title_text = get_breaks(raw_title, 40)
    except Exception:
        # title was not provided as a string: store the raw value unchanged
        title_text = raw_title
    dict_['title'].append(title_text)

    # add the journal information
#     dict_['journal'].append(meta_data['journal'].values[0])

    # add doi
    dict_['doi'].append(meta_data['doi'].values[0])

if skipped_invalid:
    print(f'Skipped {skipped_invalid} files that could not be parsed')

df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract', 'body_text', 'authors', 'title', 'abstract_summary'])
df_covid.head()

In [None]:
# Pickle df_covid; a later cell reloads it with read_pickles('df_covid', <date>).
# write_pickles (from project_template_code) receives the variable *name* as a string —
# presumably it looks the object up itself; confirm in that module.

write_pickles('df_covid')

### Languages
We need to determine the language of each paper in the dataframe.

In [3]:
# Reload the pickled frame. The second argument is a hard-coded date —
# presumably the day the pickle was written; confirm in project_template_code.
df_covid = read_pickles('df_covid', '2020-05-30')

# language_selection comes from project_template_code; it is run over the
# body_text column with 'en' passed as the third argument.
languages = language_selection(df_covid, 'body_text', 'en')

# Pickle the result under today's date. The context manager guarantees the file
# is closed even when pickle.dump raises.
date_of_launch = str(datetime.date.today())
with open(f'pickled_languages{date_of_launch}.pickle', 'wb') as fileObject:
    pickle.dump(languages, fileObject)


100%|██████████| 32417/32417 [02:41<00:00, 200.63it/s]


## Data Pre-Processing

### Handle Duplicates

In [26]:
# Run the project's text_preprocessing helper on the abstract and body_text columns
# (presumably this is where duplicates are dropped — confirm in project_template_code),
# then summarise the abstract column.
df = text_preprocessing(df_covid, 'abstract', 'body_text')
df['abstract'].describe(include='all')

count                                                 22525
unique                                                22525
top       Novel DNA sequencing techniques, referred to a...
freq                                                      1
Name: abstract, dtype: object

In [27]:
# Summary (count / unique / top / freq) of the body_text column.
df['body_text'].describe(include='all')

count                                                 22525
unique                                                22525
top       The coronaviruses (CoV, family Coronaviridae) ...
freq                                                      1
Name: body_text, dtype: object

In [28]:
# Descriptive statistics for the numeric word-count columns of df.
df.describe()

Unnamed: 0,abstract_word_count,body_word_count,body_unique_words
count,22525.0,22525.0,22525.0
mean,218.434406,4422.461443,1383.426371
std,139.639545,3666.115985,705.186983
min,1.0,23.0,23.0
25%,148.0,2731.0,991.0
50%,200.0,3834.0,1278.0
75%,256.0,5411.0,1648.0
max,3694.0,232431.0,30523.0


### Data Clean-Up

We need to clean up the data to improve any clustering or classification efforts.

In [29]:
# Keep only the papers detected as English.
# .copy() makes df an independent frame: new columns are assigned to it in a
# later cell, which on a filtered slice triggers pandas' SettingWithCopyWarning.
df = df[df['language'] == 'en'].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22160 entries, 1342 to 32416
Data columns (total 12 columns):
paper_id               22160 non-null object
doi                    22160 non-null object
abstract               22160 non-null object
body_text              22160 non-null object
authors                22160 non-null object
title                  22160 non-null object
journal                22160 non-null object
abstract_summary       22160 non-null object
abstract_word_count    22160 non-null int64
body_word_count        22160 non-null int64
body_unique_words      22160 non-null int64
language               22160 non-null object
dtypes: int64(3), object(9)
memory usage: 2.2+ MB


In [33]:
# Pull out the two text columns that get tokenised next.
# Despite the "_sample" names, these are the full columns of df, not subsets.
body_text_sample_df = df["body_text"]
print(body_text_sample_df)
abstract_sample_df = df['abstract']
print(abstract_sample_df)

1342     iNTRODUCTiON Human beings are constantly expos...
1343     Pathogens and vectors can now be transported r...
1344     a1111111111 a1111111111 a1111111111 a111111111...
1345     In addition to preventative care and nutrition...
1346     Ubiquitination is a widely used posttranslatio...
                               ...                        
32410    the viral receptor induces conformational chan...
32411    Immunocompromised patients with underlying hem...
32412    Over the past few decades, the world has witne...
32413    Regardless of geographic location, respiratory...
32416    Tick-borne encephalitis virus (TBEV) is a huma...
Name: body_text, Length: 22160, dtype: object
1342     Dendritic cells (DCs) are specialized antigen-...
1343     Dengue has a negative impact in low-and lower ...
1344     Fecal microbial transplantation (FMT), a treat...
1345     Fifteen years ago, United Nations world leader...
1346     Posttranslational modification of proteins by ...
          

In [34]:
# pickle df

tqdm.pandas()
# df["processed_text"] = df["body_text"].progress_apply(spacy_tokenizer)
df["processed_body_text_sample"] = body_text_sample_df.progress_apply(spacy_tokenizer)
df["processed_abstract_sample"] = abstract_sample_df.progress_apply(spacy_tokenizer)

# pickl write df

write_pickles('df')

In [1]:
# Search-term lists, one per topic. Entries are a mix of plain substrings and
# raw-string regular expressions (e.g. r'\bbmi\b'), so they are presumably fed to a
# regex search over normalised, lower-cased text — confirm against the search function.
smoking_synonyms = ['smoking',
                    'smoke',
                    'cigar', # this picks up cigar, cigarette, e-cigarette, etc.
                    'nicotine',
                    'cannabis',
                    'marijuana']

age_synonyms = ['median age',
                'mean age',
                'average age',
                'elderly',
                r'\baged\b',
                r'\bold',
                'young',
                'teenager',
                'adult',
                'child'
               ]

# NOTE(review): besides names for the disease/virus, this list also contains
# study-design phrases ('risk factor analysis' … 'syndromic surveillance') — confirm
# that is intended. The broad 'corona' entry also matches everything the longer
# 'coronavirus …' entries match.
covid19_synonyms = ['covid',
                    'coronavirus disease 19',
                    'sars cov 2', # Note that search function replaces '-' with ' '
                    '2019 ncov',
                    '2019ncov',
                    r'2019 n cov\b',
                    r'2019n cov\b',
                    'ncov 2019',
                    r'\bn cov 2019',
                    'coronavirus 2019',
                    'wuhan pneumonia',
                    'wuhan virus',
                    'wuhan coronavirus',
                    r'coronavirus 2\b',
                    'risk factor analysis',
                    'cross sectional case control',
                    'prospective case control',
                    'matched case control',
                    'medical records review',
                    'seroprevalence survey',
                    'syndromic surveillance',
                    'corona', r'\bcov\b']

risk_factor_synonyms = ['risk factor',
                        'risk model',
                        'risk by',
                        'comorbidity',
                        'comorbidities',
                        'coexisting condition',
                        'co existing condition',
                        'clinical characteristics',
                        'clinical features',
                        'demographic characteristics',
                        'demographic features',
                        'behavioural characteristics',
                        'behavioural features',
                        'behavioral characteristics',
                        'behavioral features',
                        'predictive model',
                        'prediction model',
                        'univariate', # implies analysis of risk factors
                        'multivariate', # implies analysis of risk factors
                        'multivariable',
                        'univariable',
                        'odds ratio', # typically mentioned in model report
                        'confidence interval', # typically mentioned in model report
                        'logistic regression',
                        'regression model',
                        'factors predict',
                        'factors which predict',
                        'factors that predict',
                        'factors associated with',
                        'underlying disease',
                        'underlying condition']

# 'sex' and 'gender' are unanchored substrings; the remaining terms are
# word-boundary anchored so that e.g. 'men' does not match inside 'women'.
sex_synonyms = ['sex',
                'gender',
                r'\bmale\b',
                r'\bfemale\b',
                r'\bmales\b',
                r'\bfemales\b',
                r'\bmen\b',
                r'\bwomen\b'
               ]

bodyweight_synonyms = [
    'overweight',
    'over weight',
    'obese',
    'obesity',
    'bodyweight',
    'body weight',
    r'\bbmi\b',
    'body mass',
    'body fat',
    'bodyfat',
    'kilograms',
    r'\bkg\b', # e.g. 70 kg
    r'\dkg\b'  # e.g. 70kg
]

diabetes_synonyms = [
    'diabet', # picks up diabetes, diabetic, etc.
    'insulin', # any paper mentioning insulin likely to be relevant
    'blood sugar',
    'blood glucose',
    'ketoacidosis',
    'hyperglycemi', # picks up hyperglycemia and hyperglycemic
]

chronicresp_synonyms = [
    'chronic respiratory disease',
    'asthma',
    'chronic obstructive pulmonary disease',
    r'\bcopd',
    'chronic bronchitis',
    'emphysema'
]

immunity_synonyms = [
    'immunity',
    r'\bvaccin',
    # "inoculat…" is the correct spelling; the optional second "n" still catches
    # the common misspelling "innoculat…" that the original entry targeted.
    r'inn?oculat'
]
asthma_synonyms = [
    'asthma',
    # The original r'risk\basthma' could never match: \b is a zero-width word
    # boundary and cannot occur between the word characters "k" and "a".
    # Match "risk" and "asthma" separated by one or more non-word characters.
    r'risk\W+asthma'
]

# Climate / weather related search terms (substring matches).
climate_synonyms = [
    'climate',
    'weather',
    'humid',
    'sunlight',
    'air temperature',
    'meteorolog', # picks up meteorology, meteorological, meteorologist
    'climatolog', # as above
    'dry environment',
    'damp environment',
    'moist environment',
    'wet environment',
    'hot environment',
    'cold environment',
    'cool environment'
]


# Topic name -> list of search terms. The COVID-19 terms are registered under the
# key "target" rather than a topic name.
synonym_dict = {
    "smoking": smoking_synonyms,
    "age": age_synonyms,
    "risk_factor": risk_factor_synonyms,
    "sex": sex_synonyms,
    "bodyweight": bodyweight_synonyms,
    "diabetes": diabetes_synonyms,
    "chronic_respiratory": chronicresp_synonyms,
    "immunity": immunity_synonyms,
    "asthma": asthma_synonyms,
    "climate": climate_synonyms,
    "target": covid19_synonyms
}


# Crawl and Scrape Data from the University of Maryland COVID-19 data site (data.covid.umd.edu)

In [9]:
import requests
import bs4

In [10]:
URL_COUNTIES ="https://data.covid.umd.edu/fips-counties.csv"

# Download the county FIPS table.
# timeout: without one, requests can wait forever on an unresponsive server.
# raise_for_status: fail loudly on an HTTP error instead of parsing an error page as data.
counties_http = requests.get(URL_COUNTIES, timeout=30)
counties_http.raise_for_status()
response = counties_http.text

# The body is plain CSV text; it is run through the lxml HTML parser and the first
# <p> element is taken, whose .string is then split into rows by the next cell.
web_page = bs4.BeautifulSoup(response, "lxml")
sub_page = web_page.body.find_all("p")
sub_page = sub_page[0]

## Build DataFrame

In [11]:
# Column names of the county FIPS table, in file order.
county_columns = [
    'fips', 'county_name', 'state_abbr', 'state_name', 'long_name',
    'sumlev', 'region', 'division', 'state', 'county',
    'crosswalk', 'region_name', 'division_name',
]

# One list of comma-separated fields per CRLF-terminated line of the downloaded text.
sub_string = sub_page.string
county_rows = [line.split(',') for line in sub_string.split('\r\n')]

# Row 0 is the file's own header line, so it is dropped after the frame is built.
df_md_states = pd.DataFrame(data=county_rows, columns=county_columns).iloc[1:]
df_md_states.head()

Unnamed: 0,fips,county_name,state_abbr,state_name,long_name,sumlev,region,division,state,county,crosswalk,region_name,division_name
1,1001,Autauga County,AL,Alabama,Autauga County AL,50,3,6,1,1,3-6-1-1,South,East South Central
2,1003,Baldwin County,AL,Alabama,Baldwin County AL,50,3,6,1,3,3-6-1-3,South,East South Central
3,1005,Barbour County,AL,Alabama,Barbour County AL,50,3,6,1,5,3-6-1-5,South,East South Central
4,1007,Bibb County,AL,Alabama,Bibb County AL,50,3,6,1,7,3-6-1-7,South,East South Central
5,1009,Blount County,AL,Alabama,Blount County AL,50,3,6,1,9,3-6-1-9,South,East South Central


In [12]:
# Column overview of the county table; every column is still an object (string) dtype here.
df_md_states.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3146 entries, 1 to 3146
Data columns (total 13 columns):
fips             3146 non-null object
county_name      3146 non-null object
state_abbr       3146 non-null object
state_name       3146 non-null object
long_name        3146 non-null object
sumlev           3146 non-null object
region           3146 non-null object
division         3146 non-null object
state            3146 non-null object
county           3146 non-null object
crosswalk        3146 non-null object
region_name      3146 non-null object
division_name    3146 non-null object
dtypes: object(13)
memory usage: 319.6+ KB


In [14]:
# Download the national time series.
# NOTE(review): the original comment here said this cell was "commented because this
# dataframe is pickled", but the code is live and performs the download on every run.

URL_NATIONAL ="https://data.covid.umd.edu/data/National.csv"
# timeout: without one, requests can wait forever on an unresponsive server.
# raise_for_status: fail loudly on an HTTP error instead of parsing an error page as data.
national_http = requests.get(URL_NATIONAL, timeout=30)
national_http.raise_for_status()
response_national = national_http.text

# The body is plain CSV text; it is run through the lxml HTML parser and the first
# <p> element is taken (its content starts with the CSV header line).
web_page_national = bs4.BeautifulSoup(response_national, "lxml")
sub_page_national = web_page_national.body.find_all("p")
sub_page_national = sub_page_national[0]
sub_page_national

<p>Name,Social distancing index,% staying home,Trips/person,% out-of-county trips,% out-of-state trips,Miles/person,Work trips/person,Non-work trips/person,New COVID cases,Population,% change in consumption,date,Transit mode share,% people older than 60,Median income,% African Americans,% Hispanic Americans,% Male,Population density,Employment density,# hot spots/1000 people,Hospital beds/1000 people,ICUs/1000 people,# contact tracing workers/1000 people,COVID exposure/1000 people,#days: decreasing ILI cases,Unemployment claims/1000 people,Unemployment rate,% working from home,Cumulative inflation rate,COVID death rate,New cases/1000 people,Active cases/1000 people,#days: decreasing COVID cases,% hospital bed utilization,Testing capacity,Tests done/1000 people,% ICU utilization,Ventilator shortage,Imported COVID cases
USA,53,31,2.48,27.5,4.9,30.2,0.24,2.23,0,327167434,-21.6,01/01/2020,4.83,21,62940,12.3,17.9,49.24,86,40,133,2.86,0.28,0.023,0.0,0.0,1.0,3.6,4.9,0.4,0.0,0.0,0.0,0,51.66913

In [17]:
# Superseded: earlier 11-column version of the national frame, kept for reference only.
# sub_string_national = sub_page_national.string
# df_md_national = pd.DataFrame(data=[x.split(',') for x in sub_string_national.split('\n')],
#                             columns=['name','social_distancing_index','%_staying_home','#trips/person','%_out-of-county_trips','miles_traveled/person','#work_trips/person','#non-work_trips/person','covid_case_count','population','date'])
# df_md_national = df_md_national[1:]
# df_md_national.head()

In [16]:
# pickled

# Column names for the national table, in file order: the header names of the
# downloaded CSV with spaces replaced by underscores.
national_columns = [
    'Name',
    'Social_distancing_index',
    '%_staying_home',
    'Trips/person',
    '%_out-of-county_trips',
    '%_out-of-state_trips',
    'Miles/person',
    'Work_trips/person',
    'Non-work_trips/person',
    'New_COVID_cases',
    'Population',
    '%_change_in_consumption',
    'date',
    'Transit_mode_share',
    '%_people_older_than_60',
    'Median_income',
    '%_African_Americans',
    '%_Hispanic_Americans',
    '%_Male',
    'Population_density',
    'Employment_density',
    '#_hot_spots/1000_people',
    'Hospital_beds/1000_people',
    'ICUs/1000_people',
    '#_contact_tracing_workers/1000_people',
    'COVID_exposure/1000_people',
    '#days:_decreasing_ILI_cases',
    'Unemployment_claims/1000_people',
    'Unemployment_rate',
    '%_working_from_home',
    'Cumulative_inflation_rate',
    'COVID_death_rate',
    'New_cases/1000_people',
    'Active_cases/1000_people',
    '#days:_decreasing_COVID_cases',
    '%_hospital_bed_utilization',
    'Testing_capacity',
    'Tests_done/1000_people',
    '%_ICU_utilization',
    'Ventilator_shortage',
    'Imported_COVID_cases',
]

# One list of comma-separated fields per newline-terminated line of the downloaded text.
sub_string_national = sub_page_national.string
national_rows = [line.split(',') for line in sub_string_national.split('\n')]

# Row 0 is the file's own header line, so it is dropped after the frame is built.
df_md_national = pd.DataFrame(data=national_rows, columns=national_columns).iloc[1:]
df_md_national.head()

Unnamed: 0,Name,Social_distancing_index,%_staying_home,Trips/person,%_out-of-county_trips,%_out-of-state_trips,Miles/person,Work_trips/person,Non-work_trips/person,New_COVID_cases,...,COVID_death_rate,New_cases/1000_people,Active_cases/1000_people,#days:_decreasing_COVID_cases,%_hospital_bed_utilization,Testing_capacity,Tests_done/1000_people,%_ICU_utilization,Ventilator_shortage,Imported_COVID_cases
1,USA,53,31,2.48,27.5,4.9,30.2,0.24,2.23,0,...,0.0,0.0,0.0,0,51.66913535349564,0.0,0.0,0.0,12,0
2,USA,20,20,3.34,29.2,4.7,38.8,0.59,2.75,0,...,0.0,0.0,0.0,0,51.66913535349564,0.0,0.0,0.0,12,0
3,USA,16,19,3.51,29.4,4.7,40.8,0.59,2.93,0,...,0.0,0.0,0.0,0,51.66913535349564,0.0,0.0,0.0,2,0
4,USA,32,24,3.13,28.9,4.8,37.0,0.31,2.82,0,...,0.0,0.0,0.0,0,51.66913535349564,0.0,0.0,0.0,2,0
5,USA,45,28,2.72,27.7,4.9,33.8,0.26,2.46,0,...,0.0,0.0,0.0,0,51.66913535349564,0.0,0.0,0.0,2,0


In [18]:
# Pickle df_md_national.
# write_pickles (from project_template_code) receives the variable *name* as a string —
# presumably it looks the object up itself; confirm in that module.

write_pickles('df_md_national')

In [None]:
# TBD 

# !jupyter nbconvert --to html COVID19Assignment.ipynb
# TODO: handle duplicates, etc.; look at the description.

# Approach. think about classifiers.
# df_md_states['fips'] = df_md_states['fips'].astype(int)
# df_md_states['county_name'] = df_md_states['county_name'].astype('string')
# df_md_states['state_abbr'] = df_md_states['state_abbr'].astype('string')
# df_md_states['state_name'] = df_md_states['state_name'].astype('string')
# df_md_states['long_name'] = df_md_states['long_name'].astype('string')
# df_md_states = df_md_states[df_md_states['sumlev'] != 'NA']
# df_md_states['sumlev'] = df_md_states['sumlev'].astype(int)
# df_md_states['region'] = df_md_states['region'].astype(int)
# df_md_states['division'] = df_md_states['division'].astype(int)
# df_md_states['state'] = df_md_states['state'].astype(int)
# df_md_states['county'] = df_md_states['county'].astype(int)
# df_md_states['crosswalk'] = df_md_states['crosswalk'].astype('string')
# df_md_states['region_name'] = df_md_states['region_name'].astype('string')
# df_md_states['division_name'] = df_md_states['division_name'].astype('string')