### How long should people infected with the virus isolate for? Possible steps to answer this are:
- Find all terms that describe isolation (incubation, quarantine, isolation, infectious, transmissible, spreading, transferable, contagious).
- Look for all numbers used near these words (in the form 'XX days') that describe the recommended isolation period.
- Put all numbers into a list and average them (the median is reported below, since it is less sensitive to outliers than the mean) to determine the recommended isolation period.
- Spot check a few examples to check it's working as expected.

In [52]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
stopwords=set(nltk.corpus.stopwords.words('english'))
from collections import defaultdict

In [2]:
# Load only the first 10,000 rows of the cleaned PMC corpus to keep run time manageable.
data = pd.read_csv(filepath_or_buffer="Data/clean_pmc.csv", nrows=10_000)

In [3]:
# Keep only rows whose text contains at least one ASCII letter.
# re.search is used instead of re.match('.*[a-zA-Z]+', ...): '.' does not cross
# newlines, so the old pattern only inspected the first line of each text and
# dropped multi-line documents whose first line had no letters. Non-string
# cells (e.g. NaN) are filtered out instead of raising TypeError.
data_cleaned = data[data['text'].apply(lambda x: isinstance(x, str) and bool(re.search('[a-zA-Z]', x)))]
lang = pd.read_csv("Data/predicted_lang_10k.csv", index_col=0).squeeze()
# Align the language predictions to the filtered rows by index label explicitly;
# rows without a prediction compare as not-English rather than breaking the mask.
is_english = lang.reindex(data_cleaned.index).eq('en')
data_eng = data_cleaned[is_english]
# reset index for use in pd.iterrows(); the previous index is kept as an
# 'index' column, which shifts every positional column lookup by one.
data_eng = data_eng.reset_index()

In [4]:
def extract_days(tokenized_sent, anchor_word='days', days_regex="[0-9.]+"):
    """Return the mean of the numbers quoted just before ``anchor_word``.

    The two tokens preceding an occurrence of ``anchor_word`` are inspected.
    Every fragment of those tokens matching ``days_regex`` is collected and the
    fragments are averaged, so a range collapses to its midpoint
    (``['2-6', 'days']`` and ``['2', '6', 'days']`` both give 4.0).

    Occurrences of the anchor are tried left to right and the first one that
    yields at least one usable number wins.

    Parameters
    ----------
    tokenized_sent : list of str
        Tokens of a single sentence.
    anchor_word : str
        Token that marks the unit (compared by exact equality).
    days_regex : str
        Pattern a token must start with, and that extracts its numeric pieces.

    Returns
    -------
    numpy.float32 or None
        Mean of the extracted numbers, or None when the anchor is absent or no
        number precedes it.
    """
    # Consider every occurrence of the anchor, not just the first one, so that
    # "days later ... isolated 14 days" still yields 14.
    anchor_positions = [pos for pos, token in enumerate(tokenized_sent) if token == anchor_word]

    for anchor_pos in anchor_positions:
        candidates = []
        for i in (anchor_pos - 2, anchor_pos - 1):
            # i >= 0 guards against wrapping round to the end of the sentence
            if i >= 0 and re.match(days_regex, tokenized_sent[i]):
                candidates.extend(re.findall(days_regex, tokenized_sent[i]))

        if not candidates:
            continue

        # The default pattern also matches fragments such as '.' or '1.2.3';
        # drop those individually instead of discarding the valid numbers too.
        days = []
        for candidate in candidates:
            try:
                days.append(float(candidate))
            except (TypeError, ValueError):
                continue

        if days:
            return np.asarray(days, dtype=np.float32).mean()

        print(f"Days list {candidates} can't be converted to numpy array. Extracted from {tokenized_sent}")

    return None

In [63]:
def recommended_isolation(data, keywords, anchor_word='days', subset=None):
    """Collect the durations quoted in sentences that mention a keyword.

    Each row's ``'text'`` is split into sentences and then words. Words are
    lower-cased and stopwords removed. For every sentence containing at least
    one of ``keywords`` and the ``anchor_word``, ``extract_days`` is asked for
    the number preceding the anchor.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'text'`` column.
    keywords : iterable of str
        Terms to look for; compared against lower-cased tokens.
    anchor_word : str
        Unit word that must follow the number (compared against lower-cased,
        stopword-filtered tokens).
    subset : int or None
        If given, only the first ``subset`` rows are scanned.

    Returns
    -------
    isolation : list
        One extracted value per matching sentence.
    index : list
        Row label of ``data`` for each entry of ``isolation``.
    keyword_counter : collections.defaultdict
        Number of matching sentences in which each keyword appeared.
    """
    # take subset of data to avoid long run time if requested by user
    if subset is not None:
        data = data[:subset]

    isolation = []
    index = []
    keyword_counter = defaultdict(int)

    for indx, row in data.iterrows():
        for sent in sent_tokenize(row['text']):
            sent_clean = [word.lower() for word in word_tokenize(sent) if word.lower() not in stopwords]
            # set for cheap membership tests; sent_clean keeps the token order
            sent_vocab = set(sent_clean)

            keywords_present = [word for word in keywords if word in sent_vocab]
            if not keywords_present or anchor_word not in sent_vocab:
                continue

            # Forward the caller's anchor word: previously the default 'days'
            # was always used, so any other anchor was silently ignored (or
            # raised ValueError when 'days' was missing from the sentence).
            days_from_sent = extract_days(sent_clean, anchor_word=anchor_word)
            if days_from_sent is None:
                continue

            isolation.append(days_from_sent)
            index.append(indx)
            for keyword_present in keywords_present:
                keyword_counter[keyword_present] += 1

    return isolation, index, keyword_counter

Most similar words to "quarantine" from word2vec

- 'quarantining': 0.7624921798706055
- 'quarantined': 0.7381011843681335
- 'lockdown': 0.6942132115364075
- 'compulsory': 0.693280816078186
- 'workplace': 0.6875447034835815
- 'visitors': 0.6798210740089417
- 'voluntary': 0.6777964234352112
- 'containment': 0.6706152558326721
- 'tracing': 0.6631444692611694
- 'restrictions': 0.6465995907783508

In [124]:
# Search terms: 'quarantine' plus the five closest terms from the word2vec list above.
keywords = [
    'quarantine',
    'quarantining',
    'quarantined',
    'lockdown',
    'compulsory',
    'workplace',
]
# Only the first 1000 English documents are scanned to keep the run short.
# The result is the tuple (isolation values, row labels, keyword counts).
days_to_isolate = recommended_isolation(data_eng, keywords, subset=1000)

In [125]:
# Third element of the result: for each keyword, the number of sentences that
# contained it and also yielded a day count.
days_to_isolate[2]

defaultdict(int, {'quarantine': 7, 'quarantining': 2, 'quarantined': 5})

In [126]:
# Summarise the extracted day counts with the median.
# NOTE(review): the second figure is the number of extracted day values (one
# per matching sentence), not the number of distinct keywords; consider
# relabelling it.
a = np.array(days_to_isolate[0], dtype=np.float32)
print(
    f'Median recommended isolation: {np.median(a)} days',
    f'Number of keywords found: {len(days_to_isolate[0])}',
    sep='\n',
)

Median recommended isolation: 14.0 days
Number of keywords found: 13


In [127]:
# Show the raw day values and, on the next line, the row label each one came from.
print(*days_to_isolate[:2], sep='\n')

[100.0, 100.0, 14.0, 4.6, 14.0, 30.0, 5.0, 5.0, 14.0, 14.0, 30.0, 14.0, 3.0]
[10, 10, 22, 22, 22, 89, 426, 426, 657, 657, 728, 876, 968]


In [128]:
# Spot check: print one cell of a single document, addressed by position.
# NOTE(review): column position 6 is counted after reset_index() inserted the
# old index as the first column - presumably this is the 'text' column; confirm.
# NOTE(review): row 1270 lies outside the first 1000 rows scanned above and is
# not among the row labels printed earlier - confirm this is the intended row.
print(data_eng.iloc[1270, 6])

In [6]:
# Dependencies for the word2vec section below: Word2Vec trains the embedding
# and tqdm wraps the corpus-building loop with a progress bar.
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# first import cell so a fresh Restart & Run All resolves them up front.
import gensim
from gensim.models import Word2Vec
from tqdm import tqdm
# NOTE(review): enables tqdm's pandas integration; no progress_apply call is
# visible in this part of the notebook - confirm it is still needed.
tqdm.pandas()

  from pandas import Panel


In [26]:
# Build the word2vec training corpus: one list of lower-cased, stopword-free
# tokens per document, taken from the first 5000 English documents.
corpus_rows = data_eng[:5000]
corpus = []
for row_label, record in tqdm(corpus_rows.iterrows(), total=corpus_rows.shape[0]):
    lowered_tokens = (token.lower() for token in word_tokenize(record['text']))
    corpus.append([token for token in lowered_tokens if token not in stopwords])

100%|██████████| 5000/5000 [09:42<00:00,  8.58it/s]  


In [27]:
# Train a word2vec embedding on the tokenised corpus using the library defaults.
# NOTE(review): no seed or worker count is pinned here, so the similarity
# scores below may not reproduce exactly between runs - confirm and pin if needed.
model = Word2Vec(sentences=corpus)

In [108]:
# Nearest neighbours of 'quarantine' in the trained embedding; the top entries
# are the source of the keyword list used for the isolation search.
model.wv.most_similar('quarantine')

[('quarantining', 0.7624921798706055),
 ('quarantined', 0.7381011843681335),
 ('lockdown', 0.6942132115364075),
 ('compulsory', 0.693280816078186),
 ('workplace', 0.6875447034835815),
 ('visitors', 0.6798210740089417),
 ('voluntary', 0.6777964234352112),
 ('containment', 0.6706152558326721),
 ('tracing', 0.6631444692611694),
 ('restrictions', 0.6465995907783508)]