### Initialize Script

If you're running this script on Google Colab,<br>
mount your Google Drive:
1. Click on the folder icon on the left
2. Click Mount Drive
3. The root directory would be /content/
```
# your Google Drive folder would be at:
/content/drive/My Drive/
```

Change working directory:<br>
1. Run this command:
```
%cd /content/drive/My Drive/<your folder>
```

In [None]:
# Switch to the project folder on the mounted Drive so that the relative
# paths used in later cells (./raw_data, ./processed_data, src.*) resolve.
%cd /content/drive/My Drive/Data Science/Covid-19

### Download optional files (required for the NLTK features listed below)

Download nltk stopwords to use Stopwords:
```
import nltk
nltk.download('stopwords')
```
Download nltk wordnet to use WordNetLemmatizer:
```
import nltk
nltk.download('wordnet')
```
Download nltk punkt to use Punkt Sentence Tokenizer:
```
import nltk
nltk.download('punkt')
```

### Read all papers
Each paper is in JSON format

In [None]:
%%time
from src.covid_19_tp import authors_name, body_text, format_bib

from os import walk as dir_list
from tqdm import tqdm
import json

folder = 'raw_data'


def _load_paper(path):
    """Parse one paper's JSON file and close the handle once it is read.

    The previous `json.load(open(...))` form never closed its file objects
    explicitly and left that to the garbage collector.
    """
    # JSON text is UTF-8 by specification, so do not depend on the locale's
    # default encoding.
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# Walk `folder` recursively and flatten every paper into one plain dict.
# The result, `data`, is a list with one record per JSON file found.
data = []
for subdir, _dirs, files in dir_list(f'./{folder}'):
    # First pass: parse every file that sits directly in this directory.
    papers = [
        _load_paper(f'{subdir}/{name}')
        for name in tqdm(files, desc=f'Loading all files in {subdir}')
    ]

    # Second pass: pull the fields of interest out of each parsed paper.
    # authors_name / body_text / format_bib are project helpers imported
    # from src.covid_19_tp; their output format is not visible here.
    for paper in tqdm(papers, desc=f'Reading individual files in {subdir}'):
        data.append({
            'paper_id': paper['paper_id'],
            'title': paper['metadata']['title'],
            'authors': authors_name(
                paper['metadata']['authors'], affiliation=True
            ),

            'abstract': body_text(paper['abstract']),
            'text': body_text(paper['body_text']),

            'bibliography': format_bib(paper['bib_entries']),
        })

### Create DataFrame with dataset

In [None]:
import pandas as pd
# Convert the list of per-paper dicts into a DataFrame (one row per dict).
# Note that `data` is rebound from the list to the DataFrame here.
data = pd.DataFrame(data)

In [None]:
# Preview the first rows to sanity-check the parsed columns.
data.head()

### Load Corpus from Pickle

In [None]:
import pickle
# Location of the pickled corpus: ./<folder>/<filename>.pkl
folder = 'processed_data'
filename = 'raw_data_comm_use_subset_corpus'
corpus_path = f'./{folder}/{filename}.pkl'

# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that you produced yourself or otherwise trust.
with open(corpus_path, 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

### Conduct TF-IDF

In [None]:
%%time
from src.tf_idf import corpus_tf_idf

# Run TF-IDF over the loaded corpus. The helper (imported from src.tf_idf,
# implementation not shown here) returns two objects: the per-document
# scores and the per-term document frequencies.
corpus_doc_tf_idf, term_doc_freq = corpus_tf_idf(corpus)

In [None]:
'''
    corpus_doc_tf_idf: list of tf-idf scores (term: score) of each document
    score:
        Low = frequent terms
        High = rare terms

    The slice below shows the entries of the first two documents only.
'''
corpus_doc_tf_idf[:2]

In [None]:
'''
    term_doc_freq: a dict (key: value pairs) of a term and its count of
    occurrences in different documents
'''
term_doc_freq

### Load NLP functions

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from src.text_preprocessing import nltk_NLP, spacy_NLP

# Tokenizer from the project's spaCy wrapper, built for 'en_core_web_sm'.
spacy_tokenizer = spacy_NLP('en_core_web_sm').tokenize_API()

# Tokenizer from the project's NLTK wrapper with its default settings.
nlp_tokenizer = nltk_NLP().tokenize_API()

# NLTK wrapper configured with Porter stemming and WordNet lemmatisation.
# The classes themselves (not instances) are handed to the wrapper.
custom_nltk = nltk_NLP(
    stemming=PorterStemmer,
    lemmatisation=WordNetLemmatizer,
)
nlp_custom_tokenizer = custom_nltk.custom_API()

### Prepare search function

In [None]:
from src.covid_19_tf_idf import search_relevant_articles_tf_idf
from src.text_preprocessing import spacy_NLP, STOP_WORDS, text_preprocess

def _preprocess_query(text):
    """Preprocess a query with the spaCy tokenizer and the stop-word list."""
    return text_preprocess(
        text, tokenizer=spacy_tokenizer, stopwords=STOP_WORDS
    )


def tf_idf_search(query, n_articles=10):
    """Search the loaded papers for the articles most relevant to `query`.

    Thin wrapper around `search_relevant_articles_tf_idf` that binds the
    notebook's dataset and TF-IDF tables. `data`, `corpus_doc_tf_idf`,
    `term_doc_freq` and `spacy_tokenizer` are looked up when the function
    is called, so re-running the earlier cells is picked up automatically.

    Args:
        query: Free-text question to search for.
        n_articles: Forwarded as `n_articles` to the search helper. Defaults
            to 10, the value this notebook previously hard-coded.

    Returns:
        Whatever `search_relevant_articles_tf_idf` returns (the helper is
        defined in src.covid_19_tf_idf and is not shown here).
    """
    return search_relevant_articles_tf_idf(
        query=query,
        n_articles=n_articles,
        data_df=data,
        corpus_doc_tf_idf=corpus_doc_tf_idf,
        term_doc_freq=term_doc_freq,
        query_preprocess_func=_preprocess_query,
    )

### Search Relevant Articles based on question

In [None]:
# Questions fed verbatim to the TF-IDF search below. Spelling matters here:
# "aerisol" and "tramsmission" were corrected because a misspelled query
# term is unlikely to match the intended term in the corpus.
question_list = [
    "Is the virus transmitted by aerosol, droplets, food, close contact, fecal matter, or water",
    "How long is the incubation period for the virus",
    "Can the virus be transmitted asymptomatically or during the incubation period",
    "What is the quantity of asymptomatic shedding",
    "How does temperature and humidity affect the transmission of 2019-nCoV",
    "How long can 2019-nCoV remain viable on inanimate, environmental, or common surfaces",
    "What types of inanimate or environmental surfaces affect transmission, survival, or inactivation of 2019-nCov",
    "Can the virus be found in nasal discharge, sputum, urine, fecal matter, or blood",
    "What risk factors contribute to the severity of 2019-nCoV",
    "How does hypertension affect patients",
]

from IPython.display import display
from tqdm import tqdm
import numpy as np

# Run every question through the TF-IDF search and show its results.
for question in question_list:
    # Echo the question so each displayed result is labelled.
    print(question)
    
    result_df = tf_idf_search(question)
    display(result_df)