### Initialize Script
If you're running this script on Google Colab<br>
Mount your Google drive: 
1. Click on the folder icon on the left
2. Click Mount Drive
3. The root directory would be /content/
```
# your Google Drive folder would be at:
/content/drive/My Drive/
```

Change working directory:<br>
1. Run this command:
```
%cd /content/drive/My Drive/<your folder>
```

In [None]:
# Attach Google Drive to the Colab runtime; the drive contents then appear
# under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Switch to the project folder on the mounted drive so the relative paths used
# by the cells below (src/, raw_data/, processed_data/) resolve.
%cd /content/drive/My Drive/Data Science/Covid-19

## Load NLP functions

In [None]:
from src.text_preprocessing import STOP_WORDS, spacy_NLP, text_preprocess

# Tokenizer obtained from the project's spaCy wrapper, configured with the
# 'en_core_web_sm' model name.
spacy_tokenizer = spacy_NLP('en_core_web_sm').tokenize_API()

# Alternative NLTK-based tokenizer, kept for reference:
# from src.text_preprocessing import nltk_NLP
# from nltk.stem.porter import PorterStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
# nlp_tokenizer = nltk_NLP().tokenize_API()
# nlp_tokenizer = nltk_NLP(stemming=PorterStemmer, lemmatisation=WordNetLemmatizer).tokenize_API()


def text_prep(text):
    """Run `text_preprocess` on *text* with the spaCy tokenizer and STOP_WORDS."""
    return text_preprocess(text, tokenizer=spacy_tokenizer, stopwords=STOP_WORDS)

## Prepare and Pre-process Dataset

### Read all
Each paper is stored as a JSON file

In [None]:
%%time
from src.covid_19_tp import authors_name, body_text, format_bib

from os import walk as dir_list
from tqdm import tqdm
import json

folder = 'raw_data/comm_use_subset'


def _load_json(path):
    """Parse the JSON file at *path*, closing the handle once it is read."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


def _paper_record(paper):
    """Flatten one parsed paper dict into the row layout of the DataFrame."""
    return {
        'paper_id': paper['paper_id'],
        'title': paper['metadata']['title'],
        'authors': authors_name(paper['metadata']['authors'], affiliation=True),

        'abstract': body_text(paper['abstract']),
        'text': body_text(paper['body_text']),

        'bibliography': format_bib(paper['bib_entries']),
    }


def _read_papers(root):
    """Return one record per file found in *root* and all its sub-directories.

    Every file encountered by the directory walk is parsed as JSON; two
    progress bars are shown per directory (loading, then record building).
    """
    records = []
    for subdir, _dirs, files in dir_list(root):
        papers = [
            _load_json(f'{subdir}/{name}')
            for name in tqdm(files, desc=f'Loading all files in {subdir}')
        ]
        records.extend(
            _paper_record(paper)
            for paper in tqdm(papers, desc=f'Reading individual files in {subdir}')
        )
    return records


data = _read_papers(f'./{folder}')

import pandas as pd
data = pd.DataFrame(data)

# Save dataset to pickle for faster loading in the future.
# The file name is derived from the source folder ('/' replaced by '_'); the
# original referenced an undefined `src_folder` here, which raised NameError.
from src.helper import pickle_dump
filename = '_'.join(folder.split('/'))
des_folder = 'processed_data'
pickle_dump(f'{des_folder}/{filename}_df.pkl', data)

### Create Corpus from dataset
Save the corpus as a pickle file so that it can be loaded directly in the future instead of being rebuilt

In [None]:
from src.text_preprocessing import STOP_WORDS, text_preprocess
from tqdm import tqdm

# One document per paper: title, abstract and body text joined by spaces.
documents = data['title'] + ' ' + data['abstract'] + ' ' + data['text']
corpus = [
    text_preprocess(text, tokenizer=spacy_tokenizer, stopwords=STOP_WORDS)
    for text in tqdm(list(documents))
]

from src.helper import pickle_dump
# `folder` is the raw-data folder set by the reading cell; the output file name
# is derived from it. The destination gets its own name so that `folder` is not
# overwritten and re-running this cell produces the same file name.
filename = '_'.join(folder.split('/'))
des_folder = 'processed_data'
pickle_dump(f'./{des_folder}/{filename}_corpus.pkl', corpus)

### Conduct TF-IDF
corpus_doc_tfidf: list of TF-IDF scores (term: score) for each document<br>
score:<br>
> Low = frequent terms<br>
> High = rare terms<br>

```
tfidf.corpus_doc_tfidf[:1]
```

term_doc_freq: a dict (key: value pairs) mapping each term to the number of documents it occurs in<br>

```
tfidf.term_doc_freq
```

In [None]:
%%time
# Fit the project's TFIDF model on the pre-processed corpus.
from src.tf_idf import TFIDF
tfidf = TFIDF()
tfidf.tfidf_corpus(corpus)

# Persist the fitted object so later sessions can load it instead of refitting.
# The destination uses `des_folder` (as the reading cell does) so the shared
# `folder` variable, which names the raw-data folder, is left untouched.
from src.helper import pickle_dump
des_folder = 'processed_data'
filename = 'corpus_custom_tfidf'
pickle_dump(f'./{des_folder}/{filename}.pkl', tfidf)

### Get keywords for each document

In [None]:
import pandas as pd
from tqdm import tqdm

# Enables Series.progress_apply (apply with a progress bar).
tqdm.pandas()

# Plain column assignment appends 'keywords' as the last column on the first
# run and overwrites it on later runs. The former reindex with
# `columns + ['keywords']` was redundant and would request the label twice
# when the column already existed.
data['keywords'] = pd.Series(corpus).progress_apply(tfidf.get_text_keywords)

# Persist the DataFrame including the keywords column. `des_folder` is used for
# the destination so the shared `folder` variable is left untouched.
from src.helper import pickle_dump
des_folder = 'processed_data'
filename = 'raw_data_comm_use_subset_df_keyword'
pickle_dump(f'./{des_folder}/{filename}.pkl', data)

## Load all pre-saved objects

In [None]:
# Reload the objects saved by the cells above so the expensive preprocessing
# steps can be skipped in a fresh session.
# NOTE(review): pickle_load presumably unpickles the file - only load files you
# produced yourself.
from src.helper import pickle_load
# DataFrame without keywords; disabled because the keyword version loaded at
# the end of this cell is assigned to the same `data` name.
# filepath = 'processed_data/raw_data_comm_use_subset_df.pkl'
# data = pickle_load(filepath)

# Pre-processed corpus.
filepath = 'processed_data/raw_data_comm_use_subset_corpus.pkl'
corpus = pickle_load(filepath)

# Fitted TFIDF object.
filepath = 'processed_data/corpus_custom_tfidf.pkl'
tfidf = pickle_load(filepath)

# DataFrame including the 'keywords' column.
filepath = 'processed_data/raw_data_comm_use_subset_df_keyword.pkl'
data = pickle_load(filepath)

In [None]:
# Preview the first rows of the loaded data as a quick sanity check.
data.head()

### Search Relevant Articles based on question

In [None]:
# Free-text questions used as search queries against the paper collection.
# Spelling matters: a misspelled term cannot match the correctly spelled term
# in the corpus ("aerisol" and "tramsmission" were corrected).
question_list = [
    "Is the virus transmitted by aerosol, droplets, food, close contact, fecal matter, or water",
    "How long is the incubation period for the virus",
    "Can the virus be transmitted asymptomatically or during the incubation period",
    "What is the quantity of asymptomatic shedding",
    "How does temperature and humidity affect the transmission of 2019-nCoV",
    "How long can 2019-nCoV remain viable on inanimate, environmental, or common surfaces",
    "What types of inanimate or environmental surfaces affect transmission, survival, or inactivation of 2019-nCov",
    "Can the virus be found in nasal discharge, sputum, urine, fecal matter, or blood",
    "What risk factors contribute to the severity of 2019-nCoV",
    "How does hypertension affect patients",
]

from src.covid_19_BM25 import BM25
from IPython.display import display
import pandas as pd

# Prepare BM25 ranking.
# data_df: a pandas DataFrame with a keywords column generated using TFIDF.
bm25 = BM25(data_df=data)

# Columns shown for each TFIDF hit.
result_columns = ['title', 'abstract', 'text', 'keywords']
# Per-column values handed to the BM25 search (presumably field weights -
# confirm against src.covid_19_BM25).
field_weights = {'title': 1, 'abstract': 0.5, 'text': 2}

for question in question_list:
    print(question)
    # Apply the same pre-processing to the query as was applied to the corpus.
    query = text_prep(question)

    # Based on TFIDF and Dot Product of TFIDF scores; keep the first 10
    # (row position, score) pairs returned.
    top_hits = tfidf.search_similar(query_tokens=query)[:10]

    # Select the hit rows by position in one step. This keeps the column
    # dtypes and yields an empty frame (instead of raising) when there are
    # no hits.
    row_positions = [idx for idx, _ in top_hits]
    result_df = data.iloc[row_positions][result_columns].reset_index(drop=True)
    display(result_df)

    # Based on TFIDF, TFIDF generated keywords
    result_bm25 = bm25.search_similar(query.split(), field_weights)
    display(result_bm25)

    # Only the first question is processed; remove this `break` to run the
    # whole list.
    break