### Initialize Script

If you're running this script on Google Colab<br>
Mount your Google drive: 
1. Click on the folder icon on the left
2. Click Mount Drive
3. The root directory would be /content/
```
# your Google Drive folder would be at:
/content/drive/My Drive/
```

Change working directory:<br>
1. Run this command:
```
%cd /content/drive/My Drive/<your folder>
```

In [None]:
# Switch the notebook's working directory to the project folder on the mounted
# Drive; the cells below use paths relative to it (e.g. 'raw_data/...').
%cd /content/drive/My Drive/Data Science/Covid-19

### Download optional (required) files

Download nltk stopwords to use Stopwords
```
import nltk
nltk.download('stopwords')
```
Download nltk wordnet to use WordNetLemmatizer:
```
import nltk
nltk.download('wordnet')
```
Download nltk punkt to use Punkt Sentence Tokenizer
```
import nltk
nltk.download('punkt')
```

### Read all
Each paper is in JSON format

In [None]:
%%time
import os
folder = 'raw_data/comm_use_subset'
filenames = os.listdir(f'./{folder}')

import json
import pandas as pd

from src.covid_19_tp import authors_name, body_text, format_bib
data = pd.DataFrame(
    [
        {
            'paper_id': file['paper_id'],
            'title': file['metadata']['title'],
            'authors': authors_name(file['metadata']['authors'], affiliation=True),

            'abstract': body_text(file['abstract']),
            'text': body_text(file['body_text']),

            'bibliography': format_bib(file['bib_entries'])
        }
        for file in [
            json.load(open(f'{folder}/{filename}'))
            for filename in filenames
        ]
    ]
)

In [None]:
# Preview the first few rows of the DataFrame built above as the cell output.
data.head()

### Load NLP functions

In [None]:
from src.text_preprocessing import spacy_NLP, nltk_NLP

# spaCy wrapper configured with the 'en_core_web_sm' model name.
spacy_wrapper = spacy_NLP('en_core_web_sm')
spacy_tokenizer = spacy_wrapper.tokenize_API()

# NLTK wrapper with its default constructor arguments.
default_nltk_wrapper = nltk_NLP()
nlp_tokenizer = default_nltk_wrapper.tokenize_API()

from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# NLTK wrapper given the stemmer and lemmatizer classes (the classes themselves
# are passed, not instances); its custom_API() result is the third tokenizer.
custom_nltk_wrapper = nltk_NLP(
    stemming=PorterStemmer,
    lemmatisation=WordNetLemmatizer,
)
nlp_custom_tokenizer = custom_nltk_wrapper.custom_API()

### Create Corpus from dataset
Save the corpus as a pickle file to save time in the future; load the pickle file

In [None]:
import pickle
# makedirs replaces the isdir/mkdir pair below; `mkdir` and `isdir` stay
# imported in case a later cell relies on this cell bringing them into scope.
from os import makedirs, mkdir
from os.path import isdir

from tqdm import tqdm

from src.text_preprocessing import STOP_WORDS, text_preprocess

# One document per paper: title, abstract and body text joined by single spaces.
documents = data['title'] + ' ' + data['abstract'] + ' ' + data['text']

# Preprocess every document with the spaCy tokenizer; tqdm shows progress.
corpus = [
    text_preprocess(text, tokenizer=spacy_tokenizer, stopwords=STOP_WORDS)
    for text in tqdm(documents)
]

# The pickle name is derived from the raw-data folder path ('/' -> '_').
# NOTE: `folder` is rebound on the next line, so re-running this cell without
# first re-running the cell that sets the raw-data folder would derive
# `filename` from 'processed_data' instead.
filename = folder.replace('/', '_')
folder = 'processed_data'

# Create the output folder if needed; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
makedirs(f'./{folder}', exist_ok=True)

with open(f'{folder}/{filename}_corpus.pkl', 'wb') as output:
    pickle.dump(corpus, output)

### Load Corpus from Pickle

In [None]:
import pickle

# Location of the corpus pickle written by the "Create Corpus" cell.
folder = 'processed_data'
filename = 'raw_data_comm_use_subset_corpus'
corpus_path = f'./{folder}/{filename}.pkl'

# Only unpickle files you produced yourself: unpickling can execute code.
with open(corpus_path, 'rb') as pickle_file:
    corpus = pickle.load(pickle_file)

### Conduct TF-IDF

In [None]:
%%time
from src.tf_idf import corpus_tf_idf

# corpus_tf_idf returns two values, unpacked here into the per-document scores
# and the term document-frequency mapping inspected in the following cells.
corpus_doc_tf_idf, term_doc_freq = corpus_tf_idf(corpus)

In [None]:
'''
    corpus_doc_tf_idf: list of TF-IDF scores (term: score) for each document
    score:
        Low = frequent terms
        High = rare terms
'''
# Show only the entries for the first two documents.
corpus_doc_tf_idf[:2]

In [None]:
'''
    term_doc_freq: a dict (key: value pairs) of a term and its count of occurrence in different documents
'''
# Display the whole mapping as the cell output.
term_doc_freq