### Initialize Script

If you're running this script on Google Colab<br>
Mount your Google Drive:
1. Click on the folder icon on the left
2. Click Mount Drive
3. The root directory would be /content/
```
# your Google Drive folder would be at:
/content/drive/My Drive/
```

Change working directory:<br>
1. Run this command:
```
%cd /content/drive/My Drive/<your folder>
```

In [None]:
# Make the project folder on the mounted Drive the working directory, so the
# relative paths used by later cells (./raw_data, ./processed_data) resolve there.
%cd /content/drive/My Drive/Data Science/Covid-19

### Download required NLTK files (optional if already downloaded)

Download nltk stopwords to use Stopwords:
```
import nltk
nltk.download('stopwords')
```
Download nltk wordnet to use WordNetLemmatizer:
```
import nltk
nltk.download('wordnet')
```
Download nltk punkt to use Punkt Sentence Tokenizer:
```
import nltk
nltk.download('punkt')
```

### Load NLP text preprocessing function

In [None]:
from src.text_preprocessing import nltk_NLP, spacy_NLP, STOP_WORDS, text_preprocess

# Two tokenizer backends from the project's wrappers: spaCy (loaded with the
# 'en_core_web_sm' model name) and NLTK with its default settings.
spacy_tokenizer = spacy_NLP('en_core_web_sm').tokenize_API()
nlp_tokenizer = nltk_NLP().tokenize_API()

# Optional alternative: an NLTK tokenizer with explicit stemming/lemmatisation.
# from nltk.stem.porter import PorterStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
# nlp_custom_tokenizer = nltk_NLP(stemming=PorterStemmer, lemmatisation=WordNetLemmatizer).custom_API()


def text_prep(text):
    """Pre-process ``text`` via text_preprocess, using the spaCy tokenizer and STOP_WORDS."""
    return text_preprocess(text, tokenizer=spacy_tokenizer, stopwords=STOP_WORDS)

### Load and pre-process Dataset

In [None]:
from src.text_preprocessing import STOP_WORDS, text_preprocess

from os import walk as dir_list
from tqdm import tqdm
import json
import os

folder = 'raw_data'


def _read_json(path):
    """Parse the JSON file at ``path`` and close its handle as soon as it is read."""
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


def _paper_record(paper):
    """Reduce one parsed paper to its id, title and pre-processed abstract."""
    # All abstract paragraphs are concatenated, each terminated by a newline.
    abstract_text = ''.join(row['text'] + '\n' for row in paper['abstract'])
    return {
        'paper_id': paper['paper_id'],
        'title': paper['metadata']['title'],
        # text_prep is the pre-processing helper defined in the earlier cell.
        'abstract': text_prep(abstract_text),
    }


# Walk ./raw_data recursively. For each directory, first parse every file in
# it, then convert the parsed papers into records; each phase has its own
# progress bar.
data_title_abstract = []
for subdir, _dirs, files in dir_list(f'./{folder}'):
    papers = [
        _read_json(os.path.join(subdir, name))
        for name in tqdm(files, desc=f'Loading all files in {subdir}')
    ]
    data_title_abstract.extend(
        _paper_record(paper)
        for paper in tqdm(papers, desc=f'Reading individual files in {subdir}')
    )

### Create DataFrame with dataset

In [None]:
import pandas as pd

# Convert the list of per-paper dicts into a DataFrame (one row per paper).
data_title_abstract = pd.DataFrame(data=data_title_abstract)

### Save and Load Data to and from pickle file

In [None]:
import pickle

# Persist the DataFrame under ./processed_data so it can be reloaded later
# without rebuilding it.
folder, filename = 'processed_data', 'data_entire_title_abstract'
pickle_path = f'./{folder}/{filename}.pkl'
with open(pickle_path, mode='wb') as sink:
    pickle.dump(data_title_abstract, sink)

In [None]:
import pickle

# Restore the DataFrame previously pickled under ./processed_data.
folder, filename = 'processed_data', 'data_entire_title_abstract'
pickle_path = f'./{folder}/{filename}.pkl'
with open(pickle_path, mode='rb') as source:
    data_title_abstract = pickle.load(source)

### Prepare Corpus

In [None]:
# The corpus is the 'abstract' column as a plain list, one entry per paper.
abstract_column = data_title_abstract['abstract']
corpus = abstract_column.tolist()

### Conduct TF-IDF using the scikit-learn package

In [None]:
from src.tf_idf import sklearn_TFIDF

# Build the project's TF-IDF wrapper and hand it the full list of abstracts.
# NOTE(review): presumably this fits the vocabulary / IDF weights on the corpus —
# confirm against src/tf_idf.py.
sk_tfidf = sklearn_TFIDF()
# Kept as the cell's last expression, so any value it returns is shown as cell output.
sk_tfidf.tfidf_corpus(corpus)

In [None]:
# Create (or refresh) the 'keywords' column: for every abstract, store whatever
# sk_tfidf.get_text_keywords returns for it.
# Assigning the column directly adds it when missing and overwrites it when
# present, so re-running this cell never produces duplicate 'keywords' columns
# and the whole frame is not copied first.
data_title_abstract['keywords'] = data_title_abstract['abstract'].apply(
    sk_tfidf.get_text_keywords
)

### Save new DataFrame as pickle file

In [None]:
import pickle

# Persist the DataFrame, now including the 'keywords' column, under ./processed_data.
folder, filename = 'processed_data', 'data_entire_title_abstract_keywords'
pickle_path = f'./{folder}/{filename}.pkl'
with open(pickle_path, mode='wb') as sink:
    pickle.dump(data_title_abstract, sink)