Change background color for output as it wasn't distinguishable from the Markdown text.

In [44]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
%autoreload 2

[autoreload of utils.s2orc.read_dataset failed: Traceback (most recent call last):
  File "/home/vivoli/miniconda3/envs/arxiv-manipulation/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/vivoli/miniconda3/envs/arxiv-manipulation/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/home/vivoli/miniconda3/envs/arxiv-manipulation/lib/python3.7/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/home/vivoli/miniconda3/envs/arxiv-manipulation/lib/python3.7/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 630, in _exec
  File "<frozen importlib._bootstrap_external>", line 728, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/home/vivoli/Thesis/notebooks/utils/s2orc/read_dataset.py"

# Datasets
---

In this notebook we'll build/implement the Dataset classes we need to work with all the datasets we have.
First we will introduce the datasets, then we will separate them based on how we are going to use them, and finally we will use/build/implement our classes in order to manage those different datasets and tasks.

# 0.0 Utils
---

We will be using the 🤗 *Datasets* library and the 🤗 *Transformers* library, as we need a tokenizer and a vocab. For logging we'll be using Weights and Biases (`wandb`), so we are going to install it independently from Hugging Face and use it within Hugging Face.

Let's define all the `imports` and `hyperparameters` in one place.

In [30]:
# ----------------------------------- #
#           All Imports
# ----------------------------------- #
import os # generic
import time # logging
from tqdm.auto import tqdm # custom progress bar

import io
import json # load/write data
import torch 
import numpy as np
import pandas as pd

# 🤗 Datasets
from datasets import (
    load_dataset, 
    # concatenate_datasets,
    DatasetDict, 
    Dataset as hfDataset
)

# 🤗 Tranformers
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    PreTrainedTokenizer, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    BertForMaskedLM,
    TrainingArguments
)

# s2orc read dataset
from utils.s2orc.read_dataset import (
    # read_meta_json_list_dict,
    # read_pdfs_json_list_dict,
    # s2orc_chunk_read,
    s2orc_multichunk_read
)

# s2orc load dataset (not preprocessed)
from utils.s2orc.loader import (
    s2ortc_loader
)

# Preprocessing
from utils.s2orc.preprocessing import (
    # fuse_dictionaries,
    # getDataset,
    # data_target_preprocess,
    # mag_preprocess,
    preprocessing
)

# Dataset configuration files
from utils.config.datasets import (
    S2orcConfig,
    KeyPHConfig
)

from utils.config.execution import (
    RunConfig
)

# Padding
from torch.nn.utils.rnn import pad_sequence

# data types
from torch.utils.data import (
    Dataset, 
    DataLoader
)
from typing import (
    Dict, List, Union
)

In [34]:
# ----------------------------------- #
#           Hyperparameters
# ----------------------------------- #
# Plain dictionaries holding every tunable value of the notebook.
# `dataset_config` and `run_config` are unpacked into `S2orcConfig` / `RunConfig`
# in the next cell, so their keys must match the fields those classes accept.

# --------- dataset         --------- #
dataset_config = {
    'dataset_name':'s2orc',           # str
    'dataset_config_name': 'full',    # str
    'idxs': '0,1,2',                  # list or comma-separated string of values
    'only_extrated': False,           # bool -- NOTE(review): spelling presumably mirrors the S2orcConfig field; confirm before renaming
    'keep_extracted': False,          # bool
    'mag_field_of_study': ['Computer Science'], # list
    'data': ['abstract'],             # list
    'target': ['title'],              # list
    'classes': ['mag_field_of_study'] # list
}

# --------- logging         --------- #
logging_config = {
    'verbose': True,      # bool
    'debug': False,       # bool
    'callback': 'WandbCallback'   # Callbacks: (supported) 'PrinterCallback/ProgressCallback', 'TensorBoardCallback', 'WandbCallback', 'CometCallback', 'MLflowCallback', 'AzureMLCallback'
}

# Legacy standalone flags, superseded by `logging_config` above:
# verbose = True
# debug = False
# wandb_flag = True

# Logging on Weights and Biases
# `wandb` is imported (and the login prompt shown) only when the W&B callback is selected.
if logging_config['callback'] == 'WandbCallback':
    import wandb
    wandb.login()

# --------- preprocessing   --------- #
# in **partial_prepare_data**
preprocessing_config = {
    'keep_none_papers': False, # if False, remove papers with None either in abstract or title
    'keep_unused_columns': False # if False, remove columns not in dictionary
}

# Legacy standalone flags, superseded by `preprocessing_config` above:
# remove_None_papers = True
# remove_Unused_columns = True


# --------- paths           --------- #
# Absolute, machine-specific location of the datasets.
DATA_PATH = '/home/vivoli/Thesis/data'

# --------- model/tokenizer --------- #
# Hugging Face model/tokenizer name
MODEL_PATH = 'allenai/scibert_scivocab_uncased'

# --------- Run config -------------- #
# NOTE(review): the key below is 'iter', while the cell that builds `output_dir`
# reads `run_config.iteration` -- confirm that RunConfig maps one onto the other.
run_config = {
    # RunArguments
    'name': 'scibert-s2orc',
    'number': 0,
    'iter': 0,
    # TrainingArguments
    'seed': 1234 # seed for reproducibility of experiments
}

In [35]:
# Turn the plain dictionaries from the hyperparameters cell into typed config objects.
# The names are rebound in place (dict -> config object), so the conversion is guarded:
# without the guard, re-running this cell would try to `**`-unpack an already-built
# config object instead of the original dictionary.
if isinstance(dataset_config, dict):
    dataset_config: S2orcConfig = S2orcConfig(**dataset_config)
if isinstance(run_config, dict):
    run_config: RunConfig = RunConfig(**run_config)

# Per-run output folder: run number, iteration, name and seed identify the experiment.
output_dir=f"./tmp_trainer/#{run_config.number}_{run_config.iteration}_{run_config.name}_seed{run_config.seed}"

In [None]:
# ----------------------------------- #
#           Logging
# ----------------------------------- #
# Console handler: DEBUG when `logging_config['verbose']` is set, WARNING otherwise.
# `file.log` collects errors only; `debug.log` collects everything.
LOGS_PATH = 'logs'
import logging

# FileHandler does not create missing directories, so make sure the folder exists.
os.makedirs(LOGS_PATH, exist_ok=True)

# Create a custom logger
logger = logging.getLogger("datasets.explanation")

# Without an explicit level the logger inherits WARNING from the root logger and
# discards INFO/DEBUG records before any handler gets to see them.
logger.setLevel(logging.DEBUG)

# `getLogger` returns the same object on every call: drop handlers left over from a
# previous run of this cell, otherwise every message would be emitted several times.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Create handlers
c_handler = logging.StreamHandler()
f_handler = logging.FileHandler(f'{LOGS_PATH}/file.log')
d_handler = logging.FileHandler(f'{LOGS_PATH}/debug.log')

# The verbosity flag lives in `logging_config` (the standalone `verbose` variable
# no longer exists): verbose means "log everything on the console".
c_handler.setLevel(logging.DEBUG if logging_config['verbose'] else logging.WARNING)
f_handler.setLevel(logging.ERROR)
d_handler.setLevel(logging.DEBUG)

# Create formatters and add them to the handlers
c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
d_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)
d_handler.setFormatter(d_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)
logger.addHandler(d_handler)

# Smoke test: one message per level, to check where each handler routes it.
logger.warning('This is a warning')
logger.error('This is an error')
logger.info('This is an info')
logger.debug('This is a debug')


In [None]:
# Start a Weights & Biases run in the "huggingface" project.
# `wandb` is only imported by the hyperparameters cell when
# logging_config['callback'] == 'WandbCallback', so this cell relies on that setting.
wandb.init(project="huggingface")
# Optional: log both gradients and parameters
%env WANDB_WATCH=all

# 0.1 KeyPhrase Dataset
---

These are testing datasets

This keyphrase dataset could be useful for testing some model on keyphrase task or abstract-title summarization/generation/embedding.

For now, we can avoid implementing the `Dataset` and `DataLoader` classes for these objects.

Nonetheless, the dataset and `DataLoader` would be simple, as follows:

The `data` object is composed of `500 tuples`, each one composed of 4 objects:
- `title_tensor_` is the title embedding (composed of integer values)
- `abstract_tensor_` is the abstract embedding (composed of integer values)
- `fulltext_tensor_` is the fulltext embedding (composed of integer values)
- `keywords_tensor_` is the keywords embedding (composed of integer values)

# 0.2 S2ORC Dataset
---

## 0.2.1 S2ORC ( jsonl | jsonl.gz ) Loader 
---
First of all we need to deal with the data, which may still be compressed (`jsonl.gz`) or already unzipped (`jsonl`).

In [11]:
DATA_PATH = DATA_PATH
!ls $DATA_PATH

README.md  s2orc-full-20200705v1  s2orc-sample-20200705v1  sentence-tranformers
keyphrase  s2orc-mini		  scibert		   snli_1.0


## 0.2.2 Creation (s2orc)
---

Now that we have explored the `S2ORC` structure, we are ready to load the data (starting from the `sample` folder and then moving on to the `full` folder). The first thing to do is to create (as we did before) a method for reading the json: `json_s2orc_read`.

Let's see what's inside the folder (in this case `metadata`, but it should be the same for `pdf_parses`):

#### a. Intersection
---
As we want to examine all the `meta_s2orc` and `pdfs_s2orc` files, we need to find the intersection between those file names and the `metadata_output` and `pdf_parses_output` lists.

So we can describe the function in charge of loading the `jsonl` files. The function must take as input the `generic_path` (f"{DATA_PATH}/{SAMPLE_FOLDER}") and then search in `metadata` and `pdf_parses` for the files present in `file_names`.

Important objects are:
    
- `s2orc_path` ('/home/vivoli/Thesis/data/s2orc-full-20200705v1/full')
- `meta_s2orc_path` (f'{s2orc_path}/metadata')
- `pdfs_s2orc_path` (f'{s2orc_path}/pdf_parses')
- `toread_meta_s2orc` ( ['metadata_0.jsonl.gz', 'metadata_1.jsonl.gz'] )
- `toread_pdfs_s2orc` ( ['pdf_parses_0.jsonl.gz', 'pdf_parses_1.jsonl.gz'] )

In [17]:
# we need to get `vocab` and the `tokenizer`, all comes with *AutoTokenizer*
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModel.from_pretrained(MODEL_PATH)

In [18]:
max_seq_length = model.config.max_position_embeddings

In [14]:
toread_meta_s2orc, toread_pdfs_s2orc = dataset_config.memory_save_pipelines()

We have used only the `sample.jsonl` or the pair (`metadata_0.jsonl`-`pdf_parses_0.jsonl`) so we just have one element in the `multichunks_lists`. 

We have parsed all the `metadata` and `pdf_parses` elements, so we now have a dictionary that is composed of:
```python
json_dict_of_list = {
    'metadata': [], 
    'pdf_parses': [], 
    'meta_key_idx': {}, 
    'pdf_key_idx': {}
}
```
In this dictionary we see:
* metadata - `List[dict]` of type `metadata`.
* pdf_parses - `List[dict]` of type `pdf_parses`.
* meta_key_idx - `dict` with keys: `paper_id` and values: `index` in the metadata list.
* pdf_key_idx - `dict` with keys: `paper_id` and values: `index` in the pdf_parses list.

In [20]:
# get dictionary input from config
dictionary_input = dataset_config.get_dictionary_input()

In [21]:
dictionary_input

{'data': ['abstract'], 'target': ['title'], 'classes': ['mag_field_of_study']}

In [22]:
dictionary_columns = sum(dictionary_input.values(), [])

In [23]:
dictionary_columns

['abstract', 'title', 'mag_field_of_study']

In [28]:
all_datasets = s2ortc_loader(dataset_config, run_config)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…



len meta single_chunk: 121562
len pdfs single_chunk: 51058
[TIME] load_dataset: 2.0178585052490234
[TIME] dataset_train selection: 2.384185791015625e-07
[TIME] remove.indexes: 0.34812331199645996
[TIME] remove.concat: 9.560585021972656e-05


HBox(children=(FloatProgress(value=0.0, max=122.0), HTML(value='')))


[TIME] remove.filter: 2.034330368041992
[TIME] remove None fields: 2.3826234340667725
[TIME] remove.column: 5.0067901611328125e-06
[TIME] first [train-(test-val)] split: 0.0033721923828125
[TIME] second [test-val] split: 0.00179290771484375
[TIME] TOTAL: 4.405852794647217
len meta single_chunk: 121126
len pdfs single_chunk: 50853
[TIME] load_dataset: 2.082895040512085
[TIME] dataset_train selection: 2.384185791015625e-07
[TIME] remove.indexes: 0.3542666435241699
[TIME] remove.concat: 9.274482727050781e-05


HBox(children=(FloatProgress(value=0.0, max=122.0), HTML(value='')))


[TIME] remove.filter: 2.026700019836426
[TIME] remove None fields: 2.3811447620391846
[TIME] remove.column: 4.5299530029296875e-06
[TIME] first [train-(test-val)] split: 0.003193378448486328
[TIME] second [test-val] split: 0.0017948150634765625
[TIME] TOTAL: 4.4692065715789795
len meta single_chunk: 121487
len pdfs single_chunk: 50529
[TIME] load_dataset: 2.072370767593384
[TIME] dataset_train selection: 0.0
[TIME] remove.indexes: 0.35334277153015137
[TIME] remove.concat: 9.1552734375e-05


HBox(children=(FloatProgress(value=0.0, max=122.0), HTML(value='')))


[TIME] remove.filter: 1.955249309539795
[TIME] remove None fields: 2.3087570667266846
[TIME] remove.column: 2.8371810913085938e-05
[TIME] first [train-(test-val)] split: 0.0032918453216552734
[TIME] second [test-val] split: 0.0017571449279785156
[TIME] TOTAL: 4.3863441944122314


In [36]:
all_datasets

DatasetDict({
    train: Dataset({
        features: ['abstract', 'title', 'mag_field_of_study'],
        num_rows: 216042
    })
    test: Dataset({
        features: ['abstract', 'title', 'mag_field_of_study'],
        num_rows: 27007
    })
    valid: Dataset({
        features: ['abstract', 'title', 'mag_field_of_study'],
        num_rows: 27005
    })
})

In [37]:
all_datasets['train'][:3]

{'abstract': ['We develop a new algorithm to perform facial reconstruction from a given skull. This technique has forensic application in helping the identification of skeletal remains when other information is unavailable. Unlike most existing strategies that directly reconstruct the face from the skull, we utilize a database of portrait photos to create many face candidates, then perform a superimposition to get a well matched face, and then revise it according to the superimposition. To support this pipeline, we build an effective autoencoder for image-based facial reconstruction, and a generative model for constrained face inpainting. Our experiments have demonstrated that the proposed pipeline is stable and accurate.',
  'Thank you for downloading reporting public affairsproblems and solutions. As you may know, people have search hundreds times for their favorite novels like this reporting public affairsproblems and solutions, but end up in infectious downloads. Rather than readin

In [38]:
raw_datasets = load_dataset("stsb_multi_mt", name="en")

Couldn't find file locally at stsb_multi_mt/stsb_multi_mt.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.5.0/datasets/stsb_multi_mt/stsb_multi_mt.py.
The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/stsb_multi_mt/stsb_multi_mt.py.
Reusing dataset stsb_multi_mt (/home/vivoli/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/bc6de0eaa8d97c28a4c22a07e851b05879ae62c60b0b69dd6b331339e8020f07)


In [39]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})

In [40]:
raw_datasets['train'][:3]

{'sentence1': ['A plane is taking off.',
  'A man is playing a large flute.',
  'A man is spreading shreded cheese on a pizza.'],
 'sentence2': ['An air plane is taking off.',
  'A man is playing a flute.',
  'A man is spreading shredded cheese on an uncooked pizza.'],
 'similarity_score': [5.0, 3.799999952316284, 3.799999952316284]}

## Multichunks getDataset( (id, multichunk) | (single_chunk) )
---

In [None]:
DATA_FIELD =  ["title", "abstract"]
dataset_dict_test = fuse_dictionaries(multichunks_lists[0], data_field=DATA_FIELD)

## Multichunks getDatasets
---

In [None]:
# tokenizer from 'allenai/scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = BertForMaskedLM.from_pretrained(MODEL_PATH)

In [None]:
max_seq_length = model.config.max_position_embeddings
print(max_seq_length)

In [None]:
%time

dictionary_input = { "data": ["abstract"], "target": ["title"], "classes": ["mag_field_of_study"]}
dictionary_columns = sum(dictionary_input.values(), [])

# here we use meta_s2orc for speed, 
dataset = getDataset(multichunks_lists[0], tokenizer, data_field=dictionary_columns, max_seq_length=max_seq_length, seed=SEED)

In [None]:
dataset

## S2ORC Preparation
---

To build a generic loading function we take inspiration from [here](https://discuss.huggingface.co/t/pipeline-with-custom-dataset-tokenizer-when-to-save-load-manually/1084/11).

In [41]:
# Inspect the tokenizer vocabulary: first the ids of the special tokens,
# then the tokens sitting at a handful of low ids.
vocab = tokenizer.get_vocab()

for special_token in ("[PAD]", "[UNK]", "[SEP]", "[CLS]"):
    print(f"{special_token}: {vocab[special_token]}")

for token_id in (0, 1, 2, 99, 100, 101):
    print(f"{token_id}: {tokenizer.convert_ids_to_tokens(token_id)}")

[PAD]: 0
[UNK]: 101
[SEP]: 103
[CLS]: 102
0: [PAD]
1: [unused0]
2: [unused1]
99: [unused98]
100: [unused99]
101: [UNK]


In [42]:
tokenizer

PreTrainedTokenizerFast(name_or_path='allenai/scibert_scivocab_uncased', vocab_size=31090, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

Finally, I found [this](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=datasetdict#datasets.DatasetDict.map) documentation for the function `DatasetDict.map` from the `datasets` library.

In [None]:
debug = False

dataset_map = dataset.map(data_target_preprocess, input_columns= dictionary_columns, fn_kwargs= dictionary_input, batched=True)

In [43]:
dataset_map

NameError: name 'dataset_map' is not defined

In [None]:
# Integer id for every MAG field of study.
#
# Id 3 is deliberately missing from the table: it is reserved for papers whose
# `mag_field_of_study` is null (a real null value, not the string "null"), e.g.
#
#      real_mag_field_value = paper_metadata['mag_field_of_study']
#      mag_field_dict.get(real_mag_field_value, 3)
#
# so any value that is not a key of the dictionary falls back to id 3.
mag_field_dict: Dict = {
    "Medicine": 0,
    "Biology": 1,
    "Chemistry": 2,
    # 3 -> null (see above)
    "Engineering": 4,
    "Computer Science": 5,
    "Physics": 6,
    "Materials Science": 7,
    "Mathematics": 8,
    "Psychology": 9,
    "Economics": 10,
    "Political Science": 11,
    "Business": 12,
    "Geology": 13,
    "Sociology": 14,
    "Geography": 15,
    "Environmental Science": 16,
    "Art": 17,
    "History": 18,
    "Philosophy": 19,
}

In [None]:
dataset_mag_map = dataset_map.map(mag_preprocessing, input_columns= dictionary_input['classes'], batched=True)

In [None]:
dataset_mag_map

# Rename it as you want
---

- `dataset_map.rename_column`, method for renaming a column
- `dataset_map.set_format`, method for defining which columns need to be returned

In [None]:
dataset_mag_map = dataset_mag_map.rename_column("data_input_ids", "input_ids")

In [None]:
dataset_mag_map.set_format("torch", columns=["input_ids"])

In [None]:
print(dataset_mag_map['train'][1]['input_ids'].size())

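Then, if you want to store it, use `%store`: the variable is persisted by IPython (in the storage of the current IPython profile, for the environment you are working in) and can be restored in a later session with `%store -r`.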

In [None]:
%store dataset_mag_map

---
---
## ❌ FAKE PIPELINE for train BERT-based NETS
---
---

In [None]:
%store -r dataset_mag_map

In [None]:
# If you print some element from `dataset_mag_map['train'][element_index]['input_ids']` you'll see that
# lots of its entries are 0 (presumably padding: the tokenizer printout earlier in this notebook shows [PAD] has id 0).
# For every training example, count how many token ids are non-zero.
vect = [ele[ele.nonzero()].size(0) for ele in dataset_mag_map['train'][:]['input_ids']]

In [None]:
max_vect = max(vect)
min_vect = min(vect)
sum_vect = sum(vect)
len_vect = len(vect)

print(f" max: {max_vect} \n min: {min_vect} \n avg: {sum_vect/len_vect}")

In [None]:
print(dataset_mag_map['train'][:10])

---
---
## ❌ FAKE PIPELINE for train BERT-based NETS
---
---

From [here](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_mlm.py) you can get an idea of where the code has been borrowed from.

In [None]:
# Data collator
# This one will take care of randomly masking the tokens (15% of them) for the MLM objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

train_dataset = dataset_mag_map['train']
eval_dataset = dataset_mag_map['valid']

# Initialize TrainingArguments
training_args = TrainingArguments(
    output_dir=output_dir,           # [def.`tmp_trainer`] output directory
    num_train_epochs=3,              # [def.   3 ] total # of training epochs
    per_device_train_batch_size=16,   # [def.   8 ] batch size per device during training
    per_device_eval_batch_size=16,    # [def.   8 ] batch size for evaluation
    evaluation_strategy="no",        # [def. 'no'] no evaluation during training; the eval set is only used by an explicit trainer.evaluate()
    warmup_steps=0,                  # [def.   0 ] number of warmup steps for learning rate scheduler
    weight_decay=0,                  # [def.   0 ] strength of weight decay 
    learning_rate=5e-4,              # [def. 5e-5] 
    logging_dir='./logs',            # [def. runs/__id__] directory for storing logs. TensorBoard log directory.
    seed=run_config.seed,            # [def.  42 ] use the configured seed, which is also part of `output_dir`
)

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)



In [None]:
# Training
# Runs the training loop, saves the model, then records the training metrics.

# No checkpoint to resume from: training starts from the currently loaded weights.
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()  # Saves the tokenizer too for easy upload
metrics = train_result.metrics

# The reference script caps the sample count with `data_args.max_train_samples`;
# there is no such cap here, so the value is simply the size of the training set.
# max_train_samples = data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
max_train_samples = len(train_dataset)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))

# Log the metrics, write them to disk and save the trainer state.
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()



In [None]:
# Evaluation
# Evaluates on `eval_dataset`, derives the perplexity from the evaluation loss
# and records the metrics.
import math

logger.info("*** Evaluate ***")

metrics = trainer.evaluate()

# The reference script caps the sample count with `data_args.max_val_samples`;
# there is no such cap here, so the value is simply the size of the evaluation set.
# max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
max_val_samples = len(eval_dataset)
metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

# Perplexity is exp(eval_loss). `math.exp` raises OverflowError for very large
# losses (e.g. a diverged run): report infinity instead of aborting the cell
# before the metrics are logged and saved.
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")
metrics["perplexity"] = perplexity

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [None]:
# Mark the current Weights & Biases (`wandb`) run as finished.
wandb.finish()

---
# 1. Introduction
---

The following datasets were downloaded from the internet (we try to provide links to those we have the right to do so). We divide the dataset based on the task they are mostly used for.

## 1.1 Keyphrase task
---

SOTA: [keyphrase generation](https://arxiv.org/pdf/1704.06879.pdf).

The Keyphrase datasets (***duc***, ***Inspec***, ***Krapivin***, ***NUS***, ***SemEval-2010***, ***KP20k dataset***, ***MagKP-CS***) are structured as follows:

- title
- abstract
- fulltext
- keywords

The only dataset that differs is ***STACKEX***, which instead of *abstract* and *keywords* has:

- question (abstract)
- tags (keywords)

Here there is a list of the datasets previously cited, with some information:

- **duc**: we have not found much information on this dataset so far.

- **Inspec** [(Hulth, 2003)](https://www.aclweb.org/anthology/W03-1028.pdf), This dataset provides *2,000 paper abstracts*. We adopt the *500 testing* papers and their corresponding uncontrolled keyphrases for evaluation, and the remaining *1,500 papers* are used for *training* the supervised baseline models.

- **Krapivin** [(Krapivin et al., 2008)](http://eprints.biblio.unitn.it/1671/1/disi09055-krapivin-autayeu-marchese.pdf): This dataset provides *2,304 papers with full-text* and *author-assigned keyphrases*. However, the author did not mention how to split testing data, so we selected the first *400 papers in alphabetical order as the testing data*, and the *remaining* papers are used to *train* the supervised baselines.

- **NUS** [(Nguyen and Kan, 2007)](https://www.comp.nus.edu.sg/~kanmy/papers/icadl2007.pdf): We use both author-assigned and reader-assigned keyphrases and treat *all 211 papers as the testing data*. Since the NUS dataset did not specifically mention the ways of splitting training and testing data, the results of the supervised baseline models are obtained through a *five-fold cross-validation*.

- **SemEval-2010** [(Kim et al., 2010)](https://www.aclweb.org/anthology/S10-1004.pdf): 288 articles were collected from the ACM Digital Library. 100 articles were used for testing and the rest were used for training supervised baselines.

- **KP20k dataset** [(Meng et al., 2018)](https://arxiv.org/abs/1704.06879): They built a new testing dataset that contains the *titles, abstracts, and keyphrases* of *20,000 scientific articles* in computer science. They were *randomly selected from their obtained 567,830 articles*. Thus they took the 20,000 articles in the validation set to train the supervised baselines.

- **MagKP-CS** (from OpenNMT-py and [OpenNMT-kpg-release](https://github.com/memray/OpenNMT-kpg-release)) that is available for download. 

- **STACKEX** (from [StackExchange](https://archive.org/details/stackexchange)) has been constructed from the computer science forums (CS/AI) at StackExchange using “title” + “body” as source text and “tags” as the target keyphrases. After removing questions without valid tags, they collected 330,965 questions. They randomly selected *16,000 for validation*, and another *16,000 as test set*. Note some questions in StackExchange forums contain large blocks of code, resulting in long texts (sometimes more than 10,000 tokens after tokenization), this is difficult for most neural models to handle. Consequently, the texts have been truncated to 300 tokens and 1,000 tokens for training and evaluation splits respectively.

###### ⚠️ATTENTION
> As we aren't going to use the Keyphrase datasets for now, we don't need any custom classes for managing them. We will implement these functions and classes as we go, if the need arises.

## 1.2 Sentence embedding task
---

SOTA: [sBERT](https://arxiv.org/abs/1908.10084)

- **SNLI** [(Bowman et al., 2015)](https://arxiv.org/abs/1508.05326) is a collection of *570,000 sentence pairs* annotated with the *labels contradiction, entailment, and neutral*.

- **MultiNLI** [(Williams et al., 2018)](https://arxiv.org/abs/1704.05426) contains *430,000 sentence pairs* and covers a *range of genres of spoken and written text*.

- **SciTail** [(allenai)](http://ai2-website.s3.amazonaws.com/publications/scitail-aaai-2018_cameraready.pdf), the entailment dataset consists of 27k. In contrast to the SNLI and MultiNLI, it was not crowd-sourced but created from sentences that already exist “in the wild”. *Hypotheses* were created from *science questions* and the corresponding *answer candidates*, while relevant web sentences from a large corpus were used as premises. Models are evaluated based on accuracy.

###### ❌ATTENTION
> As we aren't going to use the NLI datasets (for now), we don't need any custom classes for managing them. We will implement these functions and classes as we go, if the need arises.

## 1.3 Generic NLP tasks
---

- **S2ORC** [(Lo et al., 2020)](https://github.com/allenai/s2orc) is a large corpus of *81.1M English-language academic papers* spanning many academic disciplines. The corpus consists of *rich metadata, paper abstracts, resolved bibliographic references*, as well as *structured full text for 8.1M open access papers*. Full text is annotated with automatically-detected inline mentions of citations, figures, and tables, each linked to their corresponding paper objects. In S2ORC, they aggregate papers from hundreds of academic publishers and digital archives into a unified source, and create the largest publicly-available collection of machine-readable academic text to date. Built for text mining over academic text.

- **OAG** [(Tang et al., 2008)](http://keg.cs.tsinghua.edu.cn/jietang/publications/KDD08-Tang-et-al-ArnetMiner.pdf)  is a large knowledge graph unifying *two billion-scale academic graphs*: Microsoft Academic Graph (**MAG**) and **AMiner**. In mid 2017, they published OAG v1, which contains *166,192,182 papers from MAG and 154,771,162 papers from AMiner* and generated *64,639,608 linking (matching) relations between the two graphs*. This time, in OAG v2, author, venue and newer publication data and the corresponding matchings are available.



###### ✅ATTENTION
> We are going to use the S2ORC dataset as it contains full-text data as well as citation/reference information. It also contains authorship, title and table data, which we describe below.

---
# 2. S2ORC
---

## 2.1 Description (s2orc)
---
The `S2ORC` dataset is in the `data` path under the folder `s2orc-full-20200705v1` (where `s2orc` is the name of the dataset, `full` is the type, as there is also a sample fingerprint; and `20200705v1` is the version). 
We can reach it by leaving the project folder and entering the data folder:

As you can see (going into `s2orc-full-20200705v1/full/`) there are the `metadata` folder and the `pdf_parses` folder. The main difference (as the names already suggest) is that `metadata` only holds some information about each paper (retrieved from the published metadata), while `pdf_parses` holds all the extensive data contained in the paper (if the paper was present, was correctly parsed and no restrictions on the paper data were applied due to limited license permissions). For some reason, the `title` of the paper is contained only in the `metadata` file, but it can be retrieved through the `paper_id` field of the paper itself.

More information about the `S2ORC` dataset can be read in the [README.md](https://github.com/allenai/s2orc/blob/master/README.md) of the project and in the [project repository](https://github.com/allenai/s2orc/)

### mag field
- MAG fields of study:

| Field of study | All papers | Full text |
|----------------|------------|-----------|
| Medicine       | 12.8M      | 1.8M      |
| Biology        | 9.6M       | 1.6M      |
| Chemistry      | 8.7M       | 484k      |
| n/a            | 7.7M       | 583k      |
| Engineering    | 6.3M       | 228k      |
| Comp Sci       | 6.0M       | 580k      |
| Physics        | 4.9M       | 838k      |
| Mat Sci        | 4.6M       | 213k      |
| Math           | 3.9M       | 669k      |
| Psychology     | 3.4M       | 316k      |
| Economics      | 2.3M       | 198k      |
| Poli Sci       | 1.8M       | 69k       |
| Business       | 1.8M       | 94k       |
| Geology        | 1.8M       | 115k      |
| Sociology      | 1.6M       | 93k       |
| Geography      | 1.4M       | 58k       |
| Env Sci        | 766k       | 52k       |
| Art            | 700k       | 16k       |
| History        | 690k       | 22k       |
| Philosophy     | 384k       | 15k       |

We now need a function that reads all the lines of the `jsonl` files inside both the `metadata` and `pdf_parses` folders.

## `metadata` schema

We recommend everyone work with `metadata/` as the starting point.  This is a JSONlines file (one line per paper) with the following keys:

#### Identifier fields

* `paper_id`: a `str`-valued field that is a unique identifier for each S2ORC paper.

* `arxiv_id`: a `str`-valued field for papers on [arXiv.org](https://arxiv.org).

* `acl_id`: a `str`-valued field for papers on [the ACL Anthology](https://www.aclweb.org/anthology/).

* `pmc_id`: a `str`-valued field for papers on [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/articles).

* `pubmed_id`: a `str`-valued field for papers on [PubMed](https://pubmed.ncbi.nlm.nih.gov/), which includes MEDLINE.  Also known as `pmid` on PubMed.

* `mag_id`: a `str`-valued field for papers on [Microsoft Academic](https://academic.microsoft.com).

* `doi`: a `str`-valued field for the [DOI](http://doi.org/).  

Notably:

* Resolved citation links are represented by the cited paper's `paper_id`.

* The `paper_id` resolves to a Semantic Scholar paper page, which can be verified using the `s2_url` field.

* We don't always have a value for every identifier field.  When missing, they take `null` value.


#### Metadata fields

* `title`: a `str`-valued field for the paper title.  Every S2ORC paper *must* have one, though the source can be from publishers or parsed from PDFs.  We prioritize publisher-provided values over parsed values.

* `authors`: a `List[Dict]`-valued field for the paper authors.  Authors are listed in order.  Each dictionary has the keys `first`, `middle`, `last`, and `suffix` for the author name, which are all `str`-valued with exception of `middle`, which is a `List[str]`-valued field.  Every S2ORC paper *must* have at least one author.

* `venue` and `journal`: `str`-valued fields for the published venue/journal.  *Please note that there is not often agreement as to what constitutes a "venue" versus a "journal". Consolidating these fields is being considered for future releases.*   

* `year`: an `int`-valued field for the published year.  If a paper is preprinted in 2019 but published in 2020, we try to ensure the `venue/journal` and `year` fields agree & prefer non-preprint published info. *We know this decision prohibits certain types of analysis like comparing preprint & published versions of a paper.  We're looking into it for future releases.*  

* `abstract`: a `str`-valued field for the abstract.  These are provided directly from gold sources (not parsed from PDFs).  We preserve newline breaks in structured abstracts, which are common in medical papers, by denoting breaks with `':::'`.     

* `inbound_citations`: a `List[str]`-valued field containing `paper_id` of other S2ORC papers that cite the current paper.  *Currently derived from PDF-parsed bibliographies, but may have gold sources in the future.*

* `outbound_citations`: a `List[str]`-valued field containing `paper_id` of other S2ORC papers that the current paper cites.  Same note as above.   

* `has_inbound_citations`: a `bool`-valued field that is `true` if `inbound_citations` has at least one entry, and `false` otherwise.

* `has_outbound_citations` a `bool`-valued field that is `true` if `outbound_citations` has at least one entry, and `false` otherwise.

We don't always have a value for every metadata field.  When missing, `str` fields take `null` value, while `List` fields are empty lists.

#### PDF parse-related metadata fields

* `has_pdf_parse`:  a `bool`-valued field that is `true` if this paper has a corresponding entry in `pdf_parses/`, which means we had processed that paper's PDF(s) at some point.  The field is `false` otherwise.

* `has_pdf_parsed_abstract`: a `bool`-valued field that is `true` if the paper's PDF parse contains a parsed abstract, and `false` otherwise.   

* `has_pdf_parsed_body_text`: a `bool`-valued field that is `true` if the paper's PDF parse contains parsed body text, and `false` otherwise.

* `has_pdf_parsed_bib_entries`: a `bool`-valued field that is `true` if the paper's PDF parse contains parsed bibliography entries, and `false` otherwise.

* `has_pdf_parsed_ref_entries`: a `bool`-valued field that is `true` if the paper's PDF parse contains parsed reference entries (e.g. tables, figures), and `false` otherwise.

Please note:

* If `has_pdf_parse = false`, the other four fields will not be present in the JSON (trivially `false`).

* If `has_pdf_parse = true` but `has_pdf_parsed_abstract`, `has_pdf_parsed_body_text`, or `has_pdf_parsed_ref_entries` are `false`, this can be because:

    * Our PDF parser failed to extract that element
    * Our PDF parser succeeded but that paper simply did not have that element (e.g. papers without abstracts)
    * Our PDF parser succeeded but that element was removed because the paper is not identified as open-access.  


##### metadata_CLASS
```python
{
 "paper_id": (string), 
 "title": (string), 
 "authors": [
     {
         "first": (string), 
         "middle": [], 
         "last": (string), 
         "suffix": (string)
     },
     ...
   ]: **Author_Class**, 
 "abstract": (string), 
 "year": (int), 
 "arxiv_id": null, 
 "acl_id": null, 
 "pmc_id": null, 
 "pubmed_id": null, 
 "doi": null, 
 "venue": null, 
 "journal": (string), 
 "mag_id": (string-number), 
 "mag_field_of_study": [
     "Medicine",
     "Computer Science"
   ]: **FieldOfStudy_Enum**, 
 "outbound_citations": [], 
 "inbound_citations": [], 
 "has_outbound_citations": false, 
 "has_inbound_citations": false, 
 "has_pdf_parse": false, 
 "s2_url": (string)
}
```

Here I represent Author_Class as an object of 
```python
{
    "first": (string), 
    "middle": [], 
    "last": (string), 
    "suffix": (string)
}
```
and `FieldOfStudy_Enum` as an Enum of string such as `[ "Medicine", "Computer Science", "Physics", "Mathematics", ... ]`


## `pdf_parses` schema

We view `pdf_parses/` as supplementary to the `metadata/` entries.  PDF parses are also represented as JSONlines file (one line per paper) with the following keys:

* `paper_id`: a `str`-valued field which is the same S2ORC paper ID in `metadata/`

* `_pdf_hash`: a `str`-valued field.  Internal usage only.  We use this for debugging.

* `abstract` and `body_text` are `List[Dict]`-valued fields representing parsed text from the PDF.  Each `Dict` corresponds to a paragraph.  `List` preserves their original ordering.

* `bib_entries` and `ref_entries` are `Dict`-valued fields representing extracted entities that can be referenced (inline) within the text.

#### example 1

One example paragraph in `abstract` or `body_text` might look like:

```python
{
    "section": "Introduction",
    "text": "Dogs are happier cats [13, 15]. See Figure 3 for a diagram.",
    "cite_spans": [
        {"start": 22, "end": 25, "text": "[13", "ref_id": "BIBREF11"},
        {"start": 27, "end": 30, "text": "15]", "ref_id": "BIBREF30"},
        ...
    ],
    "ref_spans": [
        {"start": 36, "end": 44, "text": "Figure 3", "ref_id": "FIGREF2"},
    ]
}
```

and example `bib_entries` and `ref_entries` might look like:

```python
{
    ...,
    "BIBREF11": {
        "title": "Do dogs dream of electric humans?",
        "authors": [
            {"first": "Lucy", "middle": ["Lu"], "last": "Wang", "suffix": ""}, 
            {"first": "Mark", "middle": [], "last": "Neumann", "suffix": "V"}
        ],
        "year": "", 
        "venue": "barXiv",
        "link": null
    },
    ...
}
```

```python
{
    "TABREF4": {
        "text": "Table 5. Clearly, we achieve SOTA here or something.",
        "type": "table"
    }
    ...,
    "FIGREF2": {
        "text": "Figure 3. This is the caption of a pretty figure.",
        "type": "figure"
    },
    ...
}
```

Notice: 

* Inline `spans` are represented by character start and end indices into the paragraph `text`
* `spans` resolve to `BIBREF`, `TABREF` or `FIGREF` entries.
* `BIBREF` are IDs of bibliographic elements of `bib_entries`.  Bib entries may be missing fields (e.g. `year`).  They can be linked to S2ORC papers, as specified by `link`, but we also preserve any unlinked entries by setting `link` to `null`.
* `FIGREF` and `TABREF` are IDs of figure and table elements of `ref_entries`.  Ref entries contain the caption text of the corresponding object, and also indicate the type of object.


#### example 2

You may see empty `pdf_parses/` JSONs that look like: 

```python
{
    "paper_id": "...", 
    "_pdf_hash": "...", 
    "abstract": [], 
    "body_text": [], 
    "bib_entries": {}, 
    "ref_entries": {}
}
```

We keep these around for our internal usage, but the way to interpret these is that there is no usable PDF parse here, despite the corresponding `metadata/` entry still displaying `has_pdf_parse = true`.

These exist when (i) `bib_entries` does not successfully parse *and* (ii) the paper is not open-access, so we had to remove `abstract`, `body_text`, and `ref_entries`.   



##### pdf_parses_CLASS
```python
{
 "paper_id": (string), 
 "_pdf_hash": (string-number), 
 "abstract": [
     {
         "section": (string) "Abstract", 
         "text": (string), 
         "cite_spans": [
             {
                 "start": (int), 
                 "end": (int), 
                 "text": (string-number) "[4, 
                 "ref_id": (string)
             }
           ]: **CiteSpan_Class**, 
         "ref_spans": []
     },
     ...
 ]: **TextSection_Class**, 
 "body_text": [], 
 "bib_entries": 
     {
         "BIBREF0": 
             {
              "title": (string), 
              "authors": [
                  {
                      "first": (string), 
                      "middle": [], 
                      "last": (string), 
                      "suffix": (string)
                   }
                 ], 
               "year": (int), 
               "venue": (string), 
               "link": (string-number)
              }, 
          "BIBREF1": 
              {
                  ...
              }
       }: **BIBREF_Class**, 
 "ref_entries": {}
}
```

Here I represent `TextSection_Class` as an object of 
```python
{
 "section": (string), 
 "text": (string), 
 "cite_spans": [
     {
         "start": (int), 
         "end": (int), 
         "text": (string-number) "[4, 
         "ref_id": (string)
     }
   ], 
 "ref_spans": []
}
```
where `CiteSpan_Class` itself is another structured object:
```python
{
 "start": (int), 
 "end": (int), 
 "text": (string-number), 
 "ref_id": (string)
}
```
and `BIBREF_Class` as a dictionary field with `BIBREF_#` as key and, as its value, an object as follows:
```python
"BIBREF_#": 
 {
  "title": (string), 
  "authors": [
      {
          "first": (string), 
          "middle": [] (list of string),
          "last": (string), 
          "suffix": (string)
       }
     ], 
   "year": (int), 
   "venue": (string), 
   "link": null
  }
```

## 2.3 Title Abstract - Full text  (s2orc)
---
We have loaded the `S2ORC` dataset and created our (one chunk) dataset parses; we now want to start creating our dataset objects (Classes and Loaders).

Let's start with the datasets.

### Dataset creation
We want to create the datasets for papers' title-abstract and fulltext-(title-abstract) generation. 
> We would also like to create a KeyPhrase dataset; we are currently waiting for a response from the `S2ORC` authors to understand where we could obtain the keyphrases/keywords.

In order to do this, we want to create the two datasets (saving them as `jsonl` files).
We can organize the data folder as :
```bash
- data/
    # keyphrase dataset 
    - keyphrase/
        # (title - abstract - fulltext - keyphrase)
        - s2orc/
            - README.md
            - chuncks_dataset_idx.json
            - train/
                - train_0.jsonl
                - train_1.jsonl
                - ...
            - test/
                - test_0.jsonl
                - test_1.jsonl
                - ...
            - val/
                - val_0.jsonl
                - val_1.jsonl
                - ...
    
    # sts datasets
    - sts/ 
        # (title - abstract - cosine_similarity)
        - s2orc_partial/
            - README.md
            - chuncks_dataset_idx.json
            - train/
                - train_0.jsonl
                - train_1.jsonl
                - ...
            - test/
                - test_0.jsonl
                - test_1.jsonl
                - ...
            - val/
                - val_0.jsonl
                - val_1.jsonl
                - ...
                
        # (title - abstract - fulltext - cosine_similarity)
        - s2orc_full/
            - README.md
            - chuncks_dataset_idx.json
            - train/
                - train_0.jsonl
                - train_1.jsonl
                - ...
            - test/
                - test_0.jsonl
                - test_1.jsonl
                - ...
            - val/
                - val_0.jsonl
                - val_1.jsonl
                - ...
```
and in the `chuncks_dataset_idx.json` there is the dictionary that maps the `chuncks` (`metadata_{id}.jsonl, pdf_parses_{id}.jsonl for id in range(99)`) into the {train|test|validation}_{id}.

A first step towards no longer using chunks (neither metadata nor fulltext) is to summarize the data we want into a new Python structure (dict) as follows, and save it:

```python
{
    "paper_id": (string-int), 
    "title":  (string),
    "abstract": (string), 
    "fulltext": (string), 
    "keywords": List[string],
}
```

1. Get the training/validation dataset by extracting Title-Abstract from the `S2ORC` dataset, and get the testing data from the `KeyPhrase` (*'inspec', 'krapivin', 'nus', 'semeval', 'kp20k', 'duc', 'stackexchange'*) datasets. We should have a pair of sentences (typically a *title* and an *abstract*), and possibly a *fulltext* and a *keywords* field. The two sentences of a pair can be:

    - completely related (abstract and its corresponding title)
    - somewhat related (abstract and a field-keyphrase related title {cs+(deep learning; metric learning; nlp; sts;)})
    - unrelated but not far away (abstract and a field-**not**keyphrase related title {cs+(nlp; transformer;)-vs-(cv; attention)})
    - completely unrelated (abstract and title are field-keyphrase unrelated {cs+a -vs- phy+z})



2. **🤗transformers**: as we can see [here](https://huggingface.co/docs/datasets/loading_datasets.html#json-files), the dataset loader (from `jsonl` files) can be used to load the train/validation datasets. As we have already loaded the dataset as a dictionary (it is called `multichunks_lists` now, depending on how many chunks we need to load in one shot), we could also use the example [here](https://huggingface.co/docs/datasets/loading_datasets.html#from-a-python-dictionary) to load the dataset from an existing dictionary.


1. **sentence-transformer**, [sBERT example for train](https://www.sbert.net/docs/training/overview.html#loss-functions) 

2. **🤗transformers**: as we can see [here](https://huggingface.co/docs/datasets/loading_datasets.html#json-files), the dataset loader (from `jsonl` files) can be used to load the train/validation datasets. As we have already loaded the dataset as a dictionary (it is called `multichunks_lists` now, depending on how many chunks we need to load in one shot), we could also use the example [here](https://huggingface.co/docs/datasets/loading_datasets.html#from-a-python-dictionary) to load the dataset from an existing dictionary.

In [None]:
import torch

# TADataset stands for TitleAbstractDataset
class TADataset(torch.utils.data.Dataset):
    """Map-style dataset pairing column-wise encodings with per-sample labels.

    `encodings` is a mapping from field name to an indexable column of
    per-sample values; `labels` is an indexable sequence of per-sample labels.
    The dataset length is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors: one per encoding field, plus 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Collect the citation contexts of the paper described by `metadata_dict` and
# append them to `citation_contexts`.
# The two "skip this paper" filters are expressed as a guard on the extraction
# below rather than with `continue`: this cell is not inside a loop, and a
# `continue` outside a loop is a SyntaxError.
paper_id = metadata_dict['paper_id']
print(f"Currently viewing S2ORC paper: {paper_id}")

# suppose we only care about ACL anthology papers,
# and we want only papers with resolved outbound citations
# (`and` short-circuits, so 'has_outbound_citations' is only read for ACL papers)
keep_paper = bool(metadata_dict['acl_id']) and bool(metadata_dict['has_outbound_citations'])

# get citation context (paragraphs)!
if keep_paper and paper_id in paper_id_to_pdf_parse:
    # (1) get the full pdf parse from the previously computed lookup dict
    pdf_parse = paper_id_to_pdf_parse[paper_id]

    # (2) pull out fields we need from the pdf parse, including bibliography & text
    bib_entries = pdf_parse['bib_entries']
    paragraphs = pdf_parse['abstract'] + pdf_parse['body_text']

    # (3) loop over paragraphs, grabbing citation contexts
    for paragraph in paragraphs:

        # (4) loop over each inline citation in this paragraph
        for cite_span in paragraph['cite_spans']:

            # (5) each inline citation can be resolved to a bib entry
            cited_bib_entry = bib_entries[cite_span['ref_id']]

            # (6) that bib entry *may* be linked to a S2ORC paper.  if so, grab paragraph
            linked_paper_id = cited_bib_entry['link']
            if linked_paper_id:
                citation_contexts.append({
                    'citing_paper_id': paper_id,
                    'cited_paper_id': linked_paper_id,
                    'context': paragraph['text'],
                    'citation_mention_start': cite_span['start'],
                    'citation_mention_end': cite_span['end'],
                })

# 3. Computing Word Embeddings: `Continuous Bag-of-Words`

The Continuous Bag-of-Words model (CBOW) is frequently used in NLP deep learning. It is a model that tries to predict words given the context of a few words before and a few words after the target word. This is distinct from language modeling, since CBOW is not sequential and does not have to be probabilistic. Typically, CBOW is used to quickly train word embeddings, and these embeddings are used to initialize the embeddings of some more complicated model. Usually, this is referred to as pretraining embeddings. It almost always helps performance a couple of percent.

The CBOW model is as follows. Given a target word $w_i$ and an $N$ context window on each side, $w_{i-1}, \dots, w_{i-N}$ and $w_{i+1}, \dots, w_{i+N}$, referring to all context words collectively as $C$, CBOW tries to minimize:


$$ -\log p(w_i \mid C) = -\log \operatorname{Softmax}\left( A \left( \sum_{w \in C} q_w \right) + b \right) $$

where $q_w$ is the embedding for word $w$.

In [None]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

# Enumerate the vocabulary in sorted order: the iteration order of a set of
# strings changes between interpreter runs (hash randomization), which would
# give every word a different index after each kernel restart.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

# Build (context, target) pairs. The window is driven by CONTEXT_SIZE: each
# context is the CONTEXT_SIZE words before the target followed by the
# CONTEXT_SIZE words after it; positions without a full window are skipped.
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = raw_text[i - CONTEXT_SIZE:i] + raw_text[i + 1:i + CONTEXT_SIZE + 1]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])


class CBOW(nn.Module):
    """Skeleton of the Continuous Bag-of-Words model (exercise placeholder).

    No layers are defined yet and `forward` is not implemented: it returns
    None for any input.
    """

    def __init__(self):
        # nn.Module sets up its internal registries (parameters, submodules,
        # hooks) in its own __init__; without this call the module cannot be
        # called and layers cannot be assigned to it.
        super().__init__()

    def forward(self, inputs):
        """Placeholder forward pass; not implemented, returns None."""
        pass

# create your model and train.  here are some functions to help you make
# the data ready for use by your module


def make_context_vector(context, word_to_ix):
    """Turn a list of context words into a 1-D ``torch.long`` tensor of indices.

    Each word of `context` is looked up in `word_to_ix`; the indices keep the
    order of `context`. A word missing from `word_to_ix` raises KeyError.
    """
    indices = []
    for word in context:
        indices.append(word_to_ix[word])
    return torch.tensor(indices, dtype=torch.long)


# Example: index tensor for the context words of the first (context, target) pair in `data`.
make_context_vector(data[0][0], word_to_ix)  # example