In [1]:
from data_utils import *
import os

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


# Usage 

In [2]:
# Root directory that holds "aggregated_corpus.json" and receives the generated
# tokenization folders. To point the notebook at another location (for example
# a local "../data/" checkout) set the DS_DATA_PATH environment variable instead
# of editing this cell; when it is unset or empty the shared default is used.
# os.path.join(..., "") guarantees a trailing separator, because the cells below
# build file paths by plain string concatenation (PATH_DATA + "file").
PATH_DATA = os.path.join(
    os.environ.get("DS_DATA_PATH")
    or "/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/",
    "",
)

## Loading corpus

In [3]:
%%time
corpus = load_corpus_from_json(PATH_DATA+"aggregated_corpus.json"
                               , **{"document_maker":nltk_tokenizer_pipe
                                    , "annotated":True
                                    , "label_name":"label"
                                    , "label_pos":"DS"
                                    , "label_neg":"O"
                                    , "annotation_scheme":None
                                   }
                              )

print("Loaded corpus of {nb_files} files.".format(nb_files=len(corpus)))

Loaded corpus of 86 files.
CPU times: user 45.4 s, sys: 522 ms, total: 45.9 s
Wall time: 45.6 s


In [4]:
%%time
# By default, this method splits the subcorpora into the predefined <main> and
# <ood> parts; the two results are unpacked here as main_corpus and corpus_ood.
main_corpus, corpus_ood = corpus.split_corpus()

CPU times: user 172 µs, sys: 0 ns, total: 172 µs
Wall time: 179 µs


### Files

For each file in the corpus, the direct-speech (DS) labels are stored as character-level spans, i.e. `(start, end)` offsets into the file's text. See below:

In [5]:
# Pick one file of the main corpus to illustrate how labels are stored.
example_file = main_corpus["Pauline"]

# Show the text covered by the first three labelled spans; each span provides
# the start offset at index 0 and the end offset at index 1.
for i, span in enumerate(example_file.char_spans_labels[:3]):
    print("---\nDS segment number {}:\n".format(i))
    segment_bounds = slice(span[0], span[1])
    print(example_file.text[segment_bounds])

---
DS segment number 0:

— Cela sera-t-il bien long ?
---
DS segment number 1:

— C'est l'affaire d'une heure,
---
DS segment number 2:

nous allons nous mettre tout de suite à manger l'avoine.


The text is tokenized and a `DataFrame` containing word-tokens and their corresponding labels is stored for each file:

In [6]:
# Display the per-token DataFrame of the example file (one row per word-token,
# with its label); the notebook only renders the first and last rows.
example_file.df_tokens

Unnamed: 0,token,sentstart,token_idx,label
0,George,yes,"(0, 6)",O
1,Sand,no,"(7, 11)",O
2,Nouvelle,no,"(12, 20)",O
3,ÉDition,no,"(21, 28)",O
4,Paris,no,"(29, 34)",O
...,...,...,...,...
12913,de,no,"(59822, 59824)",O
12914,toutes,no,"(59825, 59831)",O
12915,ses,no,"(59832, 59835)",O
12916,forces,no,"(59836, 59842)",O


### Corpus

The corpus can then be split into Train/Val/Test sets, and the corresponding `DataFrame`s merged so that ML models can be trained on these splits. `CorpusDict`'s method `merge_dfs_by_keys` returns a `dict` of merged `DataFrame`s, one per requested value of a given key (below, the `split` key with the values `train`, `val` and `test`). The name of the source file is added as a column of the table, and "EOF" tags separate the different files aggregated in each `DataFrame`.

See below the resulting tables:

In [7]:
# Merge the per-file DataFrames of the main corpus, grouped by the value of
# their "split" key; the result is indexed by split name.
main_df_splits = main_corpus.merge_dfs_by_keys(
    key="split",
    values=["train", "val", "test"],
)

In [8]:
# Display the merged DataFrame of the "train" split (note the extra "file" column).
main_df_splits["train"]

Unnamed: 0,token,sentstart,token_idx,label,file
0,Madame,yes,"(0, 6)",O,Madame_de_Hautefort
1,de,no,"(7, 9)",O,Madame_de_Hautefort
2,Hautefort,no,"(10, 19)",O,Madame_de_Hautefort
3,\n,no,"(19, 20)",O,Madame_de_Hautefort
4,Voici,no,"(20, 25)",O,Madame_de_Hautefort
...,...,...,...,...,...
47433,fera,no,"(211563, 211567)",DS,cousinebette_cecile
47434,faire,no,"(211568, 211573)",DS,cousinebette_cecile
47435,fortune,no,"(211574, 211581)",DS,cousinebette_cecile
47436,.,no,"(211581, 211582)",DS,cousinebette_cecile


## Generating `.tsv` files

In [9]:
# Generating files using the custom RegEx NLTK document tokenization.
# The resulting .tsv files are written under PATH_DATA + "nltk_tokenization/".

generate_tokens_df(
    raw_data_path=PATH_DATA+"aggregated_corpus.json",
    output_dir=PATH_DATA+"nltk_tokenization/",
    document_maker=nltk_tokenizer_pipe,
    annotated=True,
    label_name="label",
    label_pos="DS",
    label_neg="O",
    annotation_scheme=None,
)

Loaded corpus of 86 files.
Saving Dataframe with columns:
 ['token', 'sentstart', 'token_idx', 'label', 'file']
 composed of 353695 rows at:
/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/nltk_tokenization/main_corpus/train.tsv

Saving Dataframe with columns:
 ['token', 'sentstart', 'token_idx', 'label', 'file']
 composed of 71745 rows at:
/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/nltk_tokenization/main_corpus/val.tsv

Saving Dataframe with columns:
 ['token', 'sentstart', 'token_idx', 'label', 'file']
 composed of 62831 rows at:
/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/nltk_tokenization/main_corpus/test.tsv

Saving Dataframe with columns:
 []
 composed of 0 rows at:
/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/nltk_tokenization/ood_corpus/train.tsv

Saving Dataframe with columns:
 []
 composed of 0 rows at:
/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/nltk_tokenization/ood_corpus/val.tsv

Saving 

In [10]:
# Generating files using the SpaCy document tokenization.
# The resulting .tsv files are written under PATH_DATA + "spacy_tokenization/".

generate_tokens_df(
    raw_data_path=PATH_DATA+"aggregated_corpus.json",
    output_dir=PATH_DATA+"spacy_tokenization/",
    document_maker=spacy_nlp,
    annotated=True,
    label_name="label",
    label_pos="DS",
    label_neg="O",
    annotation_scheme=None,
)

Loaded corpus of 86 files.
Saving Dataframe with columns:
 ['token', 'sentstart', 'token_idx', 'label', 'file']
 composed of 333675 rows at:
/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/spacy_tokenization/main_corpus/train.tsv

Saving Dataframe with columns:
 ['token', 'sentstart', 'token_idx', 'label', 'file']
 composed of 67574 rows at:
/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/spacy_tokenization/main_corpus/val.tsv

Saving Dataframe with columns:
 ['token', 'sentstart', 'token_idx', 'label', 'file']
 composed of 59022 rows at:
/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/spacy_tokenization/main_corpus/test.tsv

Saving Dataframe with columns:
 []
 composed of 0 rows at:
/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/spacy_tokenization/ood_corpus/train.tsv

Saving Dataframe with columns:
 []
 composed of 0 rows at:
/src/data/nfs/analysis/NLP/audiobooks-nlu/direct-speech/final/spacy_tokenization/ood_corpus/val.tsv

Sa