# Tutorial 6: Creating a Corpus



### Reading a Sequence Labeling Dataset


In [1]:
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher

# Map each column index of the data files to the annotation layer it holds.
columns = {0: 'text', 1: 'pos', 2: 'chunk'}

# Directory in which the train and test files reside.
data_folder = '/home/wohlg/nltk_data/corpora/conll2000/'

# Build the corpus from the column format, the data folder and the split file names.
# No dev file is passed here; add dev_file='dev.txt' if your data has one.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file='train.txt',
    test_file='test.txt',
)
    


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-04-29 18:39:07,359 Reading data from /home/wohlg/nltk_data/corpora/conll2000
2019-04-29 18:39:07,360 Train: /home/wohlg/nltk_data/corpora/conll2000/train.txt
2019-04-29 18:39:07,361 Dev: None
2019-04-29 18:39:07,362 Test: /home/wohlg/nltk_data/corpora/conll2000/test.txt


### Now we have a `TaggedCorpus` object that contains the train and test splits, each of which is a *list of `Sentence` objects*.

In [2]:
# Number of sentences in the train split, then in the test split.
for split in (corpus.train, corpus.test):
    print(len(split))

# Render the first training sentence once per annotation layer.
first_sentence = corpus.train[0]
for tag_type in ('pos', 'chunk'):
    print(first_sentence.to_tagged_string(tag_type))


8042
2012
Confidence <NN> in <IN> the <DT> pound <NN> is <VBZ> widely <RB> expected <VBN> to <TO> take <VB> another <DT> sharp <JJ> dive <NN> if <IN> trade <NN> figures <NNS> for <IN> September <NNP> , <,> due <JJ> for <IN> release <NN> tomorrow <NN> , <,> fail <VB> to <TO> show <VB> a <DT> substantial <JJ> improvement <NN> from <IN> July <NNP> and <CC> August <NNP> 's <POS> near-record <JJ> deficits <NNS> . <.>
Confidence <B-NP> in <B-PP> the <B-NP> pound <I-NP> is <B-VP> widely <I-VP> expected <I-VP> to <I-VP> take <I-VP> another <B-NP> sharp <I-NP> dive <I-NP> if <B-SBAR> trade <B-NP> figures <I-NP> for <B-PP> September <B-NP> , due <B-ADJP> for <B-PP> release <B-NP> tomorrow <B-NP> , fail <B-VP> to <I-VP> show <I-VP> a <B-NP> substantial <I-NP> improvement <I-NP> from <B-PP> July <B-NP> and <I-NP> August <I-NP> 's <B-NP> near-record <I-NP> deficits <I-NP> .


## Reading a Text Classification Dataset

In [16]:
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from pathlib import Path

# Replace this with the folder that holds your own classification files.
data_folder = Path('/home/wohlg/itmo/misc/cooking_classification/simple_and_preprocessed')

# File name of each split, handed to the loader as keyword arguments.
split_files = dict(
    train_file='cooking.train',
    dev_file='cooking.valid',
    test_file='cooking.test',
)

# Load the corpus with its training, dev and test data.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(data_folder, **split_files)

print('Done loading')
print(corpus.obtain_statistics())




2019-04-29 18:47:47,062 Reading data from /home/wohlg/itmo/misc/cooking_classification/simple_and_preprocessed
2019-04-29 18:47:47,063 Train: /home/wohlg/itmo/misc/cooking_classification/simple_and_preprocessed/cooking.train
2019-04-29 18:47:47,064 Dev: /home/wohlg/itmo/misc/cooking_classification/simple_and_preprocessed/cooking.valid
2019-04-29 18:47:47,065 Test: /home/wohlg/itmo/misc/cooking_classification/simple_and_preprocessed/cooking.test
Done loading
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 7502,
        "number_of_documents_per_class": {
            "sauce": 327,
            "cheese": 227,
            "food-safety": 943,
            "storage-method": 359,
            "equipment": 649,
            "bread": 564,
            "baking": 1133,
            "substitutions": 710,
            "chocolate": 227,
            "oven": 223,
            "storage-lifetime": 252,
            "cake": 309,
            "flavor": 290,
            "beef": 190,


### Download and use a built-in corpus

In [19]:
# Fetch the built-in IMDB corpus and downsample it with a factor of 0.1.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB).downsample(0.1)
# Print the loaded corpus instance (not the TaggedCorpus class itself) so the
# summary of train/dev/test sizes is shown.
print(corpus)

2019-04-29 18:57:23,394 Reading data from /home/wohlg/.flair/datasets/imdb
2019-04-29 18:57:23,397 Train: /home/wohlg/.flair/datasets/imdb/train.txt
2019-04-29 18:57:23,399 Dev: None
2019-04-29 18:57:23,404 Test: /home/wohlg/.flair/datasets/imdb/test.txt
TaggedCorpus: 2250 train + 250 dev + 2500 test sentences


### Corpus from one file

### Loading included corpora

In [1]:
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
# Select the UD_ENGLISH task and let the fetcher load its corpus.
ud_task = NLPTask.UD_ENGLISH
corpus = NLPTaskDataFetcher.load_corpus(ud_task)
print('Done')

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-04-30 08:17:08,462 Reading data from /home/wohlg/.flair/datasets/ud_english
2019-04-30 08:17:08,462 Train: /home/wohlg/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2019-04-30 08:17:08,463 Dev: /home/wohlg/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2019-04-30 08:17:08,463 Test: /home/wohlg/.flair/datasets/ud_english/en_ewt-ud-test.conllu
Done


In [2]:
# Collect the per-split statistics of the corpus, then display them.
corpus_stats = corpus.obtain_statistics()
print(corpus_stats)


{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 12543,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 204585,
            "min": 1,
            "max": 159,
            "avg": 16.310691222195647
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 2077,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 25096,
            "min": 1,
            "max": 81,
            "avg": 12.082811747713048
        }
    },
    "DEV": {
        "dataset": "DEV",
        "total_number_of_documents": 2002,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 25148,
            "min": 1,
            "max": 75,
            "avg": 12.561438561438562
        }
    }
}


In [3]:
# Fraction handed to downsample(); the statistics printed below show the effect.
downsample_fraction = 0.3
corpus = corpus.downsample(downsample_fraction)

# Statistics of the downsampled corpus.
print(corpus.obtain_statistics())


{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 3763,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 61173,
            "min": 1,
            "max": 135,
            "avg": 16.25644432633537
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 624,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 7759,
            "min": 1,
            "max": 75,
            "avg": 12.434294871794872
        }
    },
    "DEV": {
        "dataset": "DEV",
        "total_number_of_documents": 601,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 7708,
            "min": 1,
            "max": 65,
            "avg": 12.825291181364392
        }
    }
}
