# Tutorial 6: Creating a Corpus



### Reading A Sequence Labeling Dataset


In [2]:
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher

# define columns
columns = {0: 'text', 1: 'pos', 2: 'chunk'}

# this is the folder in which train, test and dev files reside
data_folder = '/home/wohlg/nltk_data/corpora/conll2000/'

# retrieve corpus using column format, data folder and the names of the train, dev and test files
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns,
                                                              train_file='train.txt',
                                                              test_file='test.txt')
                                                              # dev_file='dev.txt')
    


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-04-22 10:44:25,212 Reading data from /home/wohlg/nltk_data/corpora/conll2000
2019-04-22 10:44:25,213 Train: /home/wohlg/nltk_data/corpora/conll2000/train.txt
2019-04-22 10:44:25,213 Dev: None
2019-04-22 10:44:25,214 Test: /home/wohlg/nltk_data/corpora/conll2000/test.txt


In [2]:
print(len(corpus.train))
print(len(corpus.test))

print(corpus.train[0].to_tagged_string('pos'))
print(corpus.train[0].to_tagged_string('chunk'))


8042
2012
Confidence <NN> in <IN> the <DT> pound <NN> is <VBZ> widely <RB> expected <VBN> to <TO> take <VB> another <DT> sharp <JJ> dive <NN> if <IN> trade <NN> figures <NNS> for <IN> September <NNP> , <,> due <JJ> for <IN> release <NN> tomorrow <NN> , <,> fail <VB> to <TO> show <VB> a <DT> substantial <JJ> improvement <NN> from <IN> July <NNP> and <CC> August <NNP> 's <POS> near-record <JJ> deficits <NNS> . <.>
Confidence <B-NP> in <B-PP> the <B-NP> pound <I-NP> is <B-VP> widely <I-VP> expected <I-VP> to <I-VP> take <I-VP> another <B-NP> sharp <I-NP> dive <I-NP> if <B-SBAR> trade <B-NP> figures <I-NP> for <B-PP> September <B-NP> , due <B-ADJP> for <B-PP> release <B-NP> tomorrow <B-NP> , fail <B-VP> to <I-VP> show <I-VP> a <B-NP> substantial <I-NP> improvement <I-NP> from <B-PP> July <B-NP> and <I-NP> August <I-NP> 's <B-NP> near-record <I-NP> deficits <I-NP> .


## Reading a Text Classification Dataset

In [6]:
from flair.data_fetcher import NLPTaskDataFetcher
from pathlib import Path

# use your own data path
data_folder = Path('/home/wohlg/itmo/misc/cooking_classification/simple')

# load corpus containing training, test and dev data
corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(data_folder,
                                                                     test_file='cooking.test',
                                                                     dev_file='cooking.valid',
                                                                     train_file='cooking.train')
    
print('Done loading')

2019-04-20 19:28:58,895 Reading data from /home/wohlg/itmo/misc/cooking_classification/simple
2019-04-20 19:28:58,896 Train: /home/wohlg/itmo/misc/cooking_classification/simple/cooking.train
2019-04-20 19:28:58,896 Dev: /home/wohlg/itmo/misc/cooking_classification/simple/cooking.valid
2019-04-20 19:28:58,897 Test: /home/wohlg/itmo/misc/cooking_classification/simple/cooking.test
Done loading


### corpus from one file

In [5]:
cl_corp = "/home/wohlg/misc/text2class/text2class_train.txt"
NLPTaskDataFetcher.read_text_classification_file(cl_corp)
print('Done')


KeyboardInterrupt: 

### loading included corpora

In [4]:
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask

corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB)
print('Done')

2019-04-22 10:44:41,999 Reading data from /home/wohlg/.flair/datasets/imdb
2019-04-22 10:44:42,000 Train: /home/wohlg/.flair/datasets/imdb/train.txt
2019-04-22 10:44:42,000 Dev: None
2019-04-22 10:44:42,001 Test: /home/wohlg/.flair/datasets/imdb/test.txt


KeyboardInterrupt: 

In [6]:
corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)
print('Done')

2019-04-22 10:48:27,978 Reading data from /home/wohlg/.flair/datasets/ud_english
2019-04-22 10:48:27,980 Train: /home/wohlg/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2019-04-22 10:48:27,981 Dev: /home/wohlg/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2019-04-22 10:48:27,982 Test: /home/wohlg/.flair/datasets/ud_english/en_ewt-ud-test.conllu
Done


In [10]:
print(corpus.obtain_statistics())


{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 12543,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 204585,
            "min": 1,
            "max": 159,
            "avg": 16.310691222195647
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 2077,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 25096,
            "min": 1,
            "max": 81,
            "avg": 12.082811747713048
        }
    },
    "DEV": {
        "dataset": "DEV",
        "total_number_of_documents": 2002,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 25148,
            "min": 1,
            "max": 75,
            "avg": 12.561438561438562
        }
    }
}


In [11]:
corpus = corpus.downsample(0.3)
print(corpus.obtain_statistics())


{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 1129,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 18314,
            "min": 1,
            "max": 83,
            "avg": 16.221434898139947
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 188,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 2397,
            "min": 1,
            "max": 54,
            "avg": 12.75
        }
    },
    "DEV": {
        "dataset": "DEV",
        "total_number_of_documents": 181,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 2283,
            "min": 1,
            "max": 56,
            "avg": 12.613259668508288
        }
    }
}
