In [6]:
import pandas as pd
import numpy as np

In [43]:
!python notebook2script.py data.ipynb

Converted data.ipynb to exp/nb_data.py


# Read Single File

In [10]:
#export
def readfile(filename):
    '''  read file to (sentence,tag) pairs '''
    f = open(filename)
    data, sentence, label = [], [], []
    for line in f:
        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
            if len(sentence) > 0:
                data.append((sentence, label))
                sentence, label = [], []
            continue
        splits = line.split(' ')
        sentence.append(splits[0])
        label.append(splits[-1][:-1])

    if len(sentence) > 0:
        data.append((sentence, label))
    return data

In [14]:
data = readfile('NER_datasets/CONLL2003/train.txt')
data[0]

(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'])

In [3]:
#export
def readdf(filename):
    ''' read file to dataframe '''
    f = open(filename)
    data, sentence, label = [], [], []
    sentence_idx = 0
    for line in f:
        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
            if len(sentence) > 0:
                for word, tag in zip(sentence, label):
                    data.append( (word, tag, sentence_idx) )
                sentence_idx += 1
                sentence, label = [], []
            continue
        splits = line.split(' ')
        sentence.append(splits[0])
        label.append(splits[-1][:-1])

    if len(sentence) > 0:
        data.append((sentence, label))
    return pd.DataFrame(data, columns=['word', 'tag', 'sentence_idx'])

In [13]:
data = readdf('NER_datasets/CONLL2003/train.txt')
data

Unnamed: 0,word,tag,sentence_idx
0,EU,B-ORG,0.0
1,rejects,O,0.0
2,German,B-MISC,0.0
3,call,O,0.0
4,to,O,0.0
...,...,...,...
203613,Preston,B-ORG,14038.0
203614,1,O,14038.0
203615,Division,O,14039.0
203616,three,O,14039.0


In [38]:
#export
def readdfsentences(filename):
    df = readdf(filename)
    
    agg_func = lambda s: ' '.join(s["word"].values)
    sentences = df.groupby("sentence_idx").apply(agg_func)
    agg_func = lambda s: ' '.join(s["tag"].values)
    labels = df.groupby("sentence_idx").apply(agg_func)
    
    return pd.concat([sentences, labels], axis=1)

In [41]:
data = readdfsentences('NER_datasets/CONLL2003/train.txt')
data

Unnamed: 0_level_0,0,1
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,EU rejects German call to boycott British lamb .,B-ORG O B-MISC O O O B-MISC O O
1.0,Peter Blackburn,B-PER I-PER
2.0,BRUSSELS 1996-08-22,B-LOC O
3.0,The European Commission said on Thursday it di...,O B-ORG I-ORG O O O O O O B-MISC O O O O O B-M...
4.0,Germany 's representative to the European Unio...,B-LOC O O O O B-ORG I-ORG O O O B-PER I-PER O ...
...,...,...
14035.0,Results of English league matches,O O B-MISC O O
14036.0,on Friday :,O O O
14037.0,Division two,O O
14038.0,Plymouth 2 Preston 1,B-ORG O B-ORG O


In [None]:
#         tokens      sentences
#     -----------------------------------
# dev:    51361       3250
# test:   46434       3453
# train:  203612      14035
#     -----------------------------------
# all:    301407      20738


#         dev     test    train   all
#     --------------------------------
# PER:    1842    1617    6600    10059
# LOC:    1837    1668    7140    10645
# ORG:    1341    1661    6321    9323
# MISC:   922     702     3438    5062
#     --------------------------------
# ALL:    5942    5648    23499   35089

# Read Train, Valid, Test

In [1]:
data = {
    'test': 'NER_datasets/CONLL2003/test.txt',
    'valid': 'NER_datasets/CONLL2003/valid.txt',
    'train': 'NER_datasets/CONLL2003/train.txt'
}

# Plots