In [22]:
import pandas as pd
import os
from os import listdir
import csv
from os.path import isfile, join

In [53]:
RANDOM_STATE=12

In [24]:
def get_subdirectories(folder_path):
    return [f.path for f in os.scandir(folder_path) if f.is_dir()]

def get_files_in_folder(folder_path):
    return [f for f in listdir(folder_path) if isfile(join(folder_path, f))]

In [25]:
# Data downloaded from https://github.com/nytud/NYTK-NerKor
# https://link.springer.com/chapter/10.1007/978-3-030-83527-9_19

base_path = "/Users/attilanagy/Personal/NYTK-NerKor/data/genres"

## Load data and convert it to a nicer format

In [26]:
def load_conll_file(path):
    # Return two lists:
        # - the first is the list of token lists
        # - the second is the list of token annotation lists 
    sentence_list = []
    sentence_label_list = []
    with open(path) as f:
        tokens = []
        labels = []
        for line in f:
            splitted = line.split('\t')
            # This conll format should have 6 columns, 
            # we check for the newlines here separating the sentences
            if line.startswith('#'):
                continue
                
            elif len(splitted) != 6:
                sentence_list.append(tokens)
                sentence_label_list.append(labels)
                tokens = []
                labels = []
                continue
                
            else:
                splitted = line.split('\t')
                token_surface_form = splitted[0]
                token_label = splitted[-1]
                tokens.append(token_surface_form.strip())
                labels.append(token_label.strip())
                
    return sentence_list, sentence_label_list
                    

In [27]:
sents, labels = load_conll_file('/Users/attilanagy/Personal/NYTK-NerKor/data/genres/wikipedia/no-morph/huwiki_200_18.conllup')

In [28]:
# Sanity check that reading a file works correctly
for i in range(5):
    print(f"{sents[i]} \n {labels[i]}")
    print(len(sents[i]), len(labels[i]))
    print("----------------------------")

['Csáktornyától', '12', 'km-re', 'keletre', 'fekszik', '.'] 
 ['B-LOC', 'O', 'O', 'O', 'O', 'O']
6 6
----------------------------
['Az', 'Intermission', 'az', 'amerikai', 'Dio', 'heavy', 'metal', 'zenekar', 'első', 'koncertlemeze', '.'] 
 ['O', 'B-MISC', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O']
11 11
----------------------------
['Dio', 'csalódottságában', 'nevezte', 'el', 'az', 'albumot', 'Intermission-nek', '(', 'felvonásköz', ',', 'szünet', ')', '.'] 
 ['B-PER', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O']
13 13
----------------------------
['1896', 'októberében', 'Egerben', 'járt', ',', 's', 'barátai', 'ösztönzésére', '1897', '.'] 
 ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
10 10
----------------------------
['A', 'helyszűke', 'hamarosan', 'nyilvánvalónak', 'bizonyult', ',', 'emellett', 'Gárdonyit', 'zavarta', 'munkájában', 'az', 'utcáról', 'beszűrődő', 'zaj', '.'] 
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 

In [29]:
def load_data(base_path):
    # load data by genre
    # Returns:
        # Map of genre_name -> dataset, where dataset is:
        # tuple with sentence lists and token annotation lists
    data = {}
    
    genre_paths = get_subdirectories(base_path)
    for genre_path in genre_paths:
        genre_name = genre_path.split('/')[-1]
        if genre_name not in data:
            sentence_list = []
            sentence_label_list = []
            data[genre_name] = []
            
        subdirs = get_subdirectories(genre_path)
        # there is both morph and no-morph version of the data
        # take no-morph
        # the folders are pretty inconsistent
        if len(subdirs) == 2:
            path = list(filter(lambda x: x.endswith('no-morph'), subdirs))[0]
            
        # there is only a single subfolder (morph or no-morph)
        else:
            path = subdirs[0]
            
        files = get_files_in_folder(path)
        file_paths = [os.path.join(path, file) for file in files]
        for file_path in file_paths:
            sents, labels = load_conll_file(file_path)
            sentence_list.extend(sents)
            sentence_label_list.extend(labels)
        data[genre_name] = (sentence_list, sentence_label_list)
        
    return data

In [30]:
data = load_data(base_path)

### Sanity check the full dataset loading

In [31]:
for genre in data:
    print(f"-------{genre}-------")
    sentence_list, annotation_list = data[genre]
    for i in range(3):
        print(f"{sentence_list[i]} \n {annotation_list[i]}")
        print("----------------------------")
    print("\n\n\n")

-------wikipedia-------
['Csáktornyától', '12', 'km-re', 'keletre', 'fekszik', '.'] 
 ['B-LOC', 'O', 'O', 'O', 'O', 'O']
----------------------------
['Az', 'Intermission', 'az', 'amerikai', 'Dio', 'heavy', 'metal', 'zenekar', 'első', 'koncertlemeze', '.'] 
 ['O', 'B-MISC', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O']
----------------------------
['Dio', 'csalódottságában', 'nevezte', 'el', 'az', 'albumot', 'Intermission-nek', '(', 'felvonásköz', ',', 'szünet', ')', '.'] 
 ['B-PER', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O']
----------------------------




-------web-------
['A', 'portát', 'kb.', '2', 'éve', 'vásároltuk', ',', 'nem', 'volt', 'rajta', 'semmi', ',', 'azóta', 'próbálkozunk', 'a', 'kertészkedéssel', '.'] 
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
----------------------------
['Unokatestvérem', '"', 'talált', 'Rátok', '"', 'és', 'mutatott', 'be', 'Nektek', '!'] 
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

### Flatten out data & prepare for train dev split

In [54]:
flattened_x = []
flattened_y = []
# need to create a dummy list for stratified sampling
stratify_keys = []
for genre in data:
    flattened_x.extend(data[genre][0])
    flattened_y.extend(data[genre][1])
    stratify_keys.extend([genre for x in range(len(data[genre][0]))])

In [55]:
# sanity check
assert len(flattened_x) == len(flattened_y) == len(stratify_keys)

In [56]:
dataset = list(zip(flattened_x, flattened_y, stratify_keys))

In [57]:
dataset[0]

(['Csáktornyától', '12', 'km-re', 'keletre', 'fekszik', '.'],
 ['B-LOC', 'O', 'O', 'O', 'O', 'O'],
 'wikipedia')

In [58]:
len(dataset)

65429

In [59]:
from sklearn.model_selection import train_test_split 

In [60]:
# Stratified sampling to make sure that the distribution 
# of genres is similar in both the train and dev sets
train, dev = train_test_split(dataset, test_size=0.25, stratify=stratify_keys, random_state=RANDOM_STATE) 

dev_stratify_keys = [x[2] for x in dev]
dev, test = train_test_split(dev, test_size=0.4, stratify=dev_stratify_keys, random_state=RANDOM_STATE)

In [62]:
train[0]

(['A',
  'moderátorok',
  'elsősorban',
  'a',
  'Felhasználók',
  ',',
  'illetőleg',
  'a',
  'Szolgáltató',
  'érdekei',
  'védelmében',
  'járnak',
  'el',
  '.'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 'web')

In [63]:
dev[0]

(['Newton',
  'hét',
  'körcikke',
  'azt',
  'a',
  'vélekedését',
  'tükrözi',
  ',',
  'miszerint',
  'hét',
  'különálló',
  'tiszta',
  'színnek',
  'kell',
  'léteznie',
  '.'],
 ['B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 'wikipedia')

In [64]:
test[0]

(['Köszönöm', ',', 'apám', '.'], ['O', 'O', 'O', 'O'], 'fiction')

### Dump data in a more convenient format 

In [68]:
def save_data(data, path):
    with open(path, 'w+') as file:
        file.write(f'sentence\ttoken\ttag\n')
        sentence_idx = 1
        for sentence_list, label_list, source_list in data:
            for data_point in list(zip(sentence_list, label_list, source_list)):
                token = data_point[0]
                label = data_point[1]
                file.write(f'{sentence_idx}\t{token}\t{label}\n')
            sentence_idx += 1

In [77]:
save_data(train, './train.tsv')
save_data(dev, './dev.tsv')
save_data(test, './test.tsv')

In [78]:
pd.read_csv('/Users/attilanagy/Personal/bme-deep-learning-homework-ner/notebooks/dev.tsv', sep='\t', quoting=csv.QUOTE_NONE)

Unnamed: 0,sentence,token,tag
0,1,Newton,B-PER
1,1,hét,O
2,1,körcikke,O
3,1,azt,O
4,1,a,O
...,...,...,...
51863,9814,együttese,O
51864,9814,volt,O
51865,9814,",",O
51866,9814,mely,O


In [79]:
pd.read_csv('/Users/attilanagy/Personal/bme-deep-learning-homework-ner/notebooks/train.tsv', sep='\t', quoting=csv.QUOTE_NONE)

Unnamed: 0,sentence,token,tag
0,1,A,O
1,1,moderátorok,O
2,1,elsősorban,O
3,2,Criss,B-PER
4,2,a,O
...,...,...,...
259141,49070,engedéllyel,O
259142,49070,.,O
259143,49071,Mireille,B-PER
259144,49071,Hindoyan,I-PER
