In [388]:
import numpy as np
import re
import os

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib
import matplotlib.mlab as mlab
%matplotlib inline

# Global plot styling: seaborn "whitegrid" theme with fonts scaled by 1.3,
# and legends drawn with a fully opaque frame.
sns.set(style="whitegrid", font_scale=1.3)
matplotlib.rcParams.update({
    "legend.framealpha": 1,
    "legend.frameon": True,
})

## Parsing the York-Helsinki Corpus of Old English Poetry

We will process the parsed YCOE corpus to extract each sentence and the corresponding POS tags. YCOE has a very extensive set of POS tags and a representation of the sentence structure; we will discard the sentence structure information and simplify the tags to a higher-order level.

More information on the corpus here: http://www-users.york.ac.uk/~lang18/pcorpus.html

In [395]:
# use the Old English characters æ,ð,þ
def oe_character_sub(s):
    """Convert the corpus' ASCII transliteration into Old English characters.

    The sequences ``+a``/``+A``, ``+t``/``+T`` and ``+d``/``+D`` are replaced
    by æ/Æ, þ/Þ and ð/Ð respectively, every ``$`` is removed, and the result
    is lower-cased.

    Args:
        s: token text as it appears in the corpus file.

    Returns:
        The lower-cased string with all substitutions applied.
    """
    # Every pattern is a fixed literal, so plain str.replace is enough. This
    # also removes the invalid "\+" / "\$" escapes that the non-raw regex
    # string literals relied on (deprecated; a SyntaxWarning on Python 3.12+).
    replacements = (
        ('+a', 'æ'),
        ('+A', 'Æ'),
        ('+t', 'þ'),
        ('+T', 'Þ'),
        ('+d', 'ð'),
        ('+D', 'Ð'),
        ('$', ''),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    return s.lower()

In [222]:
# function that extracts what we need from each sentence
def extract_tagged_tokens(ppcme_string):
    """Pull (token, tag) pairs out of one bracketed parse block.

    The string is flattened (newlines and tabs removed, runs of spaces
    collapsed) and split on parentheses. A node is kept when it consists of
    exactly two space-separated fields, read as ``label word``, unless:

    * the label is ``ID`` or ``CODE``,
    * the word starts with ``*`` or is exactly ``0``,
    * the word is one of the punctuation marks ``, . ! ; : ?``.

    Kept words are passed through ``oe_character_sub``.

    Args:
        ppcme_string: text of one parse block.

    Returns:
        A ``(tokens, tags)`` tuple of two equally long lists, or ``None``
        when the block yields no tokens.
    """
    # Built once per call rather than once per node.
    skipped_labels = {'ID', 'CODE'}
    punctuation = {',', '.', '!', ';', ':', '?'}

    # Raw strings keep the regex escapes valid ("\(" is not a legal escape in
    # an ordinary string literal).
    flattened = re.sub(r' +', ' ', re.sub(r'\n|\t', '', ppcme_string))

    tokens = []
    tags = []
    for node in re.split(r'\(|\)', flattened):
        fields = re.sub(r' $', '', node).split(' ')
        if len(fields) != 2:
            # structural node (label only) or text with several words
            continue
        label, word = fields
        if label in skipped_labels:
            continue
        if word.startswith('*') or word == '0' or word in punctuation:
            continue
        tokens.append(oe_character_sub(word))
        tags.append(label)

    if tokens:
        return (tokens, tags)
    # Made explicit: callers test the result for truthiness.
    return None

In [368]:
def simplify_pos_tagset(tag):
    """Collapse a detailed corpus POS tag into a coarse tag.

    The part after ``^`` (case marking) is dropped. If the remainder contains
    ``+``, only the second ``+``-separated component is kept. The result is
    then classified:

    * first letter ``B``, ``H``, ``V`` or ``T``      -> ``'VRB'``
    * starts with ``AX`` or first letter ``M``       -> ``'AUX'``
    * ``N``, ``NPR``                                 -> ``'NOUN'``
    * ``PRO``, ``PRO$``, ``WPRO``                    -> ``'PRONOUN'``
    * ``P``                                          -> ``'PREP'``
    * ``ADJ``, ``WADJ``                              -> ``'ADJ'``
    * ``Q``, ``NUM``                                 -> ``'NUM'``
    * ``ADV``, ``WADV``                              -> ``'ADV'``
    * ``CONJ``, ``WQ``, ``C``                        -> ``'CONJ'``
    * ``D``                                          -> ``'DET'``
    * ``NEG``                                        -> ``'NEG'``
    * anything else, including an empty tag          -> ``'XXX'``

    Args:
        tag: POS tag string from the corpus.

    Returns:
        The simplified tag as a string.
    """
    # default tag
    default_tag = 'XXX'
    exact_matches = {
        'N': 'NOUN', 'NPR': 'NOUN',
        'PRO': 'PRONOUN', 'PRO$': 'PRONOUN', 'WPRO': 'PRONOUN',
        'P': 'PREP',
        'ADJ': 'ADJ', 'WADJ': 'ADJ',
        'Q': 'NUM', 'NUM': 'NUM',
        'ADV': 'ADV', 'WADV': 'ADV',
        'CONJ': 'CONJ', 'WQ': 'CONJ', 'C': 'CONJ',
        'D': 'DET',
        'NEG': 'NEG',
    }

    # remove case tag
    tag = tag.split('^')[0]
    # remove negation and particle tags
    if '+' in tag:
        tag = tag.split('+')[1]

    # Nothing left to classify (empty input, or e.g. "^N" / "RP+"); indexing
    # tag[0] below would raise IndexError.
    if not tag:
        return default_tag

    # Prefix-based classes are checked first, as in the original chain.
    if tag[0] in ('B', 'H', 'V', 'T'):
        return 'VRB'
    if tag.startswith('AX') or tag[0] == 'M':
        return 'AUX'
    return exact_matches.get(tag, default_tag)

In [352]:
# Parse a sample of blocks into per-sentence token and simplified-tag lists.
# NOTE(review): `check_blocks` is not assigned in any cell visible here;
# confirm it is defined before this cell runs on a fresh kernel.
skip_tokens = set(['.',',','!',':',';','full-stop'])

poem_tokens = []
poem_tags = []
for l in check_blocks:
    parse_block = extract_tagged_tokens(l)
    if parse_block:
        block_tokens = []
        block_tags = []
        for token, tag in zip(parse_block[0], parse_block[1]):
            # Skip punctuation tokens, and tokens whose tag is empty: the
            # same guard the corpus-wide loop applies before simplifying.
            if token not in skip_tokens and len(tag) > 0:
                block_tokens.append(token)
                block_tags.append(simplify_pos_tagset(tag))
        poem_tokens.append(block_tokens)
        poem_tags.append(block_tags)

In [344]:
# peek at the tag lists of the first ten sentences
print(poem_tags[:10])

[['ADV^T', 'MDPI', 'VB', 'N^G', 'N^A', 'N^G', 'N^A', 'CONJ', 'PRO$', 'N^A', 'N^A', 'N^G', 'P', 'PRO^N', 'N^G', 'Q^G', 'ADJ^N', 'NPR^N', 'N^A', 'VBDI'], ['PRO^N', 'ADV^T', 'VBDI', 'N^G', 'N^D', 'N^A', 'P', 'N^D', 'ADJ^N', 'N^N'], ['ADV^T', 'N^A', 'N^G', 'N^N', 'ADJ^N', 'NPR^N', 'ADV^T', 'VBDI', 'N^D', 'N^A', 'N^N', 'ADJ^N'], ['P', 'D^D', 'N^D', 'NEG+Q^N', 'BEPI', 'ADJ^N', 'P', 'PRO^D', 'N^N', 'BEPS', 'TO', 'RP+VB^D', 'P', 'PRO$', 'N^D', 'WPRO^N', 'PRO$', 'N^D', 'ADJ^G', 'CONJ', 'ADJ^G', 'P', 'N^D', 'VBN', 'BEPS'], ['VBDI', 'PRO^A', 'NPR^N', 'ADJ^N'], ['ADV^T', 'PRO^N', 'MDD', 'P', 'N^A', 'VB', 'ADJ^N', 'P', 'N^A'], ['X'], ['PRO^N', 'ADJ^A', 'N^A', 'N^G', 'N^A'], ['VB', 'PRO^N', 'NEG', 'MDD'], ['VBDI', 'PRO^A', 'N^N', 'Q^A', 'ADV']]


In [353]:
# show the tag lists of the first ten sentences
print(poem_tags[:10])

[['ADV', 'AUX', 'VRB', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'CONJ', 'PRONOUN', 'NOUN', 'NOUN', 'NOUN', 'PREP', 'PRONOUN', 'NOUN', 'NUM', 'ADJ', 'NOUN', 'NOUN', 'VRB'], ['PRONOUN', 'ADV', 'VRB', 'NOUN', 'NOUN', 'NOUN', 'PREP', 'NOUN', 'ADJ', 'NOUN'], ['ADV', 'NOUN', 'NOUN', 'NOUN', 'ADJ', 'NOUN', 'ADV', 'VRB', 'NOUN', 'NOUN', 'NOUN', 'ADJ'], ['PREP', 'DET', 'NOUN', 'NUM', 'VRB', 'ADJ', 'PREP', 'PRONOUN', 'NOUN', 'VRB', 'VRB', 'VRB', 'PREP', 'PRONOUN', 'NOUN', 'PRONOUN', 'PRONOUN', 'NOUN', 'ADJ', 'CONJ', 'ADJ', 'PREP', 'NOUN', 'VRB', 'VRB'], ['VRB', 'PRONOUN', 'NOUN', 'ADJ'], ['ADV', 'PRONOUN', 'AUX', 'PREP', 'NOUN', 'VRB', 'ADJ', 'PREP', 'NOUN'], ['XXX'], ['PRONOUN', 'ADJ', 'NOUN', 'NOUN', 'NOUN'], ['VRB', 'PRONOUN', 'NEG', 'AUX'], ['VRB', 'PRONOUN', 'NOUN', 'NUM', 'ADV']]


In [382]:
# apply to all files and collect sentences across all
# NOTE(review): `path` is not assigned in any cell visible here; it must name
# the directory whose files are parsed before this cell runs.
# Sorted so the sentence order does not depend on the platform's directory
# listing order.
contents = sorted(os.listdir(path))

skip_tokens = set(['.',',','!',':',';','full-stop'])

poem_tokens = []
poem_tags = []
poem_sentence_mapping = []
for poem in contents:
    poem_path = os.path.join(path, poem)
    if not os.path.isfile(poem_path):
        # sub-directories cannot be opened as text; skip them
        continue
    # The context manager closes the handle; a bare open(...).read() left it
    # open until garbage collection.
    with open(poem_path) as poem_file:
        poem_raw = poem_file.read()
    poem_blocks = poem_raw.split('( ')
    # Source label: the file name without its first two characters and
    # without everything from the first "." onwards.
    poem_source = poem[2:].split(".")[0]
    for l in poem_blocks:
        parse_block = extract_tagged_tokens(l)
        if parse_block:
            block_tokens = []
            block_tags = []
            for token, tag in zip(parse_block[0], parse_block[1]):
                # drop punctuation tokens and tokens with an empty tag
                if token not in skip_tokens and len(tag) > 0:
                    block_tokens.append(token)
                    block_tags.append(simplify_pos_tagset(tag))
            # one entry per parsed block in all three parallel lists
            poem_tokens.append(block_tokens)
            poem_tags.append(block_tags)
            poem_sentence_mapping.append(poem_source)

In [370]:
# number of sentences in the token list, then in the tag list
for sentence_list in (poem_tokens, poem_tags):
    print(len(sentence_list))

6333
6333


In [371]:
# tokens of the first three sentences
print(poem_tokens[:3])

[['hwæt'], ['we', 'gefrunan', 'on', 'fyrndagum', 'twelfe', 'under', 'tunglum', 'tireadige', 'hæleð', 'þeodnes', 'þegnas'], ['no', 'hira', 'þrym', 'alæg', 'camprædenne', 'þonne', 'cumbol', 'hneotan', 'syððan', 'hie', 'gedældon', 'swa', 'him', 'dryhten', 'sylf', 'heofona', 'heahcyning', 'hlyt', 'getæhte']]


In [372]:
# simplified tags of the first three sentences
print(poem_tags[:3])

[['PRONOUN'], ['PRONOUN', 'VRB', 'PREP', 'NOUN', 'NUM', 'PREP', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'NOUN'], ['ADV', 'PRONOUN', 'NOUN', 'VRB', 'NOUN', 'PREP', 'NOUN', 'VRB', 'PREP', 'PRONOUN', 'VRB', 'PREP', 'PRONOUN', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'VRB']]


In [373]:
# Tally how often each simplified tag occurs across all sentences.
# tag_dict maps tag -> count; tag_set holds the distinct tags seen.
tag_dict = {}
tag_set = set()

for sentence_tags in poem_tags:
    for tag in sentence_tags:
        tag_dict[tag] = tag_dict.get(tag, 0) + 1
        tag_set.add(tag)

In [374]:
# Turn the tag tally into a two-column frame, most frequent tag first.
tag_count = pd.DataFrame.from_dict(tag_dict, orient='index').reset_index()
tag_count.columns = ['tag', 'freq']
tag_count = (
    tag_count
    .sort_values(by='freq', ascending=False)
    .reset_index(drop=True)
)
print(tag_count.shape)
print("Top tags by frequency:")
print(tag_count.head(15))

(12, 2)
Top tags by frequency:
        tag   freq
0      NOUN  22982
1       VRB  12335
2      PREP   6793
3       ADJ   6535
4   PRONOUN   5805
5       ADV   5400
6      CONJ   3634
7       DET   3421
8       NUM   2033
9       AUX   1468
10      NEG    552
11      XXX    500


In [394]:
# number of sentences contributed by each source
{source: poem_sentence_mapping.count(source) for source in set(poem_sentence_mapping)}

{'andrea': 453,
 'beowul': 1696,
 'brunan': 26,
 'christ': 430,
 'cynew': 937,
 'dream': 112,
 'exeter': 676,
 'exodus': 318,
 'genesi': 446,
 'kentis': 70,
 'metboe': 340,
 'northu': 31,
 'phoeni': 259,
 'riddle': 539}

In [396]:
# test/train split of sentences stratified by source
# A fixed seed makes the split identical on every run; with random_state
# left unset the shuffle differs each time the cell is executed.
SPLIT_SEED = 42
(ycoe_x_train, ycoe_x_test,
 ycoe_y_train, ycoe_y_test,
 ycoe_src_train, ycoe_src_test) = train_test_split(
    poem_tokens, poem_tags, poem_sentence_mapping,
    test_size=0.2,
    stratify=poem_sentence_mapping,
    random_state=SPLIT_SEED,
)

In [397]:
# number of training sentences drawn from each source
{source: ycoe_src_train.count(source) for source in set(ycoe_src_train)}

{'andrea': 362,
 'beowul': 1357,
 'brunan': 21,
 'christ': 344,
 'cynew': 749,
 'dream': 90,
 'exeter': 541,
 'exodus': 254,
 'genesi': 357,
 'kentis': 56,
 'metboe': 272,
 'northu': 25,
 'phoeni': 207,
 'riddle': 431}