## Parsing the York-Helsinki Corpus of Old English Poetry

We will process the parsed YCOE corpus to extract each sentence and the corresponding POS tags. YCOE has a very extensive set of POS tags and a representation of the sentence structure; we will discard the sentence structure information and simplify the tags to a higher-order level.

More information on the corpus here: http://www-users.york.ac.uk/~lang18/pcorpus.html

#### Import necessary libraries

In [388]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib
import matplotlib.mlab as mlab
%matplotlib inline

# Global plot appearance: seaborn theme first, then the legend settings
# applied on top of it.
sns.set(style="whitegrid", font_scale=1.3)
matplotlib.rcParams.update({
    "legend.framealpha": 1,
    "legend.frameon": True,
})

#### Show corpus contents

In [420]:
# show poems available in corpus
# os.listdir returns entries in arbitrary order; sorting makes the order of
# `contents` (and of everything collected by iterating over it) reproducible.
contents = sorted(os.listdir(path))
print(contents)

['coandrea.psd', 'cobeowul.psd', 'cobrunan.psd', 'cochrist.psd', 'cocynew.psd', 'codream.psd', 'coexeter.psd', 'coexodus.psd', 'cogenesi.psd', 'cokentis.psd', 'cometboe.psd', 'conorthu.psd', 'cophoeni.psd', 'coriddle.psd']


### YCOE structure
See the beginning of the poem "The Dream of the Rood"

    hwæt! ic swefna cyst secgan wylle,
    hwæt me gemætte to midre nihte
    syðþan reordberend reste wunedon.

The block below shows the representation of this sentence in the YCOE corpus:

In [426]:
# show example line from the raw file for Dream of the Rood
poem = 'codream.psd'
# Context manager so the file handle is closed once the lines are read.
with open(os.path.join(path, poem)) as poem_file:
    poem_raw = poem_file.readlines()
# Collapse doubled tabs and drop the newline so each line prints compactly.
for l in poem_raw[0:27]:
    print(l.replace('\t\t', '\t').replace('\n', ''))

( (CODE <P_61>))
( (CODE <COM:THE_DREAM_OF_THE_ROOD>))
( (CODE <R_1>))
( (INTJP (INTJ Hw+at)
	 (. .)) (ID codream,61.1.4))
( (IP-MAT (NP-NOM (PRO^N Ic))
	  (NP-ACC (NP-GEN (N^G swefna))
	  (ADJ^A cyst)
	  (CP-QUE-PRN *ICH*-1))
	  (VB secgan)
	  (MDP wylle)
	  (, ,)
	  (CP-QUE-PRN-1 (WNP-NOM-2 (WPRO^N $hw+at))
		(C 0)
		(IP-SUB (NP-NOM *T*-2)
		(NP-DAT (PRO^D me))
		(VBD gem+atte)
		(PP (P to)
		    (NP-DAT (ADJ^D midre) (N^D nihte)))
		(, ,)
		(PP (P sy+d+tan)
		    (CP-ADV (C 0)
			    (IP-SUB (NP-NOM (N^N reordberend))
			    (NP-DAT-ADT (N^D reste))
			    (VBDI wunedon))))))
	  (. .)) (ID codream,61.1.5))
( (CODE <R_4>))


### Read in and reformat data

In [395]:
# use the Old English characters æ,ð,þ
def oe_character_sub(s):
    """Convert a corpus-encoded token to lowercase Old English spelling.

    The digraphs ``+a``, ``+t`` and ``+d`` (in either case) are replaced by
    æ, þ and ð, every ``$`` is removed, and the result is lowercased.

    Args:
        s: Token as written in the corpus file, e.g. ``'$hw+at'``.

    Returns:
        The normalized token, e.g. ``'hwæt'``.
    """
    # Literal str.replace instead of regex: the patterns contain no regex
    # syntax, and writing them as non-raw '\+' / '\$' regex strings relies on
    # invalid escape sequences (a SyntaxWarning on recent Python versions).
    replacements = (
        ('+a', 'æ'),
        ('+A', 'Æ'),
        ('+t', 'þ'),
        ('+T', 'Þ'),
        ('+d', 'ð'),
        ('+D', 'Ð'),
        ('$', ''),
    )
    for encoded, decoded in replacements:
        s = s.replace(encoded, decoded)
    return s.lower()

In [222]:
# function that extracts what we need from each sentence
def extract_tagged_tokens(ppcme_string):
    """Extract the word tokens and their POS tags from one bracketed block.

    The block is flattened (newlines and tabs removed, runs of spaces
    collapsed) and split on parentheses. A piece made of exactly two
    space-separated fields is treated as a ``TAG word`` leaf. Leaves are
    skipped when the tag is ``ID`` or ``CODE``, when the word starts with
    ``*`` or is ``0``, or when the word is a punctuation mark.

    Args:
        ppcme_string: Raw text of one parse block from a ``.psd`` file.

    Returns:
        A ``(tokens, tags)`` tuple of two parallel lists, where each token
        has been passed through ``oe_character_sub``; ``None`` when the block
        yields no tokens.
    """
    punctuation = {',', '.', '!', ';', ':', '?'}
    skipped_labels = {'ID', 'CODE'}

    flat = re.sub(r' +', ' ', re.sub(r'\n|\t', '', ppcme_string))
    tokens = []
    tags = []
    for node in re.split(r'\(|\)', flat):
        # Drop one trailing space so 'NP-NOM ' counts as a single field.
        fields = re.sub(r' $', '', node).split(' ')
        if len(fields) != 2:
            continue
        label, word = fields
        if label in skipped_labels:
            continue
        if word.startswith('*') or word == '0' or word in punctuation:
            continue
        tokens.append(oe_character_sub(word))
        tags.append(label)
    if tokens:
        return (tokens, tags)
    # Callers test the result for truthiness, so "nothing found" stays None.
    return None

In [453]:
# Letter after '^' in a corpus tag -> case label used in this notebook.
_CASE_LABELS = {
    'N': 'NOM',   # nominative case (subject)
    'A': 'ACC',   # accusative case (direct object)
    'G': 'GEN',   # genitive case (possession)
    'D': 'DAT',   # dative case (indirect object)
    'I': 'DAT',   # instrumental case, merged with dative
    'T': 'TEMP',  # temporal case (specifying a time)
}


def extract_case(tag):
    """Return the case label encoded after ``^`` in a corpus POS tag.

    Args:
        tag: POS tag as it appears in the corpus, e.g. ``'N^G'`` or ``'VBD'``.

    Returns:
        One of ``'NOM'``, ``'ACC'``, ``'GEN'``, ``'DAT'``, ``'TEMP'``, or
        ``'X'`` when the tag has no ``^`` suffix or the suffix is not one of
        the letters in the lookup table.
    """
    # NOTE(review): the suffix is mapped without looking at the base tag. If
    # the corpus also uses '^' suffixes for something other than case (e.g.
    # on adverbs), those are labelled as cases here; confirm against the
    # corpus tag documentation.
    if '^' not in tag:
        return 'X'
    suffix = tag.split('^')[1]
    return _CASE_LABELS.get(suffix, 'X')

In [458]:
# Base tags that are matched exactly -> simplified tag.
_SIMPLE_TAGS = {
    # Nouns (N = common noun, NPR = proper noun)
    'N': 'NOUN',
    'NPR': 'NOUN',
    # Pronouns (PRO$ = possessive, WPRO = wh- pronoun)
    'PRO': 'PRON',
    'PRO$': 'PRON',
    'WPRO': 'PRON',
    # Prepositions
    'P': 'PREP',
    # Adjectives
    'ADJ': 'ADJ',
    'WADJ': 'ADJ',
    # Numerals and quantifiers
    'Q': 'NUM',
    'NUM': 'NUM',
    # Adverbs
    'ADV': 'ADV',
    'WADV': 'ADV',
    # Conjunctions (WQ = whether, C = complementizer)
    'CONJ': 'CONJ',
    'WQ': 'CONJ',
    'C': 'CONJ',
    # Determiners
    'D': 'DET',
    # Negation
    'NEG': 'NEG',
}
# Verb tags are recognised by first letter (B = be, H = have, V = other verbs).
_VERB_INITIALS = frozenset('BHV')
# Auxiliary tags by first letter (M = modal, T = "to" infinitive).
_AUX_INITIALS = frozenset('MT')


def simplify_pos_tagset(tag):
    """Collapse a detailed corpus POS tag into one coarse category.

    The ``^case`` suffix is dropped first. If the remaining tag contains
    ``+`` (negation and particle prefixes), only the part after the first
    ``+`` is kept.

    Args:
        tag: POS tag as it appears in the corpus, e.g. ``'N^G'`` or
            ``'NEG+BEPI'``.

    Returns:
        One of ``'VRB'``, ``'AUX'``, ``'NOUN'``, ``'PRON'``, ``'PREP'``,
        ``'ADJ'``, ``'NUM'``, ``'ADV'``, ``'CONJ'``, ``'DET'``, ``'NEG'``,
        or ``'X'`` for anything else, including an empty base tag.
    """
    # remove case tag
    tag = tag.split('^')[0]
    # remove negation and particle tags
    if '+' in tag:
        tag = tag.split('+')[1]

    # Nothing left to classify (e.g. '', '^N', 'NEG+'): use the default tag
    # instead of failing on tag[0].
    if not tag:
        return 'X'

    if tag[0] in _VERB_INITIALS:
        return 'VRB'
    if tag.startswith('AX') or tag[0] in _AUX_INITIALS:
        return 'AUX'
    # default tag 'X' for anything not listed in the table
    return _SIMPLE_TAGS.get(tag, 'X')

In [459]:
# apply to all files and collect sentences across all
# Tokens that are dropped even when they come through as leaves.
skip_tokens = {'.', ',', '!', ':', ';', 'full-stop'}

poem_tokens = []            # one list of tokens per sentence
poem_tags = []              # simplified POS tags, parallel to poem_tokens
poem_case = []              # case labels, parallel to poem_tokens
poem_sentence_mapping = []  # source poem of each sentence

for poem in contents:
    # NOTE(review): no explicit encoding is given, so the platform default
    # is used; confirm the corpus files decode correctly with it.
    with open(os.path.join(path, poem)) as poem_file:
        poem_raw = poem_file.read()
    # File name without its first two characters and extension,
    # e.g. 'codream.psd' -> 'dream'.
    source_name = poem[2:].split(".")[0]

    poem_blocks = poem_raw.split('( ')
    for l in poem_blocks:
        parse_block = extract_tagged_tokens(l)
        if not parse_block:
            continue
        block_tokens = []
        block_tags = []
        block_case = []
        for token, tag in zip(*parse_block):
            if token in skip_tokens or not tag:
                continue
            block_tokens.append(token)
            block_tags.append(simplify_pos_tagset(tag))
            block_case.append(extract_case(tag))
        # A block is recorded even if filtering left it empty.
        poem_tokens.append(block_tokens)
        poem_tags.append(block_tags)
        poem_case.append(block_case)
        poem_sentence_mapping.append(source_name)

In [460]:
print(len(poem_tokens))
print(len(poem_tags))
print(len(poem_case))

6333
6333
6333


In [461]:
print(poem_tokens[0:3])

[['hwæt'], ['we', 'gefrunan', 'on', 'fyrndagum', 'twelfe', 'under', 'tunglum', 'tireadige', 'hæleð', 'þeodnes', 'þegnas'], ['no', 'hira', 'þrym', 'alæg', 'camprædenne', 'þonne', 'cumbol', 'hneotan', 'syððan', 'hie', 'gedældon', 'swa', 'him', 'dryhten', 'sylf', 'heofona', 'heahcyning', 'hlyt', 'getæhte']]


In [462]:
print(poem_tags[0:3])

[['PRON'], ['PRON', 'VRB', 'PREP', 'NOUN', 'NUM', 'PREP', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'NOUN'], ['ADV', 'PRON', 'NOUN', 'VRB', 'NOUN', 'PREP', 'NOUN', 'VRB', 'PREP', 'PRON', 'VRB', 'PREP', 'PRON', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'VRB']]


In [455]:
print(poem_case[0:3])

[['X'], ['NOM', 'X', 'X', 'DAT', 'ACC', 'X', 'DAT', 'ACC', 'ACC', 'GEN', 'ACC'], ['X', 'X', 'NOM', 'X', 'DAT', 'X', 'NOM', 'X', 'X', 'NOM', 'X', 'X', 'DAT', 'NOM', 'NOM', 'GEN', 'NOM', 'ACC', 'X']]


In [463]:
# Frequency of every simplified POS tag across all sentences; keys are in
# first-seen order.
tag_dict = dict(
    Counter(tag for sentence_tags in poem_tags for tag in sentence_tags)
)
# Distinct tags that occur at least once.
tag_set = set(tag_dict)

In [465]:
# Two-column table (tag, freq) built from the counts, most frequent first.
tag_frame = pd.DataFrame.from_dict(tag_dict, orient='index').reset_index()
tag_frame.columns = ['tag', 'freq']
tag_count = (
    tag_frame
    .sort_values(by='freq', ascending=False)
    .reset_index(drop=True)
)
print(tag_count.shape)
print("Top tags by frequency:")
print(tag_count.head(15))

(12, 2)
Top tags by frequency:
     tag   freq
0   NOUN  22982
1    VRB  12282
2   PREP   6793
3    ADJ   6535
4   PRON   5805
5    ADV   5400
6   CONJ   3634
7    DET   3421
8    NUM   2033
9    AUX   1521
10   NEG    552
11     X    500


In [466]:
# Frequency of every case label across all sentences; keys are in
# first-seen order.
case_dict = dict(
    Counter(case for sentence_case in poem_case for case in sentence_case)
)
# Distinct case labels that occur at least once.
case_set = set(case_dict)

In [467]:
# Two-column table (case, freq) built from the counts, most frequent first.
case_frame = pd.DataFrame.from_dict(case_dict, orient='index').reset_index()
case_frame.columns = ['case', 'freq']
case_count = (
    case_frame
    .sort_values(by='freq', ascending=False)
    .reset_index(drop=True)
)
print(case_count.shape)
print("Top cases by frequency:")
print(case_count.head(15))

(6, 2)
Top cases by frequency:
   case   freq
0     X  28460
1   NOM  15102
2   DAT   9738
3   ACC   9628
4   GEN   6245
5  TEMP   2285


In [468]:
# Number of sentences per source poem, counted in one pass and listed in
# alphabetical order of the source name.
dict(sorted(Counter(poem_sentence_mapping).items()))

{'andrea': 453,
 'beowul': 1696,
 'brunan': 26,
 'christ': 430,
 'cynew': 937,
 'dream': 112,
 'exeter': 676,
 'exodus': 318,
 'genesi': 446,
 'kentis': 70,
 'metboe': 340,
 'northu': 31,
 'phoeni': 259,
 'riddle': 539}

In [470]:
# test/train split of sentences stratified by source
# Fixed seed so that re-running the notebook reproduces the same split.
SPLIT_SEED = 0
(
    ycoe_token_train, ycoe_token_test,
    ycoe_pos_train, ycoe_pos_test,
    ycoe_case_train, ycoe_case_test,
    ycoe_src_train, ycoe_src_test,
) = train_test_split(
    poem_tokens, poem_tags, poem_case, poem_sentence_mapping,
    test_size=0.2,
    stratify=poem_sentence_mapping,
    random_state=SPLIT_SEED,
)

In [473]:
# Number of test-set sentences per source poem, counted in one pass and
# listed in alphabetical order of the source name.
dict(sorted(Counter(ycoe_src_test).items()))

{'andrea': 91,
 'beowul': 339,
 'brunan': 5,
 'christ': 86,
 'cynew': 188,
 'dream': 22,
 'exeter': 135,
 'exodus': 64,
 'genesi': 89,
 'kentis': 14,
 'metboe': 68,
 'northu': 6,
 'phoeni': 52,
 'riddle': 108}