In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import json
import os
import csv

In [64]:
def read_corpus(dir_path):
    """Parse every regular file in ``dir_path`` with BeautifulSoup.

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, and callers in this notebook use the list index as the
    text id, so sorting keeps those ids stable between runs and machines.
    Sub-directories are skipped instead of failing in ``open``.

    Parameters
    ----------
    dir_path : str
        Directory containing the corpus files.

    Returns
    -------
    list
        One ``BeautifulSoup`` object (``lxml`` parser) per file.
    """
    corpus_soup = []
    for filename in sorted(os.listdir(dir_path)):
        path = os.path.join(dir_path, filename)
        if not os.path.isfile(path):
            continue
        # Read as bytes so that BeautifulSoup detects the encoding itself.
        with open(path, 'rb') as file:
            corpus_soup.append(BeautifulSoup(file.read(), 'lxml'))
    return corpus_soup

In [65]:
class EDU():
    """A single discourse unit: its text and a position identifier."""

    def __init__(self, edu_text, position):
        self.text = edu_text          # raw text of the unit
        self.position = position      # identifier supplied by the caller

    def __repr__(self):
        # __repr__ must return a str; returning a tuple makes repr() raise
        # "TypeError: __repr__ returned non-string".
        return 'EDU(text={!r}, position={!r})'.format(self.text, self.position)

In [66]:
class EDUPair():
    """Two EDUs together with their relation label and the id of their text."""

    def __init__(self, edu1, edu2, relation, text_id):
        # Plain value holder: store the four arguments unchanged.
        self.edu1, self.edu2 = edu1, edu2
        self.relation, self.text_id = relation, text_id

In [67]:
def ends_sentence(edu):
    """Return True when the text of ``edu`` ends with ``.``, ``?``, ``!`` or ``…``.

    ``edu`` only needs a ``text`` string attribute. Empty text returns False
    instead of raising ``IndexError``.
    """
    sentence_splitters = ('.', '?', '!', '…')
    # str.endswith accepts a tuple of suffixes and is safe on the empty string.
    return edu.text.endswith(sentence_splitters)

In [68]:
def multinuclear(group):
    """Tell whether the ``type`` attribute of ``group`` equals ``'multinuc'``."""
    group_type = group.attrs['type']
    return group_type == 'multinuc'

In [69]:
def detect_parent(edu):
    """Return the ``parent`` attribute of ``edu``, or None when it has none."""
    return edu.attrs.get('parent')

In [70]:
def recursive_parent(root_parent_id, needed_edu_id, depth, groups):
    """Climb the group hierarchy looking for a group attached to a given id.

    Starting at the group whose ``id`` is ``root_parent_id``, follow ``parent``
    links upwards for at most ``depth`` steps. Return the first visited group
    whose parent equals ``needed_edu_id``. Return None when the chain ends
    (a group without parent, or an id that matches no group) or when the
    depth budget is used up.
    """
    current_id = root_parent_id
    while depth != 0:
        # First group carrying the id we are currently standing on.
        node = next((g for g in groups if g.attrs['id'] == current_id), None)
        if node is None:
            return None
        parent_id = detect_parent(node)
        if parent_id == needed_edu_id:
            return node
        if parent_id is None:
            return None
        current_id = parent_id
        depth -= 1
    return None

In [71]:
def _sibling_relation(relation1, relation2):
    """Label two segments sharing one parent, given their ``relname`` values."""
    if relation1 != 'span' and relation2 == 'span':
        return relation1, 'SN'
    if relation2 != 'span' and relation1 == 'span':
        return relation2, 'NS'
    # The original test was the tautology ``relation2 == relation2``, which
    # labelled *any* remaining sibling pair as ``<relation1>_M``.
    if relation1 == relation2:
        return relation1, 'M'
    # Two different non-span relations pointing at the same parent give no
    # single label for the pair.
    return 'no_relation', None


def _group_relation(edu1_parent, edu2_parent, edu1_id, edu2_id, groups):
    """Find a link between two segments with different parents via groups."""
    # One level up: the parent group of one segment hangs off the other segment.
    for group in groups:
        group_id = group.attrs['id']
        if group_id == edu1_parent and detect_parent(group) == edu2_id:
            relation = group.attrs['relname']
            return relation, ('M' if multinuclear(group) else 'SN')
        if group_id == edu2_parent and detect_parent(group) == edu1_id:
            relation = group.attrs['relname']
            return relation, ('M' if multinuclear(group) else 'NS')

    # Deeper levels, tried in priority order: (start id, target id, depth, label).
    lookups = (
        (edu1_parent, edu2_id, 5, 'SN'),
        (edu2_parent, edu1_id, 5, 'NS'),
        (edu1_parent, edu2_parent, 4, 'M'),
        (edu2_parent, edu1_parent, 4, 'M'),
    )
    for start_id, target_id, depth, label in lookups:
        # A missing parent cannot be a start or a meaningful target: with a
        # None target recursive_parent would hand back a parentless group.
        if start_id is None or target_id is None:
            continue
        ancestor = recursive_parent(start_id, target_id, depth, groups)
        if ancestor is None:
            continue
        relation = ancestor.attrs['relname']
        # A shared ancestor reached through a plain 'span' link is not a relation.
        if label == 'M' and relation == 'span':
            break
        return relation, label
    return 'no_relation', None


def detect_relation(edu1, edu2, groups):
    """Label the relation between two segments of one parsed RS3 document.

    Parameters
    ----------
    edu1, edu2 : parsed ``segment`` elements
        Must expose ``.text`` and an ``.attrs`` mapping with ``id`` and,
        when attached to the tree, ``parent`` and ``relname``.
    groups : sequence of parsed ``group`` elements
        Groups of the same document, each with ``id``/``type`` and optionally
        ``parent``/``relname`` attributes.

    Returns
    -------
    tuple
        ``(EDU, EDU, label)``. ``label`` is ``'no_relation'`` or
        ``'<relname>_<XX>'``. ``XX`` is ``SN`` when ``edu1`` attaches
        (directly or through groups) to ``edu2``, ``NS`` for the reverse
        direction, and ``M`` for a shared parent or a multinuclear group.
    """
    edu1_parent = detect_parent(edu1)
    edu2_parent = detect_parent(edu2)
    edu1_id = edu1.attrs['id']
    edu2_id = edu2.attrs['id']

    if edu1_parent is None and edu2_parent is None:
        relation, nuclearity = 'no_relation', None
    elif edu1_parent == edu2_id:
        relation, nuclearity = edu1.attrs['relname'], 'SN'
    elif edu2_parent == edu1_id:
        relation, nuclearity = edu2.attrs['relname'], 'NS'
    elif edu1_parent == edu2_parent:
        relation, nuclearity = _sibling_relation(edu1.attrs['relname'],
                                                 edu2.attrs['relname'])
    else:
        relation, nuclearity = _group_relation(edu1_parent, edu2_parent,
                                               edu1_id, edu2_id, groups)

    detected_relation = '_'.join(
        part for part in (relation, nuclearity) if part is not None)
    return EDU(edu1.text, edu1_id), EDU(edu2.text, edu2_id), detected_relation

In [72]:
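# NOTE: commented-out alternative version of detect_relation (inline two-level parent lookup instead of recursive_parent); it is not executed.
# def detect_relation(edu1, edu2, groups):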
#     edu1_parent=detect_parent(edu1)
#     edu2_parent=detect_parent(edu2)
#     edu1_id = edu1.attrs['id']
#     edu2_id = edu2.attrs['id']
#     nuclearity = None
#     found = True
#     if edu1_parent is None and edu2_parent is None:
#         relation = 'no_relation'
#     else:
#         if edu1_parent == edu2_id:
#             relation = edu1.attrs['relname']
#             nuclearity = 'SN'
#         elif edu2_parent == edu1_id:
#             relation = edu2.attrs['relname']
#             nuclearity = 'NS'
#         elif edu1_parent == edu2_parent:
#             relation1 = edu1.attrs['relname']
#             relation2 = edu2.attrs['relname']
#             if relation1 != 'span' and relation2 == 'span':
#                 relation = relation1
#                 nuclearity = 'SN'
#             elif relation2 != 'span' and relation1 == 'span':
#                 relation = relation2
#                 nuclearity = 'NS'
#             elif relation2 == relation2:
#                 relation = relation1
#                 nuclearity = 'M'
#         else:
#             found = False
#             for group in groups:
#                 if group.attrs['id'] == edu1_parent and detect_parent(group) == edu2_id:
#                     relation = group.attrs['relname']
#                     if multinuclear(group):
#                         nuclearity = 'M'
#                     else:
#                         nuclearity = 'SN'
#                     found = True
#                     break 
#                 elif group.attrs['id'] == edu2_parent and detect_parent(group) == edu1_id:
#                     relation = group.attrs['relname']
#                     if multinuclear(group):
#                         nuclearity = 'M'
#                     else:
#                         nuclearity = 'NS'
#                     found = True
#                     break
#                 elif group.attrs['id'] == edu1_parent and group.attrs['id'] == edu2_parent:
#                     relation = group.attrs['relname']
#                     nuclearity = 'M'
#                     if relation != 'span':
#                         found = True
#                         break    
#                 else:
#                     parent_group_id1 = edu1_parent
#                     parent_group_id2 = edu2_parent
#                     if group.attrs['id'] == parent_group_id1:
#                         parent_parent_group_id = detect_parent(group)
#                         if parent_parent_group_id:
#                             for group in groups:
#                                 if group.attrs['id'] == parent_parent_group_id:
#                                     last_parent = detect_parent(group)
#                                     if last_parent == edu2_id:
#                                         relation = group.attrs['relname']
#                                         nuclearity = 'SN'
#                     elif group.attrs['id'] == parent_group_id2:
#                         parent_parent_group_id = detect_parent(group)
#                         if parent_parent_group_id:
#                             for group in groups:
#                                 if group.attrs['id'] == parent_parent_group_id:
#                                     last_parent = detect_parent(group)
#                                     if last_parent == edu1_id:
#                                         relation = group.attrs['relname'] 
#                                         nuclearity = 'NS'                               
#     if not found:
#         relation = 'no_relation'
#         nuclearity = None
#     detected_relation = '_'.join([i for i in [relation, nuclearity] if i is not None])
#     return EDU(edu1.text, edu1.attrs['id']), EDU(edu2.text, edu2.attrs['id']), detected_relation

In [73]:
def pairs_to_csv(edupairs, filepath='../all_data.csv', encoding='utf-8'):
    """Write EDU pairs to a ``;``-separated CSV file with a header row.

    Parameters
    ----------
    edupairs : iterable
        Objects with ``edu1``/``edu2`` (each having ``position`` and ``text``),
        ``text_id`` and ``relation`` attributes.
    filepath : str
        Destination file; it is overwritten.
    encoding : str
        Text encoding of the output file. It is set explicitly so that
        non-ASCII text is written the same way on every platform.
    """
    # newline='' is required by the csv module: without it, embedded newlines
    # in quoted fields are mangled and Windows gets an extra '\r' per row.
    with open(filepath, 'w', newline='', encoding=encoding) as outfile:
        writer = csv.writer(outfile, delimiter=';', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['EDU1_pos', 'EDU1_text', 'EDU2_pos', 'EDU2_text', 'Text_id', 'Relation'])
        for pair in edupairs:
            writer.writerow([pair.edu1.position, pair.edu1.text, pair.edu2.position,
                             pair.edu2.text, pair.text_id, pair.relation])

In [74]:
def generate_pairs_rs3(text_soup, text_id, window):
    """Build labelled EDU pairs for one parsed RS3 document.

    Each ``segment`` that does not end a sentence is paired with up to
    ``window`` following segments. Pairing stops after the first partner that
    ends a sentence.

    Parameters
    ----------
    text_soup : parsed document
        Must support ``find_all('segment')`` and ``find_all('group')``.
    text_id : any
        Identifier stored on every produced pair.
    window : int
        Maximum distance (in segments) between the two members of a pair.

    Returns
    -------
    list of EDUPair
    """
    # Use the document passed in; the previous code read a global ``soup``,
    # which fails on a fresh kernel or silently reuses a stale document.
    edus = text_soup.find_all('segment')
    groups = text_soup.find_all('group')
    pairs = []
    # Iterate over every segment: slicing bounds the window at the end of the
    # text, so the last ``window`` segments are no longer dropped as left
    # members of a pair.
    for i, left in enumerate(edus):
        if ends_sentence(left):
            continue
        for right in edus[i + 1:i + 1 + window]:
            pairs.append(EDUPair(*detect_relation(left, right, groups) + (text_id, )))
            if ends_sentence(right):
                break
    return pairs

In [79]:
def generate_matrix_all(window=5, dir_path='../corpus_rs3/corpus'):
    """Turn the whole corpus into EDU pairs and dump them to CSV.

    Reads every document from ``dir_path``, prints a ``current/total`` progress
    line per document, collects the pairs produced with the given ``window``
    (the document index serves as text id) and writes them with
    ``pairs_to_csv`` to its default location.
    """
    text_soups = read_corpus(dir_path)
    total = len(text_soups)
    all_pairs = []
    for text_id, text_soup in enumerate(text_soups):
        print('{}/{}'.format(text_id + 1, total))
        all_pairs += generate_pairs_rs3(text_soup, text_id, window)
    pairs_to_csv(all_pairs)

In [80]:
# Run the whole pipeline with the default window and corpus directory.
generate_matrix_all()

0/79
1/79
2/79
3/79
4/79
5/79
6/79
7/79
8/79
9/79
10/79
11/79
12/79
13/79
14/79
15/79
16/79
17/79
18/79
19/79
20/79
21/79
22/79
23/79
24/79
25/79
26/79
27/79
28/79
29/79
30/79
31/79
32/79
33/79
34/79
35/79
36/79
37/79
38/79
39/79
40/79
41/79
42/79
43/79
44/79
45/79
46/79
47/79
48/79
49/79
50/79
51/79
52/79
53/79
54/79
55/79
56/79
57/79
58/79
59/79
60/79
61/79
62/79
63/79
64/79
65/79
66/79
67/79
68/79
69/79
70/79
71/79
72/79
73/79
74/79
75/79
76/79
77/79
78/79
