# REALEC

In [2]:
import thesis_utils
import os
import spacy
import re
from collections import defaultdict, Counter
import csv

comet_ml is installed but `COMET_API_KEY` is not set.


In [3]:
def get_list_ids_tokens_gold (list_sentences):
    """Reduce every row of every sentence to a ``(token_id, token, gold)`` tuple.

    Args:
        list_sentences: iterable of sentences, each an iterable of indexable
            rows. The first element of a row is used as the token and the
            last element as the gold label.

    Returns:
        A list with one list per sentence. Each inner list holds tuples
        ``('T<n>', row[0], row[-1])`` where ``n`` is the 0-based position of
        the row inside its sentence.
    """
    return [
        [(f'T{position}', row[0], row[-1]) for position, row in enumerate(rows)]
        for rows in list_sentences
    ]

In [4]:
# Load the MultiGED REALEC dev split.
# NOTE(review): the helper presumably returns a list of sentences, each a list
# of rows with the token first and the gold label last - confirm in thesis_utils.
multiged_realec_dev =  thesis_utils.read_tsv_file_and_find_sentences_without_headers('./MULTI-GED2023 DATA/en_realec_dev.tsv')
# Keep only (sentence-local token id, first column, last column) for each row.
multiged_dev_info= get_list_ids_tokens_gold(multiged_realec_dev)
print(len(multiged_dev_info))

4067


In [5]:
# multiged_realec_test =  thesis_utils.read_tsv_file_and_find_sentences('./MULTI-GED2023 DATA/en_realec_test_unlabelled.tsv')
# multiged_test_tokens = [[token[0] for token in sentence] for sentence in multiged_realec_test]
# print(len(multiged_realec_test))

In [6]:
def clean_multiged(sentences_tokens_lists):
    """Strip leading backslashes from token texts, in place.

    Args:
        sentences_tokens_lists: list of sentences, each a mutable list of
            ``(token_id, token_text, label)`` triples.

    Returns:
        The same ``sentences_tokens_lists`` object. Entries whose text started
        with a backslash are replaced by a new triple with every leading
        backslash removed; all other entries are left untouched.
    """
    for sentence in sentences_tokens_lists:
        for position in range(len(sentence)):
            tok_id, text, gold = sentence[position]
            if text[:1] == '\\':
                sentence[position] = (tok_id, text.lstrip('\\'), gold)
    return sentences_tokens_lists


multiged_dev_info = clean_multiged(multiged_dev_info)

In [12]:
realec_dataset = './exam/Exam_to_examine'

In [14]:
nlp = spacy.load("en_core_web_md")

In [16]:
def find_overlapping_spans(token_start, token_end, annotations):
    """Return the annotations whose span overlaps ``[token_start, token_end)``.

    Args:
        token_start: start offset of the token.
        token_end: end offset of the token (exclusive).
        annotations: iterable of dicts with ``'start'`` and ``'end'`` keys.

    Returns:
        A list of the overlapping annotation dicts, in their original order.
        Spans that merely touch (one ends where the other starts) do not count
        as overlapping.
    """
    return [
        ann
        for ann in annotations
        if token_end > ann['start'] and token_start < ann['end']
    ]

# An error line: "T<id> <type> <start> <end> <text>".
wrong_re = re.compile(r'^T(\d+)\s+([a-zA-Z_]+)\s+(\d+)\s+(\d+)\s+(.+)$', re.IGNORECASE)
# A correction line: "#<n> AnnotatorNotes T<id> <correction>".
correct_re = re.compile(r'^#(\d+)\s+AnnotatorNotes\s+T(\d+)\s+(.+)$', re.IGNORECASE)

# One entry per spaCy sentence; each entry is a list of token rows
# [file name, 'T<n>', token text, type_1, correction_1, type_2, correction_2, ...].
all_sentences = []
# Largest number of error annotations overlapping a single token; used for padding below.
max_error_pairs = 0

for root_dir, _, files in os.walk(realec_dataset):
    for file in files:
        # Only real annotation files: skip Jupyter checkpoint copies and other extensions.
        if '.ipynb_checkpoints' in root_dir or '-checkpoint' in file or not file.endswith('.ann'):
            continue

        ann_path = os.path.join(root_dir, file)
        # Swap only the trailing extension. str.replace('.ann', '.txt') would also
        # rewrite any other '.ann' in the path (e.g. inside a directory name).
        txt_path = ann_path[:-len('.ann')] + '.txt'

        if not os.path.exists(txt_path):
            print(f"Missing .txt for: {ann_path}, skipping.")
            continue

        with open(txt_path, 'r', encoding='utf-8') as f_txt, \
             open(ann_path, 'r', encoding='utf-8') as f_ann:

            # Newlines and tabs become single spaces: a one-for-one character swap,
            # so the character offsets in the .ann file still line up with the text.
            txt_content = f_txt.read().replace('\n', ' ')
            txt_content = txt_content.replace('\t', ' ')
            doc = nlp(txt_content)

            error_annotations = []
            lines = f_ann.read().splitlines()

            i = 0
            while i < len(lines):
                line = lines[i].strip()
                # Blank lines and "#... lemma = ..." notes carry no error information.
                if not line or (line.startswith('#') and 'lemma =' in line.lower()):
                    i += 1
                    continue

                wrong_match = wrong_re.match(line)
                if wrong_match:
                    error_id = wrong_match.group(1)
                    error_type = wrong_match.group(2)
                    # Annotations whose type is written entirely in upper case are skipped.
                    if error_type.isupper():
                        i += 1
                        continue

                    wrong_beg = int(wrong_match.group(3))
                    wrong_end = int(wrong_match.group(4))
                    wrong_text = wrong_match.group(5)
                    correction = ""

                    # The correction, when present, is the AnnotatorNotes line directly
                    # after the error line and refers to the same T id.
                    if i + 1 < len(lines):
                        next_line = lines[i + 1].strip()
                        if not (next_line.startswith('#') and 'lemma =' in next_line.lower()):
                            correct_match = correct_re.match(next_line)
                            if correct_match and correct_match.group(2) == error_id:
                                correction = correct_match.group(3)
                                # Consume the correction line as well.
                                i += 1

                    error_annotations.append({
                        'start': wrong_beg,
                        'end': wrong_end,
                        'type': error_type,
                        'correction': correction
                    })
                i += 1

            # Token ids run over the whole document; they are not reset per sentence here.
            token_id = 0
            for sent in doc.sents:
                sent_tokens = []
                for token in sent:
                    # Every error span that overlaps this token's character range.
                    overlapping_errors = find_overlapping_spans(token.idx, token.idx + len(token.text), error_annotations)
                    max_error_pairs = max(max_error_pairs, len(overlapping_errors))

                    token_data = [file, f"T{token_id}", token.text]
                    token_id += 1
                    
                    for error in overlapping_errors:
                        token_data.extend([error['type'], error['correction']])

                    sent_tokens.append(token_data)

                all_sentences.append(sent_tokens)

# Pad every row with empty (type, correction) pairs so all rows have the same width.
for sent in all_sentences:
    for token_data in sent:
        while len(token_data) < 3 + max_error_pairs * 2:
            token_data.extend(['', ''])

print(len(all_sentences))

Missing .txt for: ./exam/Exam_to_examine/EGe_100-199/2017_EGe_13_2.ann, skipping.
113584


In [17]:
print(all_sentences[0])

[['2019_ABu_241_1.ann', 'T0', 'Three', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['2019_ABu_241_1.ann', 'T1', 'chats', 'Spelling', 'charts', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['2019_ABu_241_1.ann', 'T2', 'indicate', 'lex_item_choice', 'show', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['2019_ABu_241_1.ann', 'T3', 'the', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['2019_ABu_241_1.ann', 'T4', 'proportion', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['2019_ABu_241_1.ann', 'T5', 'of', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['2019_ABu_241_1.ann', 'T6', 'people', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['2019_ABu_241_1.ann', 'T7', 'of', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['2019_ABu_241_1.ann', 'T8', 'different', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['2019_ABu_241_1.ann', 'T9', 'ages

In [68]:
def split_tokens_and_renumber(sentences):
    """Split a leading '-' off tokens and renumber token ids per sentence.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of rows
            ``[file_name, token_id, token_text, *other_fields]``.

    Returns:
        A new list of sentences made of new row lists. A token longer than one
        character that starts with '-' becomes two rows ('-' and the rest),
        both carrying a copy of the original extra fields. Ids are rewritten
        as 'T0', 'T1', ... starting from zero in every sentence.
    """
    processed_sentences = []

    for sentence in sentences:
        renumbered = []

        for row in sentence:
            file_name, _, token_text = row[:3]
            extra = row[3:]

            if len(token_text) > 1 and token_text.startswith('-'):
                pieces = ['-', token_text[1:]]
            else:
                pieces = [token_text]

            # The next id always equals the number of rows emitted so far.
            for piece in pieces:
                renumbered.append([file_name, f'T{len(renumbered)}', piece, *extra])

        processed_sentences.append(renumbered)

    return processed_sentences


idk_if_it_works=split_tokens_and_renumber(all_sentences)

In [89]:
# Drop a trailing whitespace-only token from each sentence.
cleaned_sentences = [
    sent[:-1] if sent and sent[-1][2].strip() == '' else sent
    for sent in idk_if_it_works
]

# For every position, emit the sentence itself, then the sentence joined with
# the next one, then joined with the next two (windows of 1, 2 and 3 sentences).
merged_sentences = []
n = len(cleaned_sentences)

for i, sentence in enumerate(cleaned_sentences):
    merged_sentences.append(sentence)

    window = sentence
    for following in cleaned_sentences[i + 1:i + 3]:
        # `+` builds a fresh list, so earlier windows are never mutated.
        window = window + following
        merged_sentences.append(window)

In [12]:
def match_realec_to_multiged(sentences_realec, multiged_tokens):
    """Pair REALEC sentences with MultiGED sentences that have identical tokens.

    Sentences are compared by their exact token-text sequence. When a sequence
    occurs several times on either side, occurrences are paired in order
    (first with first, second with second, ...) until one side runs out.

    Args:
        sentences_realec: iterable of REALEC sentences; each sentence is a
            sequence of rows whose element at index 2 is the token text.
        multiged_tokens: iterable of MultiGED sentences; each sentence is a
            sequence of ``(token_id, token_text, label)`` triples.

    Returns:
        ``(matched_sentences, matched_sequences)``. ``matched_sentences`` is a
        list of sentences, each a list of tuples made of the REALEC row with
        the MultiGED label appended. ``matched_sequences`` is the set of token
        sequences (tuples of str) for which at least one pair was produced.
    """
    realec_by_sequence = defaultdict(list)
    for realec_sentence in sentences_realec:
        token_sequence = tuple(token_info[2] for token_info in realec_sentence)
        realec_by_sequence[token_sequence].append(realec_sentence)

    # Index the MultiGED side once, rather than rescanning every MultiGED
    # sentence (and rebuilding its token tuple) for each matched sequence.
    multiged_by_sequence = defaultdict(list)
    for multiged_sent in multiged_tokens:
        token_sequence = tuple(token[1] for token in multiged_sent)
        multiged_by_sequence[token_sequence].append(multiged_sent)

    matched_sentences = []
    matched_sequences = set()

    for token_sequence, realec_occurrences in realec_by_sequence.items():
        # .get() avoids inserting empty lists into the defaultdict.
        multiged_occurrences = multiged_by_sequence.get(token_sequence)
        if not multiged_occurrences:
            continue

        # zip stops at the shorter side, so surplus occurrences stay unpaired.
        for realec_sentence, multiged_sent in zip(realec_occurrences, multiged_occurrences):
            matched_sentences.append([
                tuple(token_info) + (multiged_label,)
                for token_info, (_, _, multiged_label) in zip(realec_sentence, multiged_sent)
            ])
        matched_sequences.add(token_sequence)

    return matched_sentences, matched_sequences


In [13]:
# Exact token-sequence matching of the merged REALEC windows against the dev split.
dev_matches_spacy, dev_sequences_spacy = match_realec_to_multiged(merged_sentences, multiged_dev_info)

print(f'Number of matched dev sentences: {len(dev_matches_spacy)}')       

# Only the dev sequences are used; the union with test sequences is disabled.
all_matched_sequences_spacy = dev_sequences_spacy #| test_sequences_spacy

# MultiGED dev sentences whose token sequence found no REALEC counterpart.
unmatched_multiged_dev_spacy = [
    sentence
    for sentence in multiged_dev_info
    if tuple(token[1] for token in sentence) not in all_matched_sequences_spacy
]

print(f'Dev sentences still to match: {len(unmatched_multiged_dev_spacy)}')


# REALEC windows whose token sequence was not matched to any dev sentence.
unmatched_realec_sentences = [
    realec_sentence
    for realec_sentence in merged_sentences
    if tuple(token_info[2] for token_info in realec_sentence) not in all_matched_sequences_spacy
]

print(f"Total unmatched REALEC sentences: {len(unmatched_realec_sentences)}")

Number of matched dev sentences: 4015
Dev sentences still to match: 52
Total unmatched REALEC sentences: 335354


In [14]:
print(dev_matches_spacy[2:4])

[[('2019_ABu_42_2.ann', 'T0', 'To', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2019_ABu_42_2.ann', 'T1', 'sum', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2019_ABu_42_2.ann', 'T2', 'up', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2019_ABu_42_2.ann', 'T3', ',', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2019_ABu_42_2.ann', 'T4', 'I', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2019_ABu_42_2.ann', 'T5', 'would', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2019_ABu_42_2.ann', 'T6', 'like', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2019_ABu_42_2.ann', 'T7', 'to', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2019_ABu_42_2.ann', 'T8', 'say', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2019_ABu_42_2.ann', 'T9', 'that', '', '', '', '', '', '

In [15]:
def extract_tokens_idx_and_filenames(list_of_lines_for_sentence):
    """Split sentence rows into parallel lists of tokens, token ids and file names.

    Args:
        list_of_lines_for_sentence: iterable of sentences; each sentence is an
            iterable of rows laid out as ``[file_name, token_id, token, ...]``.

    Returns:
        A tuple ``(tokens, ids, file_names)``. Each element is a list with one
        inner list per sentence, aligned position by position with the input.
    """
    all_sentences_tok, all_indexes, all_filenames = [], [], []

    for sent in list_of_lines_for_sentence:
        # Read each row once, then fan the three columns out.
        triples = [(line[0], line[1], line[2]) for line in sent]
        all_filenames.append([name for name, _, _ in triples])
        all_indexes.append([tok_idx for _, tok_idx, _ in triples])
        all_sentences_tok.append([token for _, _, token in triples])

    return all_sentences_tok, all_indexes, all_filenames
    
all_unmatched_dev_tok_sent,all_unmatched_realec_indexes, all_unmatched_realec_filenames = extract_tokens_idx_and_filenames(unmatched_realec_sentences)

In [16]:
# Keep only the unmatched REALEC sentences that have at least one token.
# Despite the "dev" in the names, these token lists come from
# unmatched_realec_sentences.
non_empty_triples = [
    (sentence, idx, name)
    for sentence, idx, name in zip(all_unmatched_dev_tok_sent, all_unmatched_realec_indexes, all_unmatched_realec_filenames)
    if sentence and idx and name
]

filtered_unmatched_dev_tok_sent = [sentence for sentence, _, _ in non_empty_triples]
filtered_unmatched_realec_indexes = [idx for _, idx, _ in non_empty_triples]
filtered_unmatched_realec_filenames = [name for _, _, name in non_empty_triples]

In [17]:
# Collapse every unmatched MultiGED dev sentence into three whitespace-free
# strings so it can be compared regardless of tokenisation:
#   (token id repeated once per character, concatenated token text, concatenated labels)
sentences_as_single_strings_multiged_dev = []

for sentence in unmatched_multiged_dev_spacy:
    cleaned_sentence = ''
    sentence_indices = ''

    for token in sentence:
        token_idx = token[0]
        token_text = token[1]

        # An escaped double quote is stored as a bare quote.
        clean_text = '"' if token_text == '\\"' else token_text
        cleaned_sentence += clean_text
        # Repeat the id once per character actually written, so the id string
        # stays in step with cleaned_sentence.
        sentence_indices += token_idx * len(clean_text)

    # The labels depend only on the sentence, so they are joined once, after the token loop.
    cleaned_labels = ''.join(token[-1].replace('\\\\\\', '') for token in sentence)

    sentences_as_single_strings_multiged_dev.append((sentence_indices,cleaned_sentence, cleaned_labels))

In [18]:
print(sentences_as_single_strings_multiged_dev[2])

('T0T0T0T0T0T0T1T1T2T2T2T2T2T2T2T3T3T4T4T4T4T4T4T4T5T5T5T6T6T6T6T6T6T7T7T7T8T9T9T9T9T9T9T9T9T9T9T10T10T10T11T11T12T12T12T12T12T12T12T12T13T13T13T14T14T14T14T14T14T15T15T15T15T16T16T16T17', "Spring'smonthesinYakutskarefreezytoo,tempreturecanbepossiblenothigherthen0'C.", 'ccicccicciccicccic')


In [19]:
# Collapse each unmatched REALEC sentence into (file name, concatenated text,
# token id repeated once per character). After every sentence except the first,
# also add the previous sentence joined with the current one.
sentences_as_single_strings_realec = []
prev_cleaned = None
prev_clean_indices = None
prev_filename = None

for sentence, idx_list, filename in zip(filtered_unmatched_dev_tok_sent, filtered_unmatched_realec_indexes, filtered_unmatched_realec_filenames):
    # Pair each token id with its text, with triple-backslash runs removed.
    id_text_pairs = [
        (idx, token.replace('\\\\\\', ''))
        for idx, token in zip(idx_list, sentence)
    ]
    cleaned = ''.join(text for _, text in id_text_pairs)
    clean_indices = ''.join(idx * len(text) for idx, text in id_text_pairs)

    sentences_as_single_strings_realec.append((filename[0], cleaned, clean_indices))

    if not (prev_cleaned is None or prev_clean_indices is None):
        # The joined entry is filed under the previous sentence's file name.
        sentences_as_single_strings_realec.append(
            (prev_filename[0], prev_cleaned + cleaned, prev_clean_indices + clean_indices)
        )

    prev_cleaned, prev_clean_indices, prev_filename = cleaned, clean_indices, filename

In [20]:
# Compare the whitespace-free REALEC strings with the whitespace-free MultiGED
# strings.
#   set_most_possible_multi_idx: exact matches, as
#       (file name, REALEC id string, sentence text, MultiGED labels).
#       Despite its name this is a list.
#   possible: near matches, as (file name, REALEC text, MultiGED text), where the
#       REALEC text is a substring of a MultiGED text at most 4 characters longer.
#   matched_sentences: (MultiGED text, labels) pairs already matched exactly.
set_most_possible_multi_idx = list()
possible = []
matched_sentences = set()
for name, sentence1, idx in sentences_as_single_strings_realec:
    exact_match_found = False
    
    for multi_idx, sentence2, label in sentences_as_single_strings_multiged_dev:
        if sentence1 == sentence2:
            set_most_possible_multi_idx.append((name, idx, sentence1, label))
            matched_sentences.add((sentence2, label))
            exact_match_found = True
            break
    
    if not exact_match_found:
        # Unpack the triple. Iterating the bare 3-tuple made the membership test,
        # the substring test and the length test all operate on the tuple rather
        # than on the sentence text, so no near match could ever be recorded.
        for _, sentence2, label in sentences_as_single_strings_multiged_dev:
            if (sentence2, label) in matched_sentences:
                continue
            if sentence1 in sentence2:
                if abs(len(sentence2) - len(sentence1)) > 4:
                    continue
                possible.append((name, sentence1, sentence2))

In [26]:
print(len(set_most_possible_multi_idx))

3


In [29]:
for sentence in set_most_possible_multi_idx:
    print(sentence)
    print()

('2017_NMya_21_1.ann', 'T0T0T0T0T0T0T0T0T0T1T1T2T2T2T3T3T3T3T3T3T3T3T3T4T5T5T5T5T5T5T5T5T6T6T7T7T7T8T8T8T8T9T9T9T9T9T9T9T10T10T10T10T10T10T11T11T11T11T11T11T11T12T12T13T13T13T14T14T15T15', 'Accordingtothestatistic,FacebookisthemostpopularsocialnetworkintheU.S.', 'ccciccccccccccccc')

('2017_NMya_8_1.ann', 'T0T0T0T1T1T1T1T1T1T1T1T1T2T2T2T3T3T3T3T3T4T4T4T4T4T4T4T4T4T4T4T5T5T5T6T6T6T7T7T8T8T8T8T8T9T9T9T9T9T9T10T10T10T10T10T10T10T10T11T12T12T12T12T12T12T12T12T13T14T14T14T14T14T14T14T14T14T15T15T15T16T16T16T16T16T16T16T16T17T17T17T17T17T18T18T18T18T19T19T19T19T19T19T20T20T20T20T20T20T20T21T21T21T21T22T23T23T23T24T24T24T24T24T24T25', 'Thefollowingbarchartillustratestheuseofmajorsocialnetworks:Facebook,InstagramandLinkedInamongU.S.adultsdividedinto4agegroups.', 'cccccccccccccccccciiccccccc')

('2017_EGe_247_1.ann', 'T0T0T0T0T0T0T1T1T2T2T2T2T2T2T2T3T3T4T4T4T4T4T4T4T5T5T5T6T6T6T6T6T6T7T7T7T8T9T9T9T9T9T9T9T9T9T9T10T10T10T11T11T12T12T12T12T12T12T12T12T13T13T13T14T14T14T14T14T14T15T15T15T15T16T16T

In [30]:
def process_data(data_list):
    """Rebuild per-token rows from whitespace-free match records.

    Args:
        data_list: iterable of ``(filename, indices, sentence, gold_labels)``.
            ``indices`` is a string of token ids such as ``'T0T0T1'``, one id
            per character of ``sentence``. ``gold_labels`` is indexed by
            token position.

    Returns:
        A list with one ``{filename: rows}`` dict per record, where ``rows`` is
        a list of ``[token_id, word, label]``. Records whose ``indices`` hold
        no ids are skipped. Rebuilding a record stops early as soon as the
        next word would run past the end of ``sentence``.
    """
    output = []

    for filename, indices, sentence, gold_labels in data_list:
        # Collect the digit run that directly follows each 'T'.
        numeric_indices = []
        digits = ''
        collecting = False
        for ch in indices:
            if ch == 'T':
                if digits:
                    numeric_indices.append(digits)
                digits = ''
                collecting = True
            elif collecting and ch.isdigit():
                digits += ch
            else:
                if digits:
                    numeric_indices.append(digits)
                digits = ''
                collecting = False
        if digits:
            numeric_indices.append(digits)

        if not numeric_indices:
            continue

        # Run-length encode consecutive equal ids: [id, number of characters].
        runs = []
        for num in numeric_indices:
            if runs and runs[-1][0] == num:
                runs[-1][1] += 1
            else:
                runs.append([num, 1])

        word_index_pairs = []
        start = 0
        for position, (num, run_length) in enumerate(runs):
            stop = start + run_length
            if stop > len(sentence):
                break
            word_index_pairs.append([f"T{num}", sentence[start:stop], gold_labels[position]])
            start = stop

        output.append({filename: word_index_pairs})

    return output

In [31]:
#processed_most_possible_matches_realec = process_data(set_most_possible_realec_idx)
processed_most_possible_matches= process_data(set_most_possible_multi_idx)

In [32]:
print(processed_most_possible_matches)

[{'2017_NMya_21_1.ann': [['T0', 'According', 'c'], ['T1', 'to', 'c'], ['T2', 'the', 'c'], ['T3', 'statistic', 'i'], ['T4', ',', 'c'], ['T5', 'Facebook', 'c'], ['T6', 'is', 'c'], ['T7', 'the', 'c'], ['T8', 'most', 'c'], ['T9', 'popular', 'c'], ['T10', 'social', 'c'], ['T11', 'network', 'c'], ['T12', 'in', 'c'], ['T13', 'the', 'c'], ['T14', 'U.', 'c'], ['T15', 'S.', 'c']]}, {'2017_NMya_8_1.ann': [['T0', 'The', 'c'], ['T1', 'following', 'c'], ['T2', 'bar', 'c'], ['T3', 'chart', 'c'], ['T4', 'illustrates', 'c'], ['T5', 'the', 'c'], ['T6', 'use', 'c'], ['T7', 'of', 'c'], ['T8', 'major', 'c'], ['T9', 'social', 'c'], ['T10', 'networks', 'c'], ['T11', ':', 'c'], ['T12', 'Facebook', 'c'], ['T13', ',', 'c'], ['T14', 'Instagram', 'c'], ['T15', 'and', 'c'], ['T16', 'LinkedIn', 'c'], ['T17', 'among', 'c'], ['T18', 'U.S.', 'i'], ['T19', 'adults', 'i'], ['T20', 'divided', 'c'], ['T21', 'into', 'c'], ['T22', '4', 'c'], ['T23', 'age', 'c'], ['T24', 'groups', 'c'], ['T25', '.', 'c']]}, {'2017_EGe_247_1.

In [33]:
# Carry the recovered labels back onto the full REALEC rows. For each processed
# sentence, take the first unmatched REALEC sentence in which at least one row
# agrees on file name, token id and word, and keep only the agreeing rows with
# the label appended.
all_ = []

for pr_sentence in processed_most_possible_matches:
    for pr_filename, pr_lines in pr_sentence.items():
        for dev_sentence in unmatched_realec_sentences:
            matched_sentence = []
            for dev_line in dev_sentence:
                if dev_line[0] != pr_filename:
                    continue
                # First processed row with the same token id and word, if any.
                hit = next(
                    (
                        pr_line
                        for pr_line in pr_lines
                        if pr_line[0] == dev_line[1] and pr_line[1] == dev_line[2]
                    ),
                    None,
                )
                if hit is not None:
                    matched_sentence.append(tuple(dev_line + [hit[-1]]))
            if matched_sentence:
                all_.append(matched_sentence)
                # Stop scanning REALEC sentences for this processed sentence.
                break


In [64]:
print(all_)

[[('2017_NMya_21_1.ann', 'T0', 'According', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2017_NMya_21_1.ann', 'T1', 'to', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2017_NMya_21_1.ann', 'T2', 'the', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2017_NMya_21_1.ann', 'T3', 'statistic', 'Spelling', 'statistics', 'Category_confusion', 'statistics', '', '', '', '', '', '', '', '', '', '', '', '', 'i'), ('2017_NMya_21_1.ann', 'T4', ',', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2017_NMya_21_1.ann', 'T5', 'Facebook', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2017_NMya_21_1.ann', 'T6', 'is', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2017_NMya_21_1.ann', 'T7', 'the', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'c'), ('2017_NMya_21_1.ann', 'T8', 'most', '', '', '', '', '', '', '', '', '', '', '', '', '', ''

In [65]:
# Add the recovered sentences to the exact matches. list.extend mutates
# dev_matches_spacy in place, so running this cell again appends the same
# sentences a second time.
dev_matches_spacy.extend(all_)

In [55]:
print(len(data))

4015


In [53]:
data = [[list(token) for token in sentence] for sentence in dev_matches_spacy]

In [60]:
# num_columns = len(dev_matches_spacy[0][0])

# headers = ['filename', 'token'] + [f'col{i}' for i in range(1, num_columns - 11)] + ['gold_label']

# with open('processed_realec.tsv', 'w', newline='', encoding='utf-8') as f_out:
#     writer = csv.writer(f_out, delimiter='\t')
#     writer.writerow(headers)

#     for sentence in dev_matches_spacy:
#         for row in sentence:
#             row_without_id = [row[0], row[2]] + list(row[3:])
#             writer.writerow(row_without_id)
#         writer.writerow([])

# Write the matched sentences as TSV: file name, token, the first eight
# annotation fields (row[3:11]) and the gold label, with a blank line after
# every sentence. The token id at index 1 is not written.
with open("processed_realec.tsv", "w", encoding="utf-8") as f_out:
    headers = ["filename", "token", *(f"col{i+1}" for i in range(8)), "gold_label"]
    f_out.write("\t".join(headers) + "\n")

    for sentence in dev_matches_spacy:
        for token in sentence:
            # The slice is unpacked into the row; the gold label is the last element.
            row = [token[0], token[2], *token[3:11], token[-1]]
            f_out.write("\t".join(row) + "\n")

        f_out.write("\n")



### from collections import Counter

# Sanity check on the written file: count how many non-blank lines have each
# number of tab-separated columns.
column_counts = Counter()

with open("processed_realec.tsv", encoding="utf-8") as f:
    column_counts.update(
        len(line.strip().split('\t'))
        for line in f
        if line.strip()
    )

print("Column count frequencies:")
for num_cols, count in sorted(column_counts.items()):
    print(f"{num_cols} columns: {count} lines")

In [76]:
# Author's note: 49 sentences are still not aligned; the others are fixed.
# This list holds the token texts of every sentence in unmatched_multiged_dev_spacy.
sentences = [
    [token_triple[1] for token_triple in sentence]
    for sentence in unmatched_multiged_dev_spacy
]

In [78]:
print(len(sentences))

52


In [80]:
for sentence in sentences:
    print(sentence)
    print()

['To', 'sum', 'up', ',', 'all', 'three', 'trends', 'went', 'up', 'and', 'the', 'number', 'of', 'people', 'aged', '65', 'and', 'over', 'became', 'the', 'biggest', 'among', 'the', 'Japanese', '.']

['The', 'bar', 'chart', 'illustrates', 'the', 'difference', 'in', 'the', 'rate', 'of', 'unemployment', 'in', 'two', 'years', '(', '2014', 'and', '2015', ')', 'in', '5', 'regions']

['Spring', "'s", 'monthes', 'in', 'Yakutsk', 'are', 'freezy', 'too', ',', 'tempreture', 'can', 'be', 'possible', 'not', 'higher', 'then', "0'C", '.']

['The', 'The', 'same', 'same', 'thing', 'thing', 'happens', 'happens', 'with', 'with', 'buildings', 'buildings', '-', '-', 'for', 'for', 'most', 'most', 'of', 'of', 'the', 'the', 'people', 'people', 'it', 'it', 'would', 'would', 'be', 'be', 'more', 'more', 'comfortable', 'comfortable', 'to', 'to', 'visit', 'visit', 'a', 'a', 'good', 'good', '-', '-', 'looking', 'looking', ',', ',', 'bright', 'bright', '-', '-', 'coloured', 'coloured', 'building', 'building', ',', ',',