In [29]:
import json
import nltk

from nltk.tokenize import word_tokenize

In [30]:
# Read the sentence records. The encoding is pinned to UTF-8 so that decoding
# does not depend on the platform's default locale encoding.
with open('../data/sentences.json', 'r', encoding='utf-8') as file:
    lines = json.load(file)

In [31]:
# Quick tokenizer check: lowercased tokens of the record at index 3.
list(map(str.lower, word_tokenize(lines[3]['sentence'])))

['these',
 'call',
 'for',
 'a',
 'better',
 'understanding',
 'of',
 'the',
 'mechanisms',
 'leading',
 'to',
 'maternal-fetal',
 'tolerance',
 '.']

In [32]:
# Echo the records loaded from sentences.json (this displays the whole list).
lines

[{'sentence': 'Development of the allogeneic fetus in the maternal uterus represents an immunological paradox.',
  'metaphors': [],
  'has_metaphor': 0},
 {'sentence': 'Successful pregnancy requires the maternal immune system to tolerate the semi-allogeneic fetus.',
  'metaphors': [],
  'has_metaphor': 0},
 {'sentence': 'A failure in immune tolerance may result in abnormal pregnancies, such as recurrent spontaneous abortion.',
  'metaphors': [],
  'has_metaphor': 0},
 {'sentence': 'These call for a better understanding of the mechanisms leading to maternal-fetal tolerance.',
  'metaphors': [],
  'has_metaphor': 0},
 {'sentence': 'As the only exception to the traditional immunological principles, maternal-fetal tolerance has always been the focus of attention in the fields of reproductive immunology.',
  'metaphors': [],
  'has_metaphor': 0},
 {'sentence': 'Embryos express paternal antigens that are foreign to the mother, but the mother provides a special immune milieu at the fetal-mate

In [33]:
# def generate_annotations(lines):
#     metaphors = ["invasion", "invaders", "invading", "soldier", "attack", "foreign", "battle", "weapon", "destroy"]

#     annotations = []

#     for line in lines:
#         sentence = line['sentence']
        
#         cursor = 0

#         annotation = [
#             sentence,
#             {
#                 'entities': []
#             }
#         ]

#         tokens = [token.lower() for token in word_tokenize(sentence)]

#         for token in tokens:
#             if token in metaphors:
#                 entity = [
#                     cursor,
#                     cursor + len(token),
#                     "MET"
#                 ]
#             else:
#                 entity = [
#                     cursor,
#                     cursor + len(token),
#                     "O"
#                 ]

#             cursor += (len(token) + 1)

#             annotation[1]['entities'].append(entity)

#         annotations.append(annotation)
    
#     return annotations

In [34]:
# spacy_annotations = generate_annotations(lines)

In [35]:
def generate_annotations(lines, metaphors=None):
    """Build per-token character-span annotations, grouped by metaphor flag.

    Each sentence is tokenized with ``word_tokenize`` and every token is
    located in the original sentence text, so the recorded ``[start, end]``
    offsets always satisfy ``sentence[start:end] == token`` even when the
    tokenizer splits punctuation off a word (e.g. ``"mother,"`` becoming
    ``"mother"`` and ``","``).

    Args:
        lines: Iterable of dicts with at least the keys ``'sentence'`` (str)
            and ``'has_metaphor'`` (compared against ``1``).
        metaphors: Optional iterable of words to label as metaphors. The
            match is case-insensitive. Defaults to the built-in list below.

    Returns:
        A tuple ``(met_annotations, non_met_annotations)``. Each element is a
        list of ``[sentence, {'entities': [[start, end, label], ...]}]``
        items, where ``label`` is ``"METAPHOR"`` or ``"O"``. Sentences whose
        ``has_metaphor`` equals 1 go in the first list, all others in the
        second.
    """
    if metaphors is None:
        metaphors = ["invasion", "invaders", "invading", "soldier", "attack", "foreign", "battle", "weapon", "destroy"]
    # Set lookup; lowercased so it lines up with the lowercased token test.
    metaphor_vocab = {word.lower() for word in metaphors}

    met_annotations = []
    non_met_annotations = []

    for line in lines:
        sentence = line['sentence']
        entities = []
        # Position in `sentence` just past the previously aligned token.
        cursor = 0

        for token in word_tokenize(sentence):
            surface = token
            start = sentence.find(surface, cursor)

            if start == -1 and token in ("``", "''"):
                # The tokenizer may rewrite a double quote as `` or '';
                # align such tokens with the original one-character quote.
                surface = '"'
                start = sentence.find(surface, cursor)

            if start == -1:
                # The token text does not occur in the remaining sentence, so
                # no trustworthy offset exists; emit nothing rather than a
                # misplaced span.
                continue

            end = start + len(surface)
            # Label strings match the declared classes ["O", "METAPHOR"].
            label = "METAPHOR" if token.lower() in metaphor_vocab else "O"
            entities.append([start, end, label])
            cursor = end

        annotation = [
            sentence,
            {
                'entities': entities
            }
        ]

        if line['has_metaphor'] == 1:
            met_annotations.append(annotation)
        else:
            non_met_annotations.append(annotation)

    return met_annotations, non_met_annotations

In [36]:
# Annotate every loaded sentence; the result is split into records flagged
# has_metaphor == 1 and all remaining records.
met_annotations, non_met_annotations = generate_annotations(lines)

In [37]:
# Build a 100-item sample: positions 0, 4, 8, ... take the metaphor annotation
# at that same position, every other position takes the non-metaphor one.
# Both lists are indexed with the shared position, so each must be long enough.
combined_annotations = []

for position in range(100):
    pool = met_annotations if position % 4 == 0 else non_met_annotations
    combined_annotations.append(pool[position])

In [38]:
# Label names stored under the 'classes' key of the exported annotation data.
# NOTE(review): these should match the label strings that generate_annotations
# puts into each entity — confirm.
classes = ["O", "METAPHOR"]

In [39]:
# Payload for the annotation export: the label list plus the sampled annotations.
spacy_annotations = dict(
    classes=classes,
    annotations=combined_annotations,
)

In [40]:
# Keep only the raw sentence text (first element) of each sampled annotation.
to_text_file = [entry[0] for entry in combined_annotations]

In [41]:
# combined_annotations[1]

In [42]:
# Persist the annotation payload as JSON. The encoding is explicit so that the
# file contents do not depend on the platform's default locale encoding.
with open('../data/spacy_annotations.json', 'w', encoding='utf-8') as json_file:
    json.dump(spacy_annotations, json_file)

# Write the sampled sentences one per line. UTF-8 keeps any non-ASCII
# characters in the sentence text writable on every platform.
with open('../data/spacy_feed.txt', 'w', encoding='utf-8') as f:
    for text in to_text_file:
        f.write(text)
        f.write('\n')

In [None]:
# pattern = r'\w+[\w\'\-\$]*|[\.,!?;()]+'

# tokens = nltk.regexp_tokenize(text, pattern)

# print(tokens)

In [88]:
# Lowercase one sample sentence and split it on single spaces only, so
# punctuation stays attached to the neighbouring word.
test_tokens = (
    "Embryos express paternal antigens that are foreign to the mother, but the mother provides a special immune milieu at the fetal-maternal interface to permit rather than reject the embryo growth in the uterus until parturition by establishing precise crosstalk between the mother and the fetus."
    .lower()
    .split(' ')
)
test_tokens

['embryos',
 'express',
 'paternal',
 'antigens',
 'that',
 'are',
 'foreign',
 'to',
 'the',
 'mother,',
 'but',
 'the',
 'mother',
 'provides',
 'a',
 'special',
 'immune',
 'milieu',
 'at',
 'the',
 'fetal-maternal',
 'interface',
 'to',
 'permit',
 'rather',
 'than',
 'reject',
 'the',
 'embryo',
 'growth',
 'in',
 'the',
 'uterus',
 'until',
 'parturition',
 'by',
 'establishing',
 'precise',
 'crosstalk',
 'between',
 'the',
 'mother',
 'and',
 'the',
 'fetus.']

In [70]:
# Derive a [start, end] pair for every token, assuming consecutive tokens are
# separated by exactly one character in the source text.
indexes = []
cursor = 0
for token in test_tokens:
    end = cursor + len(token)
    indexes.append([cursor, end])
    cursor = end + 1

In [71]:
# Show the [start, end] pairs computed for each token.
indexes

[[0, 7],
 [8, 15],
 [16, 24],
 [25, 33],
 [34, 38],
 [39, 42],
 [43, 50],
 [51, 53],
 [54, 57],
 [58, 64],
 [65, 66],
 [67, 70],
 [71, 74],
 [75, 81],
 [82, 90],
 [91, 92],
 [93, 100],
 [101, 107],
 [108, 114],
 [115, 117],
 [118, 121],
 [122, 136],
 [137, 146],
 [147, 149],
 [150, 156],
 [157, 163],
 [164, 168],
 [169, 175],
 [176, 179],
 [180, 186],
 [187, 193],
 [194, 196],
 [197, 200],
 [201, 207],
 [208, 213],
 [214, 225],
 [226, 228],
 [229, 241],
 [242, 249],
 [250, 259],
 [260, 267],
 [268, 271],
 [272, 278],
 [279, 282],
 [283, 286],
 [287, 292],
 [293, 294]]

In [84]:
# Explode the lowercased sample sentence into a list of single characters.
letter_tokens = list("Embryos express paternal antigens that are foreign to the mother, but the mother provides a special immune milieu at the fetal-maternal interface to permit rather than reject the embryo growth in the uterus until parturition by establishing precise crosstalk between the mother and the fetus.".lower())

# Print every span and rebuild the text it covers from the character list.
for start_index, end_index in indexes:
    print([start_index, end_index])
    X = ''.join(letter_tokens[start_index:end_index])

# Only the text rebuilt for the final span is printed here.
print(X)

[0, 7]
[8, 15]
[16, 24]
[25, 33]
[34, 38]
[39, 42]
[43, 50]
[51, 53]
[54, 57]
[58, 64]
[65, 66]
[67, 70]
[71, 74]
[75, 81]
[82, 90]
[91, 92]
[93, 100]
[101, 107]
[108, 114]
[115, 117]
[118, 121]
[122, 136]
[137, 146]
[147, 149]
[150, 156]
[157, 163]
[164, 168]
[169, 175]
[176, 179]
[180, 186]
[187, 193]
[194, 196]
[197, 200]
[201, 207]
[208, 213]
[214, 225]
[226, 228]
[229, 241]
[242, 249]
[250, 259]
[260, 267]
[268, 271]
[272, 278]
[279, 282]
[283, 286]
[287, 292]
[293, 294]



In [86]:
# Rebuild the text covered by characters 8 up to (not including) 15.
''.join(letter_tokens[8:15])

'express'

In [80]:
# NOTE(review): regen_token is not assigned in any visible cell — presumably it
# came from a cell that was later edited or removed; confirm before relying on it.
regen_token

''