# Generate Train txt file for thunlp/TensorFlow-NRE from wikipedia-biography dataset

In [43]:
# Checking Python Version 3+ 
import sys
print(sys.version)

3.6.2 |Anaconda custom (64-bit)| (default, Jul 20 2017, 13:51:32) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


## Link files

train.title -1-1-> train.box -1-1-> train.nb -[number]-1-> train.sent


## Resule Schema:
title, non_na_box, accumulated_sent_context

## Sample Training Data from thunlp/TensorFlow-NRE:
#### Format: (fb_mid_e1, fb_mid_e2, e1_name, e2_name, relation, sentence)   

### Sample:    

fb_mid_e1 - m.0ccvx   
fb_mid_e2 - m.05gf08   
e1_name   - queens   
e2_name   - belle_harbor   
relation  - /location/location/contains   
sentence  - sen. charles e. schumer called on federal safety officials yesterday to reopen their investigation into the fatal crash of a passenger jet in belle_harbor , queens , because equipment failure , not pilot error , might have been the cause . ###END###   


In [44]:
# wikipedia biography training files
# substitute test and validate files
data_type = 'valid' # test, valid
train_title_file = data_type + '/' + data_type + ".title"
train_nb_file    = data_type + '/' + data_type + ".nb"
train_sent_file  = data_type + '/' + data_type + ".sent"
train_box_file   = data_type + '/' + data_type + ".box"

In [45]:
# add line indexer for the sent file
nbs_dict = {}
with open(train_sent_file) as sent:
    for line, content in enumerate(sent):
        nbs_dict[line] = content
    
nbs_dict.get(0)

'pope michael iii of alexandria -lrb- also known as khail iii -rrb- was the coptic pope of alexandria and patriarch of the see of st. mark -lrb- 880 -- 907 -rrb- .\n'

In [46]:
# remove words from a context string, i.e., -lrb-
useless_words_to_remove = ['-lrb-', '-rrb-', '.\n', '']

def cleanUpSentence(input_sent):
    keywords_to_remove = useless_words_to_remove
    querywords = input_sent.split()
    resultwords  = [word for word in querywords if word.lower() not in keywords_to_remove]
    result = ' '.join(resultwords)
    return result

In [47]:
send_index = 0

sent_dict = {}

with open(train_nb_file) as nbs:
    accumulated_lines = 0
    for nb in nbs:
        current_lines_to_read = int(nb)
        
        current_accumulated_sent_context = ''
        
        for i in range(current_lines_to_read):
            line_to_read = accumulated_lines + i
            current_accumulated_sent_context += cleanUpSentence(nbs_dict.get(line_to_read))
        
        accumulated_lines += current_lines_to_read

        sent_dict[send_index] = current_accumulated_sent_context
        send_index += 1      

In [48]:
sent_dict[1]

'hui jun is a male former table tennis player from china .'

## Get top 100 most shown relationships

In [49]:
import re
from collections import defaultdict

In [50]:
relations_stat_dict = defaultdict(int)

with open(train_box_file) as boxes:
    for one_entry in boxes:
        
        all_target_attributes = re.split(r'\t+', one_entry)
        
        filtered_attrs = [attr for attr in all_target_attributes if '<none>' not in attr]
        
        for attr in filtered_attrs:
            if '_1:' in attr:
                attr_label = attr.split(':')[0][:-2]
                relations_stat_dict[attr_label] += 1

In [51]:
sumVal = 0
for i in relations_stat_dict:
    value = relations_stat_dict[i]

In [52]:
top_most_shown_relations = {} # relation_label, number_of_times_it_shown
sortedValues = sorted(relations_stat_dict.values(), reverse=True)
max_value = sortedValues[0]
top_50_value = sortedValues[49]
for attr_label in relations_stat_dict:
    attr_rep = relations_stat_dict[attr_label]
    if attr_rep >= top_50_value:
        top_most_shown_relations[attr_label] = attr_rep

In [53]:
print(top_most_shown_relations)

{'name': 66734, 'predecessor': 6966, 'successor': 6266, 'birth_place': 57282, 'death_date': 21656, 'nationality': 13886, 'religion': 3968, 'residence': 4581, 'article_title': 72831, 'fullname': 13223, 'height': 13002, 'birth_date': 63212, 'currentclub': 6813, 'clubnumber': 4161, 'position': 19150, 'years': 12920, 'clubs': 11739, 'caps': 10374, 'goals': 10775, 'pcupdate': 4466, 'term_start': 7742, 'term_end': 6418, 'party': 6705, 'alma_mater': 6891, 'spouse': 8928, 'image': 26790, 'caption': 15578, 'occupation': 18943, 'years_active': 7638, 'image_size': 6682, 'debutyear': 3232, 'background': 5886, 'origin': 3587, 'genre': 6299, 'label': 3775, 'associated_acts': 3161, 'death_place': 16746, 'birth_name': 9367, 'nationalyears': 5278, 'nationalteam': 5685, 'nationalcaps': 5051, 'nationalgoals': 5028, 'youthyears': 4125, 'youthclubs': 5942, 'website': 3952, 'children': 4527, 'weight': 4594, 'office': 5115, 'known_for': 3347, 'awards': 3727}


## Generate the relation2id.txt file

In [54]:
#### Remove the following relation labels for speeding up:
del top_most_shown_relations['clubs']
del top_most_shown_relations['years']
del top_most_shown_relations['image']
del top_most_shown_relations['name']

In [55]:
if data_type == 'train':
    relation2idFile = open(data_type+'/relation2id.generate.txt', "w") 
    relation2idFile.write('NA 0\n')
    relationid = 1
    for relation in top_most_shown_relations:
        relation2idFile.write(relation+' '+str(relationid)+'\n')
        relationid += 1
    relation2idFile.close()

## Get the Concatenated Relation Labels:
Current, the relation labels are splitted in the wikipedia biography dataset.  
In order to adapt to NRE code, we have to concatenate all splitted labels into one single string with underscore in between.  

In [56]:
real_relation_label_and_value_list = []

counter = 0

with open(train_box_file) as boxes:
    for one_entry in boxes:
        
        all_target_attributes = re.split(r'\t+', one_entry)
        filtered_attrs = [attr for attr in all_target_attributes if '<none>' not in attr]
        
        current_box_dict = {}
        for oneLabel in filtered_attrs:
            labelStringOnly = oneLabel.split(':')[0].split('_')[0]
            if labelStringOnly in top_most_shown_relations:
                currentValue = cleanUpSentence(oneLabel.split(':')[1])
                if labelStringOnly in current_box_dict:
                    current_box_dict[labelStringOnly] = current_box_dict[labelStringOnly] + '_' + currentValue
                else:
                    current_box_dict[labelStringOnly] = currentValue
               
        real_relation_label_and_value_list.append(current_box_dict)

In [57]:
print(len(real_relation_label_and_value_list))
print(len(real_relation_label_and_value_list) == len(sent_dict))

72831
True


In [58]:
real_relation_label_and_value_list[0]

{'nationality': 'egyptian',
 'predecessor': 'shenouda_i',
 'religion': 'coptic_orthodox_christian',
 'residence': "saint_mark_'s_church",
 'successor': 'gabriel_i'}

## Generate the training data txt file:

#### Read the titles into a list, in original order:

In [59]:
title_list = []
with open(train_title_file) as titles:
    for title in titles:
        title_arr = title.split(' ')
        current_title = ''
        for t in title_arr:
            current_title += t+'_'
        title_list.append(current_title[:-2])

#### Verify that the three lists (title_list, sent_dict, and real_relation_label_and_value_list) start with index 0:

In [60]:
title_list[3]

'marie_stephan'

In [61]:
sent_dict[3]

'marie stephan , born march 14 , 1996 is a professional squash player who represents france .she reached a career-high world ranking of world no. 101 in july 2015 .'

In [62]:
for i in real_relation_label_and_value_list[0]:
    print(real_relation_label_and_value_list[0][i])

shenouda_i
gabriel_i
egyptian
coptic_orthodox_christian
saint_mark_'s_church


#### Join three list by the biography title and write to text file:
This is going to generate a 1+ GB large file

In [63]:
resultFile = open(data_type+'/'+data_type+".generate.txt", "w") 

index = 0

for relationAndValueEntry in real_relation_label_and_value_list:
    for relationName in relationAndValueEntry:
        resultFile.write('SH\tSH\t')
        resultFile.write(title_list[index])
        resultFile.write('\t')
        resultFile.write(relationAndValueEntry[relationName])
        resultFile.write('\t')
        resultFile.write(relationName)
        resultFile.write('\t')
        resultFile.write(sent_dict[index])
        resultFile.write(' ###END###\n')
    index += 1

resultFile.close()