# Generate Train txt file for thunlp/TensorFlow-NRE from wikipedia-biography dataset

In [1]:
# Checking Python Version 3+ 
import sys
print(sys.version)

3.6.2 |Anaconda custom (64-bit)| (default, Jul 20 2017, 13:51:32) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


## Link files

train.title -1-1-> train.box -1-1-> train.nb -[number]-1-> train.sent


## Resule Schema:
title, non_na_box, accumulated_sent_context

## Sample Training Data from thunlp/TensorFlow-NRE:
#### Format: (fb_mid_e1, fb_mid_e2, e1_name, e2_name, relation, sentence)   

### Sample:    

fb_mid_e1 - m.0ccvx   
fb_mid_e2 - m.05gf08   
e1_name   - queens   
e2_name   - belle_harbor   
relation  - /location/location/contains   
sentence  - sen. charles e. schumer called on federal safety officials yesterday to reopen their investigation into the fatal crash of a passenger jet in belle_harbor , queens , because equipment failure , not pilot error , might have been the cause . ###END###   


In [2]:
# wikipedia biography training files
# substitute test and validate files
data_type = 'test' # test, valid
train_title_file = data_type + '/' + data_type + ".title"
train_nb_file    = data_type + '/' + data_type + ".nb"
train_sent_file  = data_type + '/' + data_type + ".sent"
train_box_file   = data_type + '/' + data_type + ".box"

In [3]:
# add line indexer for the sent file
nbs_dict = {}
with open(train_sent_file) as sent:
    for line, content in enumerate(sent):
        nbs_dict[line] = content
    
nbs_dict.get(0)

'leonard shenoff randle -lrb- born february 12 , 1949 -rrb- is a former major league baseball player .\n'

In [4]:
# remove words from a context string, i.e., -lrb-
useless_words_to_remove = ['-lrb-', '-rrb-', '.\n', '']

def cleanUpSentence(input_sent):
    keywords_to_remove = useless_words_to_remove
    querywords = input_sent.split()
    resultwords  = [word for word in querywords if word.lower() not in keywords_to_remove]
    result = ' '.join(resultwords)
    return result

In [5]:
send_index = 0

sent_dict = {}

with open(train_nb_file) as nbs:
    accumulated_lines = 0
    for nb in nbs:
        current_lines_to_read = int(nb)
        
        current_accumulated_sent_context = ''
        
        for i in range(current_lines_to_read):
            line_to_read = accumulated_lines + i
            current_accumulated_sent_context += cleanUpSentence(nbs_dict.get(line_to_read))
        
        accumulated_lines += current_lines_to_read

        sent_dict[send_index] = current_accumulated_sent_context
        send_index += 1      

In [6]:
sent_dict[0]

'leonard shenoff randle born february 12 , 1949 is a former major league baseball player .he was the first-round pick of the washington senators in the secondary phase of the june 1970 major league baseball draft , tenth overall .'

In [8]:
from collections import defaultdict
sent_length_dict = defaultdict(int)
for t in sent_dict:
    i = sent_dict[t]
    if len(i) < 100:
        sent_length_dict['100-'] += 1
    if len(i) < 150:
        sent_length_dict['100-150'] += 1
    elif len(i) < 200:
        sent_length_dict['150-200'] += 1
    elif len(i) < 500:
        sent_length_dict['200-500'] += 1
    else:
        sent_length_dict['500+'] += 1

In [9]:
sent_length_dict

defaultdict(int,
            {'100-': 4339,
             '100-150': 14414,
             '150-200': 8933,
             '200-500': 27159,
             '500+': 22325})

In order to make the training more efficient, we want to limit the length of text upto 150 char.

## Get top most shown relationships

In [10]:
import re
from collections import defaultdict

In [11]:
relations_stat_dict = defaultdict(int)

with open(train_box_file) as boxes:
    for one_entry in boxes:
        
        all_target_attributes = re.split(r'\t+', one_entry)
        
        filtered_attrs = [attr for attr in all_target_attributes if '<none>' not in attr]
        
        for attr in filtered_attrs:
            if '_1:' in attr:
                attr_label = attr.split(':')[0][:-2]
                relations_stat_dict[attr_label] += 1

In [12]:
top_relation_upto = 95

top_most_shown_relations = {} # relation_label, number_of_times_it_shown
sortedValues = sorted(relations_stat_dict.values(), reverse=True)
max_value = sortedValues[0]
top_top_value = sortedValues[top_relation_upto]
print('Max Reps = '+str(max_value)+' minimum reps = '+str(top_top_value))

for attr_label in relations_stat_dict:
    attr_rep = relations_stat_dict[attr_label]
    if attr_rep >= top_top_value:
        top_most_shown_relations[attr_label] = attr_rep

Max Reps = 72831 minimum reps = 1264


## Generate the relation2id.txt file

In [13]:
#### Remove the following relation labels for speeding up:
del top_most_shown_relations['clubs']
del top_most_shown_relations['years']
del top_most_shown_relations['image']
del top_most_shown_relations['name']
del top_most_shown_relations['statlabel']
del top_most_shown_relations['label']

In [14]:
print('Top '+str(len(top_most_shown_relations))+' mostly shown relations:')
print('Minimum repetitions of an relationship = '+str(top_top_value))
print(top_most_shown_relations)

Top 90 mostly shown relations:
Minimum repetitions of an relationship = 1264
{'position': 19386, 'bats': 2114, 'throws': 2118, 'birth_date': 63231, 'birth_place': 57087, 'debutdate': 2153, 'debutyear': 3285, 'debutteam': 2916, 'finaldate': 1644, 'finalyear': 2513, 'finalteam': 2249, 'statvalue': 3103, 'article_title': 72831, 'caption': 15338, 'residence': 4438, 'office': 5068, 'term_start': 7567, 'term_end': 6308, 'party': 6477, 'occupation': 18850, 'nationality': 13920, 'image_size': 6786, 'fullname': 13399, 'height': 12948, 'death_date': 21696, 'death_place': 16802, 'youthyears': 4163, 'youthclubs': 6033, 'caps': 10477, 'goals': 10875, 'nationalyears': 5350, 'nationalteam': 5787, 'nationalcaps': 5106, 'nationalgoals': 5080, 'background': 5965, 'birth_name': 9340, 'origin': 3620, 'instrument': 2784, 'genre': 6349, 'years_active': 7624, 'associated_acts': 3173, 'predecessor': 6879, 'successor': 6212, 'religion': 3923, 'parents': 1569, 'alma_mater': 6860, 'currentclub': 6975, 'clubnumbe

In [15]:
if data_type == 'train':
    relation2idFile = open(data_type+'/relation2id.generate.txt', "w") 
    relation2idFile.write('NA 0\n')
    relationid = 1
    for relation in top_most_shown_relations:
        relation2idFile.write(relation+' '+str(relationid)+'\n')
        relationid += 1
    relation2idFile.close()

## Get the Concatenated Relation Labels:
Current, the relation labels are splitted in the wikipedia biography dataset.  
In order to adapt to NRE code, we have to concatenate all splitted labels into one single string with underscore in between.  

In [16]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

In [17]:
real_relation_label_and_value_list = []
chars_to_eliminate = ['\'s',',','--',',','\w','\t','\n','i','e.','m.','$','&','%','?','w.','.','-rsb-','\'\'','-lsb-','-']
counter = 0

with open(train_box_file) as boxes:
    for one_entry in boxes:
        
        all_target_attributes = re.split(r'\t+', one_entry)
        filtered_attrs = [attr for attr in all_target_attributes if '<none>' not in attr]
        
        current_box_dict = {}
        for oneLabel in filtered_attrs:
            labelStringOnly = oneLabel.split(':')[0].split('_')[0]
            
            if labelStringOnly in top_most_shown_relations:
                currentValue = cleanUpSentence(oneLabel.split(':')[1])
                if not hasNumbers(currentValue) and currentValue not in chars_to_eliminate and len(currentValue) > 0:
                    if labelStringOnly in current_box_dict:
                        current_box_dict[labelStringOnly] = current_box_dict[labelStringOnly] + '_' + currentValue
                    else:
                        current_box_dict[labelStringOnly] = currentValue
               
        real_relation_label_and_value_list.append(current_box_dict)

In [18]:
print(len(real_relation_label_and_value_list))
print(len(real_relation_label_and_value_list) == len(sent_dict))

72831
True


## Generate the training data txt file:

#### Read the titles into a list, in original order:

In [19]:
title_list = []
with open(train_title_file) as titles:
    for title in titles:
        title_arr = title.split(' ')
        current_title = ''
        for t in title_arr:
            current_title += t+'_'
        title_list.append(current_title[:-2])

#### Verify that the three lists (title_list, sent_dict, and real_relation_label_and_value_list) start with index 0:

In [20]:
title_list[2]

'miroslav_popov'

In [21]:
sent_dict[2]

'miroslav popov born 14 june 1995 in dvůr králové nad labem is a czech grand prix motorcycle racer .he currently races in the fim cev moto2 championship for montaze broz racing team aboard a suter .'

In [22]:
for i in real_relation_label_and_value_list[2]:
    print(real_relation_label_and_value_list[2][i])

cze_czech


## Select Most Frequently Shown Second Entities

In [23]:
second_entity_length_dict = defaultdict(int)

for r in real_relation_label_and_value_list:
    for lable in r:
        entity = r[lable]
        l = len(entity)

        if l < 10:
            second_entity_length_dict['10-'] += 1
        elif l < 20:
            second_entity_length_dict['20-'] += 1
        elif l < 50:
            second_entity_length_dict['50-'] += 1
        else:
            second_entity_length_dict['50+'] += 1
        
print(second_entity_length_dict)   

# second entity name, number of repetitions 
second_entity_dict = defaultdict(int) 
for r in real_relation_label_and_value_list:
    for n in r:
        entity = r[n]
        second_entity_dict[entity] += 1
        
sortedSecondEntityDict = {} # relation_label, number_of_times_it_shown, in sorted order
rank_upto = 100
sortedValues = sorted(second_entity_dict.values(), reverse=True)
max_value = sortedValues[0]
top_top_value = sortedValues[rank_upto]
print('Max Reps = '+str(max_value)+' minimum reps = '+str(top_top_value))

for attr_label in second_entity_dict:
    attr_rep = second_entity_dict[attr_label]
    if attr_rep >= top_top_value:
        sortedSecondEntityDict[attr_label] = attr_rep
        
print('\nTop '+str(rank_upto)+' most frequently shown second entities:')
print(sortedSecondEntityDict)

defaultdict(<class 'int'>, {'50-': 63965, '10-': 79890, '20-': 88233, '50+': 15605})
Max Reps = 3378 minimum reps = 201

Top 100 most frequently shown second entities:
{'right': 3378, 'june': 1105, 'solo_singer': 2741, 'vocals': 401, 'm_ftin_on': 2039, 'august': 1131, 'defender': 1925, 'attorney': 233, 'midfielder': 2506, 'goalkeeper': 1053, 'american': 3322, 'yes': 1079, 'england': 814, 'http_cricketarchive': 298, 'united_kingdom': 604, 'non_vocal_instrumentalist': 1137, 'non_performing_personnel': 327, 'french': 324, 'usa': 238, 'forward': 1441, 'september': 1380, 'right_wing': 223, 'left': 1773, 'june_utc': 368, 'october': 710, 'australian': 503, 'fencing': 317, 'canadian': 354, 'actress': 1023, 'present': 1023, 'roman_catholicism': 236, 'on': 272, 'kg_lb_on': 1176, 'athletics': 269, 'may': 1187, 'april': 1057, 'british': 1162, 'republican': 1027, 'http_cricinfo': 702, 'ft_in_m_on': 209, 'lb_kg_on': 230, 'actor': 1257, 'm_on': 435, 'striker': 1222, 'italian': 280, 'united_states': 9

## Join three list by the biography title and write to text file:
This is going to generate a 1+ GB large file

In [24]:
resultFile = open(data_type+'/'+data_type+".generate.txt", "w") 

index = 0

for relationAndValueEntry in real_relation_label_and_value_list:
    for relationName in relationAndValueEntry:
        first_entity  = title_list[index]
        second_entity = relationAndValueEntry[relationName]
        if len(sent_dict[index]) < 150 and second_entity in sortedSecondEntityDict.keys():
            resultFile.write('SH\tSH\t')
            resultFile.write(first_entity)
            resultFile.write('\t')
            resultFile.write(second_entity)
            resultFile.write('\t')
            resultFile.write(relationName)
            resultFile.write('\t')
            resultFile.write(sent_dict[index])
            resultFile.write(' ###END###\n')
            
    index += 1

resultFile.close()