In [1]:
import numpy as np

In [2]:
import random
import string
import math


In [3]:
def gen_word(word_length):
    """Build a random word out of lowercase ASCII letters.

    Args:
        word_length: Sequence unpacked into ``np.random.randint`` -- in this
            notebook always a ``(low, high)`` pair, so the length is drawn from
            the half-open range ``[low, high)``.

    Returns:
        str: The generated word. Letters are picked with ``random.sample``,
        i.e. without replacement, so no letter occurs twice.
    """
    length = np.random.randint(*word_length)
    letters = random.sample(string.ascii_lowercase, length)
    return ''.join(letters)

In [4]:
import sys
sys.path.append('src')

In [5]:
from utils.loader import MentionsLoader

In [6]:
from utils.loader import MentionsLoader
def transform_to_relations(file):
    """Write the pair batches of a mentions file as a tab-separated relations file.

    Batches are pulled from ``MentionsLoader(...).iter_pairs_batch()`` and every
    pair is written to ``file + '.rels'`` as ``label<TAB>a<TAB>b``.

    Args:
        file: Path of the mentions file to read; the output path is derived
            from it by appending ``.rels``.
    """
    loader = MentionsLoader(
        filename=file,
        read_size=500,
        batch_size=2000,
        dict_size=None,
        tokenizer=None,
        ngrams_flag=None,
    )
    with open(file + '.rels', 'w') as out:
        for left, right, match in loader.iter_pairs_batch():
            # (match + 1) // 2 turns +1 into 1 and -1 into 0.
            # NOTE(review): presumably match holds values in {-1, +1} -- confirm
            # against MentionsLoader.
            labels = ((np.array(match) + 1) // 2).astype(int)
            out.writelines(
                "{}\t{}\t{}\n".format(label, sa, sb)
                for label, sa, sb in zip(labels, left, right)
            )

In [7]:
class SimpleTopicGenerator:
    """Topic model in which every topic is an independent list of random words."""

    def __init__(self, num_topics, num_words_per_topic, word_length):
        """Generate the vocabulary of each topic.

        Args:
            num_topics: Number of topics to create.
            num_words_per_topic: Arguments unpacked into ``np.random.randint``
                (here a ``(low, high)`` pair) to draw each topic's word count.
            word_length: Length range forwarded to ``gen_word`` for every word.
        """
        self.topics = []
        for _ in range(num_topics):
            vocab_size = np.random.randint(*num_words_per_topic)
            self.topics.append([gen_word(word_length) for _ in range(vocab_size)])

    def get_words(self, topic_id):
        """Return a one-element list holding a random word of topic ``topic_id``."""
        vocabulary = self.topics[topic_id]
        return random.sample(vocabulary, 1)

In [57]:
class NoisyTopicGenerator(SimpleTopicGenerator):
    """Variant of ``SimpleTopicGenerator`` that pairs every topic word with a noise word."""

    def get_words(self, topic_id):
        """Return ``[topic_word, noise_word]``.

        The noise word comes from a topic chosen uniformly among all topics, so
        it may come from ``topic_id`` itself.
        """
        # random.randint includes both endpoints, hence the "- 1".
        noise_topic = random.randint(0, len(self.topics) - 1)
        signal = random.sample(self.topics[topic_id], 1)
        noise = random.sample(self.topics[noise_topic], 1)
        return signal + noise

In [9]:
class TopicCoocGenerator:
    """Topic model in which a topic is a pair of co-occurring word sources."""

    def __init__(self, num_topics, num_sources, num_words_per_source, word_length):
        """Generate the word sources and the source pairs that act as topics.

        Args:
            num_topics: Upper bound on the number of topics kept.
            num_sources: Number of word sources to create.
            num_words_per_source: Arguments unpacked into ``np.random.randint``
                (here a ``(low, high)`` pair) to draw each source's word count.
            word_length: Length range forwarded to ``gen_word`` for every word.
        """
        self.sources = []
        for _ in range(num_sources):
            size = np.random.randint(*num_words_per_source)
            self.sources.append([gen_word(word_length) for _ in range(size)])

        # Draw 10x more candidate pairs than needed, de-duplicate, keep at most
        # num_topics of them. If fewer distinct pairs were drawn, self.topics is
        # simply shorter than num_topics.
        draws = [
            tuple(sorted(random.sample(range(0, num_sources), 2)))
            for _ in range(num_topics * 10)
        ]
        unique_pairs = list(set(draws))
        self.topics = sorted(unique_pairs[:num_topics])

    def get_words(self, topic_id):
        """Return one random word from each source of the topic, in shuffled order."""
        picked = []
        for source_id in self.topics[topic_id]:
            picked.extend(random.sample(self.sources[source_id], 1))
        random.shuffle(picked)
        return picked

In [10]:
class TopicWordPrefix(TopicCoocGenerator):
    """Co-occurrence generator whose words get a random prefix and suffix."""

    def augment_word(self, word):
        """Wrap ``word`` in a random prefix and a random suffix.

        Each affix is 0-2 distinct lowercase letters (``randint(0, 3)`` excludes 3).
        """
        affixes = []
        # First pass builds the prefix, second pass the suffix.
        for _ in range(2):
            affix_len = np.random.randint(0, 3)
            affixes.append(''.join(random.sample(string.ascii_lowercase, affix_len)))
        prefix, suffix = affixes
        return prefix + word + suffix

    def get_words(self, topic_id):
        """Return the parent's words for ``topic_id``, each passed through ``augment_word``."""
        return [self.augment_word(word) for word in super().get_words(topic_id)]

In [11]:
class TopicNgramGenerator(TopicCoocGenerator):
    """Co-occurrence generator that emits a topic's words as one space-joined string."""

    def get_words(self, topic_id):
        """Return a one-element list: the parent's words joined by single spaces."""
        parts = super().get_words(topic_id)
        phrase = " ".join(parts)
        return [phrase]
        

In [12]:
def generate(
    output,
    topic_generator,
    topics_count,
    entities_count,
    instance_per_entiry,
    word_length,
    topics_in_entity,
    topic_words_per_entity,
    random_words_per_entity,
    random_words_count
    ):
    """Generate a synthetic entity dataset and write train/valid files.

    Each entity is assigned a few topic ids; every instance of the entity mixes
    words produced by ``topic_generator`` for those topics with words from a
    shared random pool, shuffles them and splits them at a random position.
    One line per instance is produced::

        <entity>\\t<left words>\\t<entity>\\t<right words>\\n

    The lines are written to ``output + '_train.tsv'`` and
    ``output + '_valid.tsv'``, and ``transform_to_relations`` is then run on
    both files.

    Args:
        output: Path prefix of the files to write.
        topic_generator: Object exposing ``get_words(topic_id)`` returning a
            list of words.
        topics_count: Topic ids are sampled from ``range(topics_count)``.
        entities_count: Number of entities to generate.
        instance_per_entiry: Number of lines generated per entity (the
            parameter name is kept as-is for existing callers).
        word_length: Length range forwarded to ``gen_word`` for the random pool.
        topics_in_entity: ``np.random.randint`` arguments for the number of
            topics per entity.
        topic_words_per_entity: ``np.random.randint`` arguments for the number
            of ``get_words`` calls per instance.
        random_words_per_entity: ``np.random.randint`` arguments for the number
            of pool words added per instance.
        random_words_count: Size of the shared random word pool.
    """
    # Shared pool of topic-independent words.
    noise_pool = [gen_word(word_length) for _ in range(random_words_count)]

    lines = []
    for entity in range(entities_count):
        n_topics = np.random.randint(*topics_in_entity)
        entity_topics = random.sample(range(topics_count), k=n_topics)

        for _instance in range(instance_per_entiry):
            words = []
            n_topic_draws = np.random.randint(*topic_words_per_entity)
            for _draw in range(n_topic_draws):
                chosen_topic = random.choice(entity_topics)
                words.extend(topic_generator.get_words(chosen_topic))

            n_noise = np.random.randint(*random_words_per_entity)
            words.extend(random.sample(noise_pool, n_noise))
            random.shuffle(words)

            # The split point lies in [0, len(words) - 1]: the left part may be
            # empty, the right part never is.
            mid = np.random.randint(0, len(words))
            left = ' '.join(words[:mid])
            right = ' '.join(words[mid:])
            lines.append('{}\t'.format(entity) + left + "\t{}\t".format(entity) + right + '\n')

    # Positional split: lines are in entity order, so the first three quarters
    # go to train and the remaining tail goes to valid.
    train_size = entities_count * instance_per_entiry // 4 * 3
    train_path = output + '_train.tsv'
    valid_path = output + '_valid.tsv'

    with open(train_path, 'w') as fd:
        fd.writelines(lines[:train_size])

    with open(valid_path, 'w') as fd:
        fd.writelines(lines[train_size:])

    transform_to_relations(train_path)
    transform_to_relations(valid_path)


In [47]:
# Small debug dataset: co-occurrence topics built from 5 sources of 1-3 words each.
topic_generator = TopicCoocGenerator(
    num_topics=10,
    num_sources=5,
    num_words_per_source=(1, 4),
    word_length=(4, 7),
)

generate(
    output='./data/debug_data/syntetic_7',
    topic_generator=topic_generator,
    topics_count=6,  # only topic ids 0-5 are sampled
    entities_count=200,
    instance_per_entiry=5,
    word_length=(4, 7),
    topics_in_entity=(1, 2),  # upper bound is exclusive: exactly 1 topic
    topic_words_per_entity=(1, 4),
    random_words_per_entity=(1, 4),  # up to 3 random words
    random_words_count=10,
)

In [11]:
# Larger dataset where each topic draw yields one space-joined two-word phrase.
generate(
    output='./data/debug_data/syntetic_8',
    topic_generator=TopicNgramGenerator(
        num_topics=25,
        num_sources=15,
        num_words_per_source=(1, 40),
        word_length=(4, 7),
    ),
    topics_count=25,
    entities_count=10000,
    instance_per_entiry=5,
    word_length=(4, 7),
    topics_in_entity=(1, 2),  # upper bound is exclusive: exactly 1 topic
    topic_words_per_entity=(1, 4),
    random_words_per_entity=(1, 4),  # up to 3 random words
    random_words_count=100,
)

In [88]:
# Same shape as the previous dataset, but topic words carry random prefixes/suffixes.
topic_generator = TopicWordPrefix(
    num_topics=25,
    num_sources=15,
    num_words_per_source=(1, 40),
    word_length=(4, 7),
)

generate(
    output='./data/debug_data/syntetic_9',
    topic_generator=topic_generator,
    topics_count=25,
    entities_count=10000,
    instance_per_entiry=5,
    word_length=(4, 7),
    topics_in_entity=(1, 2),  # upper bound is exclusive: exactly 1 topic
    topic_words_per_entity=(1, 4),
    random_words_per_entity=(1, 4),  # up to 3 random words
    random_words_count=100,
)

In [97]:
# Simplest setting: 100 single-word topics, one topic word and one random word per instance.
topic_generator = SimpleTopicGenerator(
    num_topics=100,
    num_words_per_topic=(1, 2),  # upper bound is exclusive: exactly 1 word
    word_length=(4, 7),
)

generate(
    output='./data/debug_data/syntetic_10',
    topic_generator=topic_generator,
    topics_count=100,
    entities_count=1000,
    instance_per_entiry=5,
    word_length=(4, 7),
    topics_in_entity=(1, 2),
    topic_words_per_entity=(1, 2),
    random_words_per_entity=(1, 2),
    random_words_count=20,
)

In [13]:
# Fewer topics than the previous run, each with a larger vocabulary.
topic_generator = SimpleTopicGenerator(
    num_topics=50,
    num_words_per_topic=(1, 5),  # More words per topic
    word_length=(4, 7),
)

generate(
    output='./data/debug_data/syntetic_11',
    topic_generator=topic_generator,
    topics_count=50,
    entities_count=1000,
    instance_per_entiry=5,
    word_length=(4, 7),
    topics_in_entity=(1, 2),
    topic_words_per_entity=(1, 2),
    random_words_per_entity=(1, 2),
    random_words_count=20,
)

In [59]:
# Noisy variant: every topic word is accompanied by a word from a random topic.
topic_generator = NoisyTopicGenerator(
    num_topics=100,
    num_words_per_topic=(1, 2),  # upper bound is exclusive: exactly 1 word
    word_length=(4, 7),
)

generate(
    output='./data/debug_data/syntetic_12',
    topic_generator=topic_generator,
    topics_count=100,
    entities_count=1000,
    instance_per_entiry=5,
    word_length=(4, 7),
    topics_in_entity=(1, 2),
    topic_words_per_entity=(1, 2),
    random_words_per_entity=(1, 2),
    random_words_count=20,
)

In [60]:
# Scaled-up noisy variant: more topics, larger vocabularies, 10x the entities.
topic_generator = NoisyTopicGenerator(
    num_topics=400,
    num_words_per_topic=(1, 5),
    word_length=(4, 7),
)

generate(
    output='./data/debug_data/syntetic_13',
    topic_generator=topic_generator,
    topics_count=400,
    entities_count=10000,
    instance_per_entiry=5,
    word_length=(4, 7),
    topics_in_entity=(1, 2),
    topic_words_per_entity=(1, 2),
    random_words_per_entity=(1, 2),
    random_words_count=200,
)