In [1]:
import collections
import math
import os
import sys
import argparse
import random
from tempfile import gettempdir
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
import l3

from tensorflow.contrib.tensorboard.plugins import projector


  from ._conv import register_converters as _register_converters



>>>>> This is L3Morpho, version 3.0 <<<<<
>>>>>  and HornMorpho, version 2.5  <<<<<
Instructions for updating:
Use the retry module or similar alternatives.


In [2]:

# Path of the raw text corpus, relative to the notebook's working directory.
filename = "data/news.txt"

# Read the whole corpus and split it on whitespace into a flat list of tokens.
# The context manager closes the file handle as soon as the text is in memory,
# instead of leaving an open handle for the garbage collector to clean up.
with open(filename, encoding='utf8') as corpus_file:
    vocabulary = tf.compat.as_str(corpus_file.read()).split()
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
# Size passed to build_dataset as n_words (the 'UNK' entry plus the
# n_words - 1 most frequent tokens).
vocabulary_size = 50000


def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a bounded vocabulary.

    Args:
        words: sequence of hashable tokens (the corpus, in order).
        n_words: vocabulary size; one slot is taken by the 'UNK' entry and
            the remaining ``n_words - 1`` by the most frequent tokens.

    Returns:
        A 4-tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is the corpus as a list of ids (0 for any token missing from
        ``dictionary``); ``count`` is a list whose first item is
        ``['UNK', <number of ids equal to 0>]`` followed by ``(token, freq)``
        tuples in descending frequency; ``dictionary`` maps token -> id;
        ``reversed_dictionary`` maps id -> token.
    """
    count = [['UNK', -1]] + collections.Counter(words).most_common(n_words - 1)

    # Ids are handed out in order of appearance in `count`, so 'UNK' gets 0
    # and more frequent tokens get smaller ids.
    dictionary = {}
    for token, _freq in count:
        dictionary[token] = len(dictionary)

    # Any token without an id falls back to 0, the 'UNK' id.
    data = [dictionary.get(token, 0) for token in words]

    # Replace the -1 placeholder with the number of positions encoded as 0.
    count[0][1] = data.count(0)

    reversed_dictionary = {idx: token for token, idx in dictionary.items()}
    return data, count, dictionary, reversed_dictionary



Data size 4089439


In [3]:
# Encode the corpus as integer ids and build both lookup tables.
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)

# The raw token list is no longer needed once `data` holds the encoded corpus.
del vocabulary  # Hint to reduce memory.

# Sanity check: the most frequent entries and the first ten tokens round-tripped
# through the id -> word table.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[word_id] for word_id in data[:10]])

# NOTE(review): not read anywhere in the visible cells; presumably a position
# cursor into `data` for later cells — confirm against the rest of the notebook.
data_index = 0

Most common words (+UNK) [['UNK', 401045], ('*', 239982), ('ነው', 56425), ('#', 45253), ('ላይ', 37050)]
Sample data [2752, 3645, 997, 0, 1633, 44513, 1968, 417, 953, 113] ['ጋዜጠኛ', 'ተመስገን', 'ደሳለኝ', 'UNK', 'በአቶ', 'አምሐ', 'መኮንን', 'አማካይነት', 'በፌዴራል', 'ጠቅላይ']


In [4]:

def get_similarity_on(word1, word2, startIndex):
    """Count matching characters when word2 is laid over word1 at an offset.

    Position ``i`` of ``word2`` is compared with position ``i + startIndex``
    of ``word1``; the return value is the number of positions that are equal.

    Args:
        word1: the sequence being indexed at an offset.
        word2: the sequence laid over ``word1``; every element is compared.
        startIndex: offset into ``word1`` at which ``word2`` is aligned.

    Returns:
        The number of equal positions, as an int (0 when ``word2`` is empty).

    Note:
        No bounds check is performed: for str/list inputs, an alignment that
        runs past the end of ``word1`` raises IndexError.
    """
    return sum(
        1
        for offset, symbol in enumerate(word2)
        if word1[offset + startIndex] == symbol
    )


def get_similarity(word1, word2):
    """Score how well the shorter word aligns inside the longer one.

    The shorter word is laid over the longer word at every offset where it
    fits entirely, and the number of equal positions is counted for each
    offset with ``get_similarity_on``. The best count is divided by the
    length of the longer word. Argument order does not matter, because the
    arguments are swapped so that the longer one is always indexed.

    Args:
        word1: first word.
        word2: second word.

    Returns:
        ``0`` if either word is empty; otherwise the best match count divided
        by the length of the longer word (1.0 for identical words).
    """
    if len(word1) == 0 or len(word2) == 0:
        return 0

    # Always index into the longer word so every alignment stays in range.
    if len(word1) < len(word2):
        longer, shorter = word2, word1
    else:
        longer, shorter = word1, word2

    # Offsets 0 .. len(longer) - len(shorter) are exactly those where the
    # shorter word fits inside the longer one.
    offsets = range(len(longer) - len(shorter) + 1)
    best = max(get_similarity_on(longer, shorter, offset) for offset in offsets)
    return best / len(longer)

In [5]:
# For every word in `dictionary`, write one line to data/sims.txt:
#     <word> <similar_1> ... <similar_k>
# where the similar words are the first (at most 10) other dictionary words,
# in dictionary order, whose get_similarity score with <word> exceeds 0.6.
# A word with no such neighbour is listed with itself.
#
# NOTE: every word is compared against the dictionary from the start, so this
# cell can take a long time on a large vocabulary.
# NOTE(review): the file is opened in append mode, so re-running this cell
# adds a second copy of every line — delete data/sims.txt before a re-run, or
# confirm whether mode='w' was intended.
sim_dict  = {}  # not used in this cell; kept in case other cells reference it
j = 0
all_words = list(dictionary.keys())
total_words = len(all_words)  # loop-invariant, hoisted out of the progress print

# The context manager guarantees the file is flushed and closed even if the
# loop is interrupted or raises part-way through.
with open('data/sims.txt', encoding='utf8', mode='a') as sims_file:
    for j, word1 in enumerate(all_words, start=1):
        sims = []
        for word2 in all_words:
            if word1 != word2 and get_similarity(word1, word2) > 0.6:
                sims.append(word2)
            # Stop scanning once enough neighbours have been collected.
            if len(sims) >= 10:
                break

        if len(sims) == 0:
            sims.append(word1)
        sims_file.write(word1 + ' ' + ' '.join(sims) + '\n')

        # Progress report: fraction of the dictionary processed so far.
        if j % 1000 == 0:
            print("{0:.2}".format(j / total_words))
    

    

0.02
0.04
0.06
0.08
0.1
0.12
0.14
0.16
0.18
0.2
0.22
0.24
0.26
0.28
0.3
0.32
0.34
0.36
0.38
0.4
0.42
0.44
0.46
0.48
0.5
0.52
0.54
0.56
0.58
0.6
0.62
0.64
0.66
0.68
0.7
0.72
0.74
0.76
0.78
0.8
0.82
0.84
0.86
0.88
0.9
0.92
0.94
0.96
0.98
1.0
