In [43]:
import collections
import math
import os
import sys
import argparse
import random
from tempfile import gettempdir
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
import l3

from tensorflow.contrib.tensorboard.plugins import projector


In [44]:

# Path of the text corpus, relative to the notebook's working directory.
filename = "data/news.txt"

# Read the whole corpus and split it on whitespace into a flat token list.
# The context manager closes the file handle as soon as the read finishes
# (or fails); the previous bare open(...).read() never closed it explicitly.
with open(filename, encoding='utf8') as corpus_file:
    vocabulary = tf.compat.as_str(corpus_file.read()).split()
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
# Size passed to build_dataset as n_words: the 'UNK' slot plus the
# (vocabulary_size - 1) most frequent tokens.
vocabulary_size = 50000


def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a capped vocabulary.

    Args:
        words: sequence of tokens; it is iterated twice (once to count,
            once to encode).
        n_words: vocabulary size including the 'UNK' slot; the
            ``n_words - 1`` most frequent tokens are kept.

    Returns:
        A 4-tuple ``(data, count, dictionary, reversed_dictionary)``:
        data: list with one id per input token (0 for tokens not kept).
        count: ``['UNK', <number of ids equal to 0>]`` followed by
            ``(token, frequency)`` tuples in most-common order.
        dictionary: mapping token -> id.
        reversed_dictionary: mapping id -> token.
    """
    frequencies = collections.Counter(words)
    count = [['UNK', -1]] + frequencies.most_common(n_words - 1)

    # Ids are handed out in order of appearance in `count`; each new id is the
    # current size of the mapping.
    dictionary = {}
    for token, _frequency in count:
        dictionary[token] = len(dictionary)

    # Tokens missing from the mapping fall back to id 0.
    data = [dictionary.get(token, 0) for token in words]

    # Patch the placeholder -1 with the real number of id-0 occurrences.
    count[0][1] = data.count(0)

    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary



Data size 4089439


In [45]:
# Encode the corpus as integer ids and build the token <-> id lookup tables.
data, count, dictionary, reverse_dictionary = build_dataset(
    words=vocabulary, n_words=vocabulary_size)

# The raw token list is not used after encoding; drop the reference so the
# memory can be reclaimed.
del vocabulary

print('Most common words (+UNK)', count[0:5])
print('Sample data', data[0:10],
      [reverse_dictionary[token_id] for token_id in data[0:10]])

# NOTE(review): presumably a cursor into `data` for a later batching step;
# nothing in the visible cells reads it -- confirm against the rest of the
# notebook.
data_index = 0

Most common words (+UNK) [['UNK', 401045], ('*', 239982), ('ነው', 56425), ('#', 45253), ('ላይ', 37050)]
Sample data [2752, 3645, 997, 0, 1633, 44513, 1968, 417, 953, 113] ['ጋዜጠኛ', 'ተመስገን', 'ደሳለኝ', 'UNK', 'በአቶ', 'አምሐ', 'መኮንን', 'አማካይነት', 'በፌዴራል', 'ጠቅላይ']


In [46]:

def get_similarity_on(word1, word2, startIndex):
    """Count positions where word2 matches word1 at a fixed alignment.

    word2[i] is compared against word1[i + startIndex] for every index i of
    word2.

    Args:
        word1: sequence that word2 is laid over.
        word2: sequence whose elements are compared one by one.
        startIndex: offset into word1 at which word2[0] is aligned.

    Returns:
        The number of matching positions, as an int.

    Raises:
        IndexError: if ``i + startIndex`` runs past the end of word1.
    """
    return sum(
        1
        for offset, symbol in enumerate(word2)
        if word1[offset + startIndex] == symbol
    )


def get_similarity(word1, word2):
    """Score how well the shorter word aligns inside the longer one.

    The shorter word is slid across every offset at which it fits entirely
    within the longer word; the best per-position match count (from
    get_similarity_on) is divided by the length of the longer word.

    Args:
        word1: first word.
        word2: second word; argument order does not matter because the
            longer of the two is always used as the reference.

    Returns:
        The int 0 if either word is empty, otherwise a float equal to
        (best match count) / (length of the longer word).
    """
    if not (len(word1) and len(word2)):
        return 0

    # Make `longer` the reference word; on equal lengths word1 stays the
    # reference.
    if len(word1) < len(word2):
        longer, shorter = word2, word1
    else:
        longer, shorter = word1, word2

    # Every start offset at which `shorter` fits fully inside `longer`;
    # there is always at least offset 0.
    offsets = range(len(longer) - len(shorter) + 1)
    best = max(get_similarity_on(longer, shorter, offset) for offset in offsets)
    return best / len(longer)

In [None]:
# For every vocabulary word, write one line to data/sims.txt of the form
#   "<word> <similar_1> ... <similar_k>\n"
# holding up to 10 other vocabulary words whose get_similarity score exceeds
# 0.6.  A word with no such neighbour is listed with itself.
#
# This is a pairwise scan over the whole vocabulary and is slow.
#
# NOTE: the file is opened in append mode, so re-running this cell appends a
# second copy of every line to data/sims.txt instead of replacing it.
sim_dict  = {}  # not filled in by this cell
j = 0  # number of words processed so far; stays 0 if the dictionary is empty

# The context manager flushes and closes the file even when the loop is
# interrupted (kernel interrupt, exception), which the trailing file.close()
# of the previous version did not guarantee.
with open('data/sims.txt', encoding='utf8', mode='a') as file:
    for j, word1 in enumerate(dictionary.keys(), start=1):
        sims = []
        for word2 in dictionary.keys():
            if word1 != word2:
                sim = get_similarity(word1, word2)
                if sim > 0.6:
                    sims.append(word2)
                    # Stop scanning once 10 neighbours have been collected.
                    if len(sims) >= 10:
                        break

        # Fall back to the word itself so every line has at least two fields.
        if not sims:
            sims.append(word1)
        line = word1 + ' ' + ' '.join(sims) + '\n'
        file.write(line)

        # Progress report every 1000 words, as a fraction of the vocabulary.
        if j % 1000 == 0:
            print("{0:.2}".format(j/len(dictionary)))
    

    

2e-05
4e-05
6e-05
8e-05
0.0001
0.00012
0.00014
0.00016
0.00018
0.0002
0.00022
0.00024
0.00026
0.00028
0.0003
0.00032
0.00034
0.00036
0.00038
0.0004
0.00042
0.00044
0.00046
0.00048
0.0005
0.00052
0.00054
0.00056
0.00058
0.0006
0.00062
0.00064
0.00066
0.00068
0.0007
0.00072
0.00074
0.00076
0.00078
0.0008
0.00082
0.00084
0.00086
0.00088
0.0009
0.00092
0.00094
0.00096
0.00098
0.001
0.001
0.001
0.0011
0.0011
0.0011
0.0011
0.0011
0.0012
0.0012
0.0012
0.0012
0.0012
0.0013
0.0013
0.0013
0.0013
0.0013
0.0014
0.0014
0.0014
0.0014
0.0014
0.0015
0.0015
0.0015
0.0015
0.0015
0.0016
0.0016
0.0016
0.0016
0.0016
0.0017
