In [182]:
import numpy as np
import re


def preprocess(text):
    """Tokenize ``text`` and encode it as an array of integer word ids.

    The text is lower-cased, every period is detached from the preceding
    word so that it becomes its own token, and the result is split on runs
    of whitespace. Ids are assigned in order of first appearance, starting
    at 0.

    Args:
        text: The raw input string.

    Returns:
        A tuple ``(corpus, word_to_id, id_to_word)`` where ``corpus`` is a
        1-D integer NumPy array holding the id of each token in order,
        ``word_to_id`` maps each distinct token to its id, and
        ``id_to_word`` is the inverse mapping.
    """
    text = text.lower()
    text = text.replace('.', ' .')
    # split() with no separator splits on any whitespace run and drops empty
    # strings; split(' ') would emit '' tokens for consecutive spaces and
    # leave newlines/tabs glued to the neighbouring words.
    words = text.split()

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # Explicit integer dtype so that an empty corpus is not created as float64.
    corpus = np.array([word_to_id[w] for w in words], dtype=int)

    return corpus, word_to_id, id_to_word

def create_co_matrix(corpus, vocab_size, window_size=1):
    """Count how often each pair of word ids occurs within a context window.

    Args:
        corpus: Sequence of word ids, in text order.
        vocab_size: Number of rows and columns of the returned matrix.
        window_size: How many positions to the left and to the right of each
            word are treated as its context.

    Returns:
        A ``(vocab_size, vocab_size)`` int32 array whose entry ``[a, b]`` is
        the number of times id ``b`` appears within ``window_size`` positions
        of an occurrence of id ``a``.
    """
    n_tokens = len(corpus)
    counts = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for center, focus_id in enumerate(corpus):
        for offset in range(1, window_size + 1):
            # Visit the neighbour `offset` steps to the left, then the one to
            # the right; positions falling outside the corpus are skipped.
            for neighbour in (center - offset, center + offset):
                if 0 <= neighbour < n_tokens:
                    counts[focus_id, corpus[neighbour]] += 1

    return counts

def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of vectors ``x`` and ``y``.

    Each vector is divided by its Euclidean norm plus ``eps`` before the dot
    product is taken; the ``eps`` term keeps the division defined when a
    vector is all zeros.
    """
    def _scaled(vec):
        # L2 norm with eps added to the denominator.
        return vec / (np.sqrt(np.sum(vec**2)) + eps)

    return np.dot(_scaled(x), _scaled(y))

def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print the words whose vectors are closest to the query word's vector.

    Args:
        query: Word to look up; it is lower-cased before the lookup.
        word_to_id: Mapping from word to id.
        id_to_word: Mapping from id to word.
        word_matrix: Indexable by word id; row ``i`` is the vector of word ``i``.
        top: Stop after this many words have been printed.

    Returns:
        None. Results are written to stdout; if the query is not in
        ``word_to_id`` a "not found" message is printed instead.
    """
    query = query.lower()
    if query not in word_to_id:
        print('{} is not found'.format(query))
        return

    print('\n[query] {}'.format(query))
    target_vec = word_matrix[word_to_id[query]]

    # Cosine similarity between the query vector and every word vector.
    n_words = len(id_to_word)
    scores = np.zeros(n_words)
    for row in range(n_words):
        scores[row] = cos_similarity(word_matrix[row], target_vec)

    # Walk the ids from highest to lowest score, skipping the query itself.
    shown = 0
    for word_id in (-1 * scores).argsort():
        word = id_to_word[word_id]
        if word != query:
            print(' %s: %s' % (word, scores[word_id]))
            shown += 1
            if shown >= top:
                return
    

In [185]:
# Sample passage used to build the vocabulary and the co-occurrence counts.
text = "Once you’re sure you are running the correct version of the mac operating system, you can get started with downloading Xcode 10 through the Mac App Store. Open the App Store app on your Mac. By default the App Store is in the Dock. You can also find it in your Launchpad. You can obtain a developer license from Apple and it costs $99 per year. Unless you are ready to release your app to the App Store and start clawing back that fee, it is not recommended to start paying Apple right away. But when you are ready to start the registration process, go to the iOS Developer Program Center and enrolment should only take a few minutes."
corpus, word_to_id, id_to_word = preprocess(text)
# One row/column per distinct token; context window of one word on each side.
C = create_co_matrix(corpus, vocab_size=len(word_to_id), window_size=1)

# Rank the other words by cosine similarity of their co-occurrence rows.
most_similar("apple", word_to_id, id_to_word, word_matrix=C, top=5)


[query] apple
 developed: 0.49999999292893216
 the: 0.49999999292893216
 xcode: 0.0
 10: 0.0
 can: 0.0
