In [141]:
import numpy as np
import re


def preprocess(text):
    """Tokenize ``text`` and encode it as an array of word ids.

    The text is lower-cased, every '.' gets a space inserted in front of it
    so that a sentence-ending period becomes its own token, and the result is
    split on whitespace. Ids are assigned in order of first appearance.

    Args:
        text: Input string.

    Returns:
        A tuple ``(corpus, word_to_id, id_to_word)``: ``corpus`` is a 1-D
        integer NumPy array holding the id of every token in reading order,
        ``word_to_id`` maps token -> id and ``id_to_word`` maps id -> token.
    """
    text = text.lower()
    text = text.replace('.', ' .')
    # split() without an argument splits on any run of whitespace. Splitting
    # on a single ' ' would yield empty-string tokens for consecutive spaces
    # (which the replace above creates for inputs like "of .txt") and would
    # leave newlines/tabs embedded inside tokens.
    words = text.split()

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # Explicit integer dtype so that an empty token list still yields an
    # integer array (np.array([]) would default to float64).
    corpus = np.array([word_to_id[w] for w in words], dtype=int)

    return corpus, word_to_id, id_to_word

def create_co_matrix(corpus, vocab_size, window_size=1):
    """Build a word co-occurrence count matrix from a sequence of word ids.

    For every position in ``corpus`` the words up to ``window_size`` steps to
    the left and to the right are counted as co-occurring with the word at
    that position.

    Args:
        corpus: Indexable sequence of integer word ids.
        vocab_size: Number of distinct word ids; the result has shape
            ``(vocab_size, vocab_size)``.
        window_size: How many neighbours on each side count as context.

    Returns:
        An ``np.int32`` array ``M`` where ``M[i, j]`` is the number of times
        word ``j`` appears within the window around an occurrence of word ``i``.
    """
    # Number of elements in the corpus.
    corpus_size = len(corpus)
    # Co-occurrence counts, one row and one column per vocabulary entry.
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    # Visit every word in the corpus as the focus word.
    for idx, word_id in enumerate(corpus):
        # Look at the neighbours 1..window_size steps away on both sides.
        for i in range(1, window_size + 1):
            # Positions of the left and right neighbours of the focus word.
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:  # a word exists on the left side
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            # The last valid index is corpus_size - 1, so the bound must be
            # `< corpus_size`; `< corpus_size - 1` would never count the
            # final word as a right-hand neighbour.
            if right_idx < corpus_size:  # a word exists on the right side
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1

    return co_matrix

def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of the vectors ``x`` and ``y``.

    Each vector is divided by its Euclidean length plus ``eps`` before the
    dot product is taken; ``eps`` keeps the division defined when a vector
    is all zeros.
    """
    def _scaled(vec):
        # Euclidean length of the vector, then scale by (length + eps).
        length = np.sqrt(np.sum(vec**2))
        return vec / (length + eps)

    return np.dot(_scaled(x), _scaled(y))

In [150]:
# Sample paragraph used as the corpus. The trailing backslashes continue the
# string literal across source lines without inserting newlines.
text = "I used scrapy to crawl a website to get thousands of \
.txt files, each containing a text in natural language \
(description of a drug-induced experience). \
The name of each of these files is a unique number. \
I also have a .csv file with metadata associated \
with each of these unique numbers (i.e. I have a column for text_number, and other columns for the metadata corresponding to this particular number). One of the category of metadata is a dosage number (in mg)."
# Encode the text as word ids, then build the co-occurrence matrix using the
# default window_size of 1.
corpus, word_to_id, id_to_word = preprocess(text)
C = create_co_matrix(corpus, len(word_to_id))

# Rows of C for the words 'one' and 'this' (lookups are lowercase because
# preprocess lower-cases the text).
c0 = C[word_to_id['one']]
c1 = C[word_to_id['this']]
# Last expression of the cell: cosine similarity of the two count vectors.
cos_similarity(c0, c1)

0.7071067691154799