In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

# Sample corpus
documents = ['Machine learning is the study of computer algorithms that improve automatically through experience.\
Machine learning algorithms build a mathematical model based on sample data, known as training data.\
The discipline of machine learning employs various approaches to teach computers to accomplish tasks \
where no fully satisfactory algorithm is available.',
'Machine learning is closely related to computational statistics, which focuses on making predictions using computers.\
The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.',
'Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. \
It involves computers learning from data provided so that they carry out certain tasks.',
'Machine learning approaches are traditionally divided into three broad categories, depending on the nature of the "signal"\
or "feedback" available to the learning system: Supervised, Unsupervised and Reinforcement',
'Software engineering is the systematic application of engineering approaches to the development of software.\
Software engineering is a computing discipline.',
'A software engineer creates programs based on logic for the computer to execute. A software engineer has to be more concerned\
about the correctness of the program in all the cases. Meanwhile, a data scientist is comfortable with uncertainty and variability.\
Developing a machine learning application is more iterative and explorative process than software engineering.'
]

documents_df=pd.DataFrame(documents,columns=['documents'])

# removing special characters and stop words from the text
stop_words_l=stopwords.words('english')
documents_df['documents_cleaned']=documents_df.documents.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )

tfidfvectoriser=TfidfVectorizer()
tfidfvectoriser.fit(documents_df.documents_cleaned)
tfidf_vectors=tfidfvectoriser.transform(documents_df.documents_cleaned)

pairwise_similarities=np.dot(tfidf_vectors,tfidf_vectors.T).toarray()
pairwise_differences=euclidean_distances(tfidf_vectors)

def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Document: {documents_df.iloc[doc_id]["documents"]}')
    print ('\n')
    print ('Similar Documents:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix:
        if ix==doc_id:
            continue
        print('\n')
        print (f'Document: {documents_df.iloc[ix]["documents"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

most_similar(0,pairwise_similarities,'Cosine Similarity')
most_similar(0,pairwise_differences,'Euclidean Distance')  

Document: Machine learning is the study of computer algorithms that improve automatically through experience.Machine learning algorithms build a mathematical model based on sample data, known as training data.The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available.


Similar Documents:


Document: Machine learning is closely related to computational statistics, which focuses on making predictions using computers.The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.
Cosine Similarity : 0.22860560787391593


Document: Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks.
Cosine Similarity : 0.22581304743529423


Document: Machine learning approaches are traditionally divided

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ericcruz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import tensorflow as tf


In [3]:
# tokenize and pad every document to make them of the same size
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer=Tokenizer()
tokenizer.fit_on_texts(documents_df.documents_cleaned)
tokenized_documents=tokenizer.texts_to_sequences(documents_df.documents_cleaned)
tokenized_paded_documents=pad_sequences(tokenized_documents,maxlen=64,padding='post')
vocab_size=len(tokenizer.word_index)+1
print (tokenized_paded_documents[0])


[ 2  1 10 11 12 20 21 22  2  1 12 23 13 24 14 25  4 26 27  4 15 16  2  1
 28 29  7 30  5 31  8 32 33 34 17  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [4]:
pip show tensorflow

Name: tensorflow
Version: 2.7.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /opt/anaconda3/lib/python3.7/site-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, keras, keras-preprocessing, libclang, numpy, opt-einsum, protobuf, six, tensorboard, tensorflow-estimator, tensorflow-io-gcs-filesystem, termcolor, typing-extensions, wheel, wrapt
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip show keras

Name: keras
Version: 2.7.0
Summary: Deep learning for humans.
Home-page: https://keras.io/
Author: Keras team
Author-email: keras-users@googlegroups.com
License: Apache 2.0
Location: /opt/anaconda3/lib/python3.7/site-packages
Requires: 
Required-by: keras-utils, tensorflow
Note: you may need to restart the kernel to use updated packages.


In [6]:
# loading pre-trained embeddings, each word is represented as a 300 dimensional vector
import gensim
W2V_PATH="GoogleNews-vectors-negative300.bin.gz"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)


In [7]:
# creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
embedding_matrix=np.zeros((vocab_size,300))
for word,i in tokenizer.word_index.items():
    if word in model_w2v:
        embedding_matrix[i]=model_w2v[word]
# creating document-word embeddings
document_word_embeddings=np.zeros((len(tokenized_paded_documents),64,300))
for i in range(len(tokenized_paded_documents)):
    for j in range(len(tokenized_paded_documents[0])):
        document_word_embeddings[i][j]=embedding_matrix[tokenized_paded_documents[i][j]]
document_word_embeddings.shape

(6, 64, 300)

In [8]:
vocab_size

92

In [9]:
len(tokenizer.word_index)
#tokenizer.word_index

91

In [10]:
#embedding_matrix.shape
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.08837891,  0.1484375 , -0.06298828, ...,  0.02026367,
         0.11621094,  0.17578125],
       [ 0.25585938, -0.02209473,  0.02905273, ...,  0.04541016,
        -0.33984375, -0.08154297],
       ...,
       [-0.0612793 , -0.07861328, -0.09326172, ...,  0.05371094,
        -0.18554688,  0.22558594],
       [-0.07226562, -0.04467773, -0.05224609, ...,  0.09619141,
        -0.02478027,  0.33007812],
       [ 0.11035156,  0.25585938,  0.03417969, ...,  0.21582031,
        -0.13378906, -0.06494141]])

In [11]:
len(tokenized_paded_documents[0])

64

In [13]:
#print (document_embeddings.shape)
#document_embeddings

In [14]:
#len(embedding_matrix)
embedding_matrix.shape
#embedding_matrix

(92, 300)

In [16]:
#len(words)
#words
#len(document_word_embeddings)
#document_word_embeddings
#document_word_embeddings=np.zeros((len(tokenized_paded_documents),64,300))
#document_embeddings
#len(document_embeddings)
#document_embeddings.shape

In [18]:
#len(embedding_matrix[tokenizer.word_index[words[0]]])
#embedding_matrix[tokenizer.word_index[words[90]]].shape
#tokenizer.word_index[words[0]]
#len(tokenizer.word_index)
#tfidf_vectors[i][j]
#tfidf_vectors[0][0]
#embedding_matrix[tokenizer.word_index[words[0]]]
#test = embedding_matrix[tokenizer.word_index[words[0]]].multiply(tfidf_vectors[0][0])
#test = np.multiply(embedding_matrix[tokenizer.word_index[words[0]]],(tfidf_vectors[0][0]))
#test = embedding_matrix[tokenizer.word_index[words[0]]]*(tfidf_vectors[0][0])
#test = embedding_matrix[tokenizer.word_index[words[j]]]*tfidf_vectors[i][j]
#test = embedding_matrix[tokenizer.word_index[words[j]]].dot(tfidf_vectors[i][j])
#test
#test = tfidf_vectors[i][j].dot(embedding_matrix[tokenizer.word_index[words[j]]])
#test
#test = np.multiply(embedding_matrix[tokenizer.word_index[words[j]]],tfidf_vectors[i][j])


In [20]:
#len(document_embeddings)
#tfidf_vectors = tfidf_vectors.todense()
#tfidf_vectors.get_shape()
tfidf_vectors
#tfidf_vectors.shape
#document_embeddings[i]
#embedding_matrix[tokenizer.word_index[words[j]]]
#from scipy import sparse
#test = sparse.csr_matrix.dot(embedding_matrix[tokenizer.word_index[words[j]]],(tfidf_vectors[i][j]))
#test = sparse.csr_matrix(embedding_matrix[tokenizer.word_index[words[j]]].dot(tfidf_vectors[i][j]))
#test = embedding_matrix[tokenizer.word_index[words[j]]].dot((tfidf_vectors[i][j]))
#test = embedding_matrix[tokenizer.word_index[words[j]]].dot(tfidf_vectors[i][j])
#test[1][0]
#test
#test2
#test2 = tfidf_vectors.todense(test[1][0]) 
#td = tf.sparse_tensor_to_dense(tfidf_vectors)
#td = csr_matrix.todense(tfidf_vectors)
#td = tfidf_vectors.todense()
#td.shape
#test.shape

<6x91 sparse matrix of type '<class 'numpy.float64'>'
	with 117 stored elements in Compressed Sparse Row format>

In [22]:
#td = tf.sparse.to_dense(tfidf_vectors)
#document_embeddings[0]+= (tfidf_vectors[0][0]).dot(embedding_matrix[tokenizer.word_index[words[0]]])
#document_embeddings[0]+= (tfidf_vectors[0][0]).dot(embedding_matrix[tokenizer.word_index[words[0]]])
#document_embeddings.shape

In [None]:
#sp.sparse.csr_matrix()*sp.sparse.csr_matrix(c)

In [27]:
# calculating average of word vectors of a document weighted by tf-idf
document_embeddings=np.zeros((len(tokenized_paded_documents),300))
words=tfidfvectoriser.get_feature_names()
for i in range(len(document_word_embeddings)):
    for j in range(len(words)):
#        document_embeddings[i]+=embedding_matrix[tokenizer.word_index[words[j]]]*tfidf_vectors[i][j]
        document_embeddings[i]+= embedding_matrix[tokenizer.word_index[words[j]]].dot(tfidf_vectors[i][j])
#        document_embeddings[i]+= tf.sparse_tensor_to_dense(embedding_matrix[tokenizer.word_index[words[j]]]*(tfidf_vectors[i][j])
#        document_embeddings[i] = np.add(document_embeddings[i],embedding_matrix[tokenizer.word_index[words[j]]].dot(tfidf_vectors[i][j]))
print (document_embeddings.shape)
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)
most_similar(0,pairwise_similarities,'Cosine Similarity')
most_similar(0,pairwise_differences,'Euclidean Distance')


TypeError: ufunc 'add' output (typecode 'O') could not be coerced to provided output parameter (typecode 'd') according to the casting rule ''same_kind''

In [None]:
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)
most_similar(0,pairwise_similarities,'Cosine Similarity')
most_similar(0,pairwise_differences,'Euclidean Distance')


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer=Tokenizer()
tokenizer.fit_on_texts(documents_df.documents_cleaned)
tokenized_documents=tokenizer.texts_to_sequences(documents_df.documents_cleaned)
tokenized_paded_documents=pad_sequences(tokenized_documents,maxlen=64,padding='post')
vocab_size=len(tokenizer.word_index)+1

# reading Glove word embeddings into a dictionary with "word" as key and values as word vectors
embeddings_index = dict()

with open('glove.6B.100d.txt') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    
# creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
embedding_matrix=np.zeros((vocab_size,100))

for word,i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
# calculating average of word vectors of a document weighted by tf-idf
document_embeddings=np.zeros((len(tokenized_paded_documents),100))
words=tfidfvectoriser.get_feature_names()

# instead of creating document-word embeddings, directly creating document embeddings
for i in range(documents_df.shape[0]):
    for j in range(len(words)):
        document_embeddings[i]+=embedding_matrix[tokenizer.word_index[words[j]]]*tfidf_vectors[i][j]
        

pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)
