<a href="https://colab.research.google.com/github/dhanaabhirajk/readfire/blob/master/python_modules/readfire.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#pip install firebase-admin

In [2]:
import firebase_admin
from firebase_admin import credentials,firestore

#connecting to db
cred = credentials.Certificate("./ServiceAccountKey.json")
default_app = firebase_admin.initialize_app(cred)
db = firestore.client()

In [3]:
#pip freeze > requirements.txt

In [4]:
import nltk
#used to remove stop words
nltk.download('stopwords')
#used in word tokenize
nltk.download('punkt')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


##Prepocessing

In [5]:
import unicodedata #to convert the sentence to unicode
import re #used to remove punctuations


#Converting unicode to ascii 
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    
def preprocess(w):
  #lowercase all the text
  w = unicode_to_ascii(w.lower().strip())

  #Remove puntuations
  w = re.sub(r"([?.!,¿])", r" ", w)
  w = re.sub(r'[" "]+', " ", w)
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  #tokenizes into words
  word_tokens = word_tokenize(w)
  
  #remove stopwords
  new_sent = [w for w in word_tokens if w not in stop_words]
  
  #join the words
  new_sent = ' '.join(str(elem) for elem in new_sent)
  
  return new_sent

In [6]:
import pandas as pd
import numpy as np
import heapq

In [7]:
# function to return the dataframe with the preprocessed articles
def get_articles():
  articles = pd.DataFrame()
  for doc in docs:
    new_dic = doc.to_dict()
    articles = articles.append({"title":new_dic["title"],"id":doc.id,"content":preprocess(new_dic["content"]),"similar_id":list(),"similar_per":list()},ignore_index=True)
  return articles

In [8]:
#getting the docs from the database
docs = db.collection(u'articles').stream()

#get the preprocessed articles
articles = get_articles()

In [9]:
articles.head()

Unnamed: 0,title,id,content,similar_id,similar_per
0,Metal Fabrication,7TDu6iZTicYvbNMDJOoU,metal fabrication process building machines st...,[],[]
1,Bio gas,G2q76a8fpjzXg3PAaGLi,biogas mixture gases primarily consisting meth...,[],[]
2,Natural language processing (NLP),L1oTvamoCndM7wLYvQ4S,natural language processing nlp refers branch ...,[],[]
3,Web Development,MAVau2RZpWtDwC9YkpoH,web development work involved developing websi...,[],[]
4,process scheduling,Y4RYcntx6PUpP9uMAoBZ,process scheduling activity process manager ha...,[],[]


##Finding similarity with word2vec techniques

In [10]:
#similarity threshold value for unique words technique
threshold_value = 0.7

In [11]:
def visualize_similarity(similarities):
  # Visualizing the similarityby turning into a dataframe 
  return pd.DataFrame(similarities,
             index=articles["title"],
             columns=articles["title"]) \
            .style \
            .background_gradient(axis=None)
  

In [12]:
def add_similarity(similarities):
  length = len(articles)
  for i in range(length):
    articles.iloc[i]["similar_per"]=[]
    articles.iloc[i]["similar_id"]=[]
    for j in range(length):
      #condition to append the articles with similarity above threshold value
      if(i!=j and similarities[i][j]>threshold_value):
        #round of the value to 3 decimal points
        articles.iloc[i]["similar_per"].append(round(similarities[i][j],3))
        articles.iloc[i]["similar_id"].append(articles.iloc[j]["id"])

In [13]:
def update_article_similarity():
  for index in range(len(articles)):
    related = list()
    #atmost top 3 related articles
    larg3 = heapq.nlargest(3, zip( articles['similar_per'][index] , articles['similar_id'][index]))
    for i in larg3:
      related.append({"id":i[1],"per":i[0]})
    db.collection(u"articles").document(articles['id'][index]).update({"related":related})

##Finding Similarity with unique words technique

##Tokenizing and storing the vocubulary

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')

tokenizer.fit_on_texts(articles["content"])

In [16]:
maxlen = 100

from tensorflow.keras.preprocessing.sequence import pad_sequences

#A function to return padded article
def get_sequences(article):
  sequences = tokenizer.texts_to_sequences([article])
  padded = pad_sequences(sequences, truncating='post', padding='post', maxlen=maxlen)
  return padded[0]

In [17]:
#stores the articles in vectors
vectors = [get_sequences(article) for article in articles["content"] ]

In [18]:
#Generating the similarity between the articles using pairwise cosine similarity 

similarities = cosine_similarity(vectors).astype('float64') #firestore needs data in float64
visualize_similarity(similarities)

title,Metal Fabrication,Bio gas,Natural language processing (NLP),Web Development,process scheduling,Hybrid Vechicle,Machine Learning,Gobar Gas,Artificial Intelligence,Data Structures,Preservatives
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Metal Fabrication,1.0,0.669798,0.383488,0.611319,0.398675,0.520715,0.718529,0.73775,0.21637,0.650051,0.682915
Bio gas,0.669798,1.0,0.288206,0.507877,0.420765,0.463284,0.660073,0.592038,0.158584,0.573745,0.594691
Natural language processing (NLP),0.383488,0.288206,1.0,0.426009,0.488615,0.427788,0.392692,0.288857,0.527628,0.476399,0.334013
Web Development,0.611319,0.507877,0.426009,1.0,0.564559,0.592934,0.573467,0.630678,0.295768,0.512281,0.537111
process scheduling,0.398675,0.420765,0.488615,0.564559,1.0,0.396771,0.399497,0.43968,0.489856,0.5437,0.326773
Hybrid Vechicle,0.520715,0.463284,0.427788,0.592934,0.396771,1.0,0.516622,0.512673,0.276662,0.523297,0.511709
Machine Learning,0.718529,0.660073,0.392692,0.573467,0.399497,0.516622,1.0,0.558234,0.208712,0.641325,0.696528
Gobar Gas,0.73775,0.592038,0.288857,0.630678,0.43968,0.512673,0.558234,1.0,0.265842,0.516184,0.680042
Artificial Intelligence,0.21637,0.158584,0.527628,0.295768,0.489856,0.276662,0.208712,0.265842,1.0,0.276736,0.297807
Data Structures,0.650051,0.573745,0.476399,0.512281,0.5437,0.523297,0.641325,0.516184,0.276736,1.0,0.606622


##Spacy

In [19]:
#downloading module from spacy
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.1 MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-py3-none-any.whl size=98051301 sha256=b6e53902b8c9911c31e6c7257dfac910f82f4e94829824a806618a67bba0cf97
  Stored in directory: /tmp/pip-ephem-wheel-cache-gh16r3l_/wheels/69/c5/b8/4f1c029d89238734311b3269762ab2ee325a42da2ce8edb997
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [20]:
#loading the smodel
import en_core_web_md
nlp = en_core_web_md.load()

In [50]:
#finding the vector for each article based on word embedding
vectors = [nlp(article).vector for article in articles["content"] ]

In [51]:
thersold_value = 0.85
similarities = cosine_similarity(vectors).astype('float64')
visualize_similarity(similarities)

title,Metal Fabrication,Bio gas,Natural language processing (NLP),Web Development,process scheduling,Hybrid Vechicle,Machine Learning,Gobar Gas,Artificial Intelligence,Data Structures,Preservatives
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Metal Fabrication,1.0,0.775775,0.725345,0.756499,0.816373,0.761018,0.797471,0.793707,0.798724,0.835291,0.716789
Bio gas,0.775775,0.999999,0.65437,0.608777,0.710187,0.820622,0.700339,0.984539,0.687438,0.731804,0.795383
Natural language processing (NLP),0.725345,0.65437,1.0,0.764634,0.769396,0.641291,0.903405,0.697867,0.896673,0.834448,0.669616
Web Development,0.756499,0.608777,0.764634,1.0,0.816647,0.642386,0.809013,0.659433,0.814347,0.862604,0.561274
process scheduling,0.816373,0.710187,0.769396,0.816647,1.0,0.751493,0.83967,0.746916,0.842014,0.894034,0.651671
Hybrid Vechicle,0.761018,0.820622,0.641291,0.642386,0.751493,0.999999,0.71843,0.838852,0.695665,0.752383,0.634955
Machine Learning,0.797471,0.700339,0.903405,0.809013,0.83967,0.71843,1.0,0.751714,0.869555,0.876129,0.694481
Gobar Gas,0.793707,0.984539,0.697867,0.659433,0.746916,0.838852,0.751714,1.0,0.717275,0.773394,0.763262
Artificial Intelligence,0.798724,0.687438,0.896673,0.814347,0.842014,0.695665,0.869555,0.717275,1.0,0.863553,0.662716
Data Structures,0.835291,0.731804,0.834448,0.862604,0.894034,0.752383,0.876129,0.773394,0.863553,1.0,0.667477


In [52]:
#adding the similarity to the dataframe
add_similarity(similarities)
articles.head()

Unnamed: 0,title,id,content,similar_id,similar_per
0,Metal Fabrication,7TDu6iZTicYvbNMDJOoU,metal fabrication process building machines st...,"[G2q76a8fpjzXg3PAaGLi, L1oTvamoCndM7wLYvQ4S, M...","[0.776, 0.725, 0.756, 0.816, 0.761, 0.797, 0.7..."
1,Bio gas,G2q76a8fpjzXg3PAaGLi,biogas mixture gases primarily consisting meth...,"[7TDu6iZTicYvbNMDJOoU, L1oTvamoCndM7wLYvQ4S, M...","[0.776, 0.654, 0.609, 0.71, 0.821, 0.7, 0.985,..."
2,Natural language processing (NLP),L1oTvamoCndM7wLYvQ4S,natural language processing nlp refers branch ...,"[7TDu6iZTicYvbNMDJOoU, G2q76a8fpjzXg3PAaGLi, M...","[0.725, 0.654, 0.765, 0.769, 0.641, 0.903, 0.6..."
3,Web Development,MAVau2RZpWtDwC9YkpoH,web development work involved developing websi...,"[7TDu6iZTicYvbNMDJOoU, G2q76a8fpjzXg3PAaGLi, L...","[0.756, 0.609, 0.765, 0.817, 0.642, 0.809, 0.6..."
4,process scheduling,Y4RYcntx6PUpP9uMAoBZ,process scheduling activity process manager ha...,"[7TDu6iZTicYvbNMDJOoU, G2q76a8fpjzXg3PAaGLi, L...","[0.816, 0.71, 0.769, 0.817, 0.751, 0.84, 0.747..."


In [53]:
# updating the article similarity in the database
update_article_similarity()

##WORD embedding

##Generating the vocabulary

In [25]:
import tensorflow as tf
import string
import io

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Lambda,TextVectorization
from tensorflow import keras
import keras.backend as K

In [27]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')


# Define the vocabulary size and the number of words in a sequence.
vocab_size = 4096
sequence_length = 10

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)


In [28]:
vectorize_layer.adapt(articles["content"])
vocab = vectorize_layer.get_vocabulary()

##CBOW Implementation

In [29]:
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
# Prepare the data for the CBOW model
def generate_data_cbow(corpus, window_size, V):
    all_in = []
    all_out = []

    # Iterate over all sentences
    for sentence in corpus:
        L = len(sentence)
        for index, word in enumerate(sentence):
            start = index - window_size
            end = index + window_size + 1

            # Empty list which will store the context words
            context_words = []
            for i in range(start, end):
                # Skip the 'same' word
                if i != index:
                    # Add a word as a context word if it is within the window size
                    if 0 <= i < L:
                        context_words.append(sentence[i])
                    else:
                        # Pad with zero if there are no words 
                        context_words.append(0)
            # Append the list with context words
            all_in.append(context_words)

            # Add one-hot encoding of the target word
            all_out.append(to_categorical(word, V))
                 
    return (np.array(all_in), np.array(all_out))

In [30]:
# Parameters
window_size = 2 
window_size_corpus = 4

# Set numpy seed for reproducible results
np.random.seed(42)
V = len(vocab)
corpus = tokenizer.texts_to_sequences(articles["content"])

In [31]:
# Create the training data
X_cbow, y_cbow = generate_data_cbow(corpus, window_size, V)
X_cbow.shape, y_cbow.shape

((799, 4), (799, 460))

In [32]:

# Create the CBOW architecture
dims = [50, 150, 300]
cbow_models = []

for dim in dims:
    cbow = Sequential()

    # Add an Embedding layer
    cbow.add(Embedding(input_dim=V, 
                       output_dim=dim, 
                       input_length=window_size*2, # Note that we now have 2L words for each input entry
                       embeddings_initializer='glorot_uniform'))

    cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim, )))

    cbow.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

    cbow.compile(optimizer=keras.optimizers.Adam(),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
    
    cbow.summary()
    print("")
    cbow_models.append(cbow)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 50)             23000     
                                                                 
 lambda (Lambda)             (None, 50)                0         
                                                                 
 dense (Dense)               (None, 460)               23460     
                                                                 
Total params: 46,460
Trainable params: 46,460
Non-trainable params: 0
_________________________________________________________________

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 150)            69000     
                                                                 
 lambda_1 (Lambda)           

In [33]:
# Train CBOW model
for cbow in cbow_models:
    cbow.fit(X_cbow, y_cbow, batch_size=64, epochs=50, verbose=1)
    print("")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/

Exporting the weigths of the best model

In [34]:
# Save embeddings for vectors of length 300 using cbow model
weights = cbow_models[2].get_weights()[0]

In [35]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [36]:
try:
  from google.colab import files
  files.download('vectors.tsv')
  files.download('metadata.tsv')
except Exception:
  pass

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##Stroing the vocubulary and vectors in a dataframe

In [37]:
vocabulary = pd.DataFrame(weights, index = vocab)

In [38]:
def document_vector(doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc.split(" ") if word in vocab]
    return np.mean(vocabulary.loc[doc], axis=0)

In [39]:
vectors = [document_vector(article) for article in articles["content"]]

In [40]:
#Generating the similarity between the articles using pairwise cosine similarity 
from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(vectors).astype('float64')

In [41]:
visualize_similarity(similarities)

title,Metal Fabrication,Bio gas,Natural language processing (NLP),Web Development,process scheduling,Hybrid Vechicle,Machine Learning,Gobar Gas,Artificial Intelligence,Data Structures,Preservatives
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Metal Fabrication,1.0,0.106261,0.153621,0.112858,0.308509,0.001812,-0.058989,0.109139,0.258089,0.115074,0.079705
Bio gas,0.106261,1.0,0.028035,0.115691,-0.021274,0.305103,0.160167,0.925588,0.264326,0.16213,0.18619
Natural language processing (NLP),0.153621,0.028035,1.0,0.18342,0.160457,0.220016,0.196787,0.073008,0.462221,0.241856,0.174615
Web Development,0.112858,0.115691,0.18342,1.0,0.026719,0.210972,0.180782,0.157666,0.219367,0.154046,0.272382
process scheduling,0.308509,-0.021274,0.160457,0.026719,1.0,-0.035458,-0.161064,-0.05404,0.063909,0.150223,0.026788
Hybrid Vechicle,0.001812,0.305103,0.220016,0.210972,-0.035458,1.0,0.275533,0.338832,0.411627,0.152626,0.240262
Machine Learning,-0.058989,0.160167,0.196787,0.180782,-0.161064,0.275533,1.0,0.284777,0.337556,0.168623,0.207321
Gobar Gas,0.109139,0.925588,0.073008,0.157666,-0.05404,0.338832,0.284777,1.0,0.338387,0.137984,0.186896
Artificial Intelligence,0.258089,0.264326,0.462221,0.219367,0.063909,0.411627,0.337556,0.338387,1.000001,0.230961,0.22998
Data Structures,0.115074,0.16213,0.241856,0.154046,0.150223,0.152626,0.168623,0.137984,0.230961,1.0,0.20377


In [42]:
threshold_value = 0.4
#adding the similarity to the dataframe
add_similarity(similarities)
articles.head()

Unnamed: 0,title,id,content,similar_id,similar_per
0,Metal Fabrication,7TDu6iZTicYvbNMDJOoU,metal fabrication process building machines st...,[],[]
1,Bio gas,G2q76a8fpjzXg3PAaGLi,biogas mixture gases primarily consisting meth...,[jDDngynd6QLl44LRO9BB],[0.926]
2,Natural language processing (NLP),L1oTvamoCndM7wLYvQ4S,natural language processing nlp refers branch ...,[mxRSMBi9Upvrx2L4oMSs],[0.462]
3,Web Development,MAVau2RZpWtDwC9YkpoH,web development work involved developing websi...,[],[]
4,process scheduling,Y4RYcntx6PUpP9uMAoBZ,process scheduling activity process manager ha...,[],[]


In [43]:
#Best of all the three methods is Spacy and CBOW implementation with custom neural network based word embeddings

##adding the similarity to the dataframe articles

In [44]:
add_similarity(similarities)

##Updating the top 3 related artocles to the article

In [45]:
update_article_similarity()