<a href="https://colab.research.google.com/github/dhanaabhirajk/readfire/blob/master/readfire.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#pip install firebase-admin

In [2]:
import firebase_admin
from firebase_admin import credentials,firestore

#connecting to db
# Load the service-account credentials from the key file next to the notebook.
cred = credentials.Certificate("./ServiceAccountKey.json")
# initialize_app() raises ValueError when the default app already exists, which
# made this cell fail on every re-run in the same kernel. Reuse the existing
# app when there is one and only initialise on the first run.
try:
  default_app = firebase_admin.get_app()
except ValueError:
  default_app = firebase_admin.initialize_app(cred)
# Firestore client used by every read/write below.
db = firestore.client()

In [3]:
#pip freeze > requirements.txt

In [4]:
import nltk
# Download the stop-word corpus that `stop_words` below is built from.
nltk.download('stopwords')
# Download the 'punkt' data used by word_tokenize.
nltk.download('punkt')

from nltk.corpus import stopwords
# English stop words kept as a set so the membership test in preprocess()
# is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Preprocessing

In [5]:
import unicodedata #to convert the sentence to unicode
import re #used to remove punctuations


#Converting unicode to ascii 
def unicode_to_ascii(s):
    """Return ``s`` with combining marks (accents) removed.

    The string is decomposed with NFD so that accented characters become a
    base character followed by combining marks; every character whose Unicode
    category is 'Mn' is then dropped. Characters that have no such
    decomposition are returned unchanged, so the result is not guaranteed to
    be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
    
def preprocess(w):
  """Normalise one article text into a space-separated string of kept tokens.

  Steps, in order: lowercase and strip the text, remove combining marks via
  unicode_to_ascii, replace punctuation and every other run of non-letter
  characters with a single space, tokenize with word_tokenize, drop tokens
  found in `stop_words`, and join what is left with single spaces.

  Args:
    w: raw text of the article.

  Returns:
    The cleaned text as a single string.
  """
  text = unicode_to_ascii(w.lower().strip())

  # Applied in this exact order: punctuation -> space, collapse runs of
  # quotes/spaces, then anything outside the allowed characters -> space.
  substitutions = (
      (r"([?.!,¿])", r" "),
      (r'[" "]+', " "),
      (r"[^a-zA-Z?.!,¿]+", " "),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)

  # Tokenize and keep only the tokens that are not stop words.
  kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]

  return ' '.join(str(token) for token in kept_tokens)

In [34]:
import pandas as pd
import numpy as np
import heapq

In [7]:
# function to return the dataframe with the preprocessed articles
def get_articles(documents=None):
  """Build the articles DataFrame from Firestore document snapshots.

  Args:
    documents: optional iterable of snapshots exposing `.id` and `.to_dict()`
      (with "title" and "content" keys). When omitted, the module-level
      `docs` is used, which keeps the original zero-argument call working.

  Returns:
    A DataFrame with one row per document and the columns
    title, id, content (preprocessed text), similar_id, similar_per.
    The two `similar_*` columns start as empty lists. An empty input yields
    an empty frame that still has these columns.
  """
  if documents is None:
    documents = docs

  # Collect plain records and build the frame once: DataFrame.append was
  # removed in pandas 2.0 and re-copied the whole frame on every row.
  records = []
  for doc in documents:
    fields = doc.to_dict()
    records.append({
        "title": fields["title"],
        "id": doc.id,
        "content": preprocess(fields["content"]),
        "similar_id": [],
        "similar_per": [],
    })

  return pd.DataFrame(
      records,
      columns=["title", "id", "content", "similar_id", "similar_per"])

In [8]:
#getting the docs from the database
# `docs` is the module-level iterable that get_articles() reads.
# NOTE(review): stream() presumably returns a one-shot iterator, so this cell
# has to be re-run before get_articles() is called again — TODO confirm.
docs = db.collection(u'articles').stream()

#get the preprocessed articles
# One row per Firestore document; "content" holds the preprocessed text.
articles = get_articles()

In [9]:
articles.head()

Unnamed: 0,title,id,content,similar_id,similar_per
0,Bio gas,G2q76a8fpjzXg3PAaGLi,biogas mixture gases primarily consisting meth...,[],[]
1,Natural language processing (NLP),L1oTvamoCndM7wLYvQ4S,natural language processing nlp refers branch ...,[],[]
2,Web Development,MAVau2RZpWtDwC9YkpoH,web development work involved developing websi...,[],[]
3,process scheduling,Y4RYcntx6PUpP9uMAoBZ,process scheduling activity process manager ha...,[],[]
4,Hybrid Vechicle,aT99AvBvVW43U1bEXn9p,hybrid electric vehicles powered internal comb...,[],[]


## Finding similarity with word2vec techniques

In [10]:
# Similarity cut-off read by add_similarity(): only article pairs whose score
# is strictly greater than this value are recorded as related.
threshold_value = 0.4

In [11]:
def visualize_similarity(similarities):
  """Return the similarity matrix as a styled, title-labelled table.

  Args:
    similarities: square matrix of pairwise scores, ordered like the rows of
      the module-level `articles` frame.

  Returns:
    A pandas Styler over a DataFrame whose rows and columns are the article
    titles, shaded with one colour gradient across the whole table.
  """
  titles = articles["title"]
  similarity_frame = pd.DataFrame(similarities, index=titles, columns=titles)
  return similarity_frame.style.background_gradient(axis=None)
  

In [12]:
def add_similarity(similarities):
  """Record, for every article, the other articles scoring above the threshold.

  Overwrites the "similar_per" and "similar_id" columns of the module-level
  `articles` frame. For row i, "similar_per" holds the scores (rounded to 3
  decimals) of every other article j with similarities[i][j] strictly greater
  than `threshold_value`, and "similar_id" holds the matching article ids in
  the same order.

  Args:
    similarities: square matrix of pairwise scores, ordered like the rows of
      `articles`.
  """
  ids = articles["id"].tolist()
  count = len(ids)

  per_column = []
  id_column = []
  for i in range(count):
    #condition to keep the articles with similarity above threshold value
    matches = [j for j in range(count)
               if i != j and similarities[i][j] > threshold_value]
    #round of the value to 3 decimal points
    per_column.append([round(similarities[i][j], 3) for j in matches])
    id_column.append([ids[j] for j in matches])

  # Assign whole columns instead of `articles.iloc[i][col] = []`: that chained
  # form writes to a temporary row object, so whether the reset reached the
  # frame depended on pandas returning a view. dtype=object keeps each cell a
  # Python list even when all lists have the same length.
  articles["similar_per"] = pd.Series(per_column, index=articles.index, dtype=object)
  articles["similar_id"] = pd.Series(id_column, index=articles.index, dtype=object)

In [13]:
def update_article_similarity():
  """Write each article's top related articles back to Firestore.

  For every row of the module-level `articles` frame, picks at most the three
  highest (score, id) pairs from "similar_per"/"similar_id" and stores them as
  a list of {"id", "per"} dicts in the "related" field of that article's
  document. Rows are looked up by label 0..len-1, so the frame is assumed to
  carry its default integer index.
  """
  for row in range(len(articles)):
    scored = zip(articles['similar_per'][row], articles['similar_id'][row])
    #atmost top 3 related articles
    related = [{"id": article_id, "per": score}
               for score, article_id in heapq.nlargest(3, scored)]
    db.collection(u"articles").document(articles['id'][row]).update({"related": related})

## Finding similarity with the unique words technique

## Tokenizing and storing the vocabulary

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Keras tokenizer capped at 10000 words, with '<UNK>' as the out-of-vocabulary
# token. The same tokenizer is reused later to build the CBOW corpus.
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')

# Build the word index from the preprocessed article texts.
tokenizer.fit_on_texts(articles["content"])

In [15]:
maxlen = 100

from tensorflow.keras.preprocessing.sequence import pad_sequences

#A function to return padded article
def get_sequences(article):
  """Encode one article as a fixed-length sequence of tokenizer word ids.

  The text is converted with the module-level `tokenizer`, then padded or
  truncated at the end ('post') to `maxlen` entries.

  Args:
    article: preprocessed article text.

  Returns:
    The padded id sequence for this single article.
  """
  token_ids = tokenizer.texts_to_sequences([article])
  return pad_sequences(token_ids, maxlen=maxlen, padding='post', truncating='post')[0]

In [16]:
#stores the articles in vectors
# Each article becomes the padded sequence of tokenizer word ids returned by
# get_sequences().
vectors = [get_sequences(article) for article in articles["content"] ]

In [19]:
#Generating the similarity between the articles using pairwise cosine similarity
# NOTE(review): `vectors` here are padded sequences of token ids, so the score
# depends on the integer assigned to each word and on word position rather
# than on shared vocabulary — confirm this baseline is intended.

similarities = cosine_similarity(vectors).astype('float64') #firestore needs data in float64
visualize_similarity(similarities)

title,Bio gas,Natural language processing (NLP),Web Development,process scheduling,Hybrid Vechicle,Gobar Gas,Artificial Intelligence
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bio gas,1.0,0.332942,0.543996,0.426816,0.518778,0.612573,0.232432
Natural language processing (NLP),0.332942,1.0,0.486157,0.623858,0.424902,0.323824,0.714578
Web Development,0.543996,0.486157,1.0,0.594141,0.64634,0.621359,0.480245
process scheduling,0.426816,0.623858,0.594141,1.0,0.411414,0.454452,0.615517
Hybrid Vechicle,0.518778,0.424902,0.64634,0.411414,1.0,0.502552,0.344375
Gobar Gas,0.612573,0.323824,0.621359,0.454452,0.502552,1.0,0.294951
Artificial Intelligence,0.232432,0.714578,0.480245,0.615517,0.344375,0.294951,1.0


## spaCy

In [22]:
#downloading module from spacy
!python -m spacy download en_core_web_md

Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-py3-none-any.whl size=98051301 sha256=680ee05c8c380be6934e16b1e9f61fcd4afbd6ecc468bf3667ee76eeebb8a7b0
  Stored in directory: /tmp/pip-ephem-wheel-cache-hsfoqaqy/wheels/69/c5/b8/4f1c029d89238734311b3269762ab2ee325a42da2ce8edb997
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [23]:
#loading the smodel
import en_core_web_md
nlp = en_core_web_md.load()

In [24]:
#finding the vector for each article based on word embedding
vectors = [nlp(article).vector for article in articles["content"] ]

In [25]:
similarities = cosine_similarity(vectors).astype('float64')
visualize_similarity(similarities)

title,Bio gas,Natural language processing (NLP),Web Development,process scheduling,Hybrid Vechicle,Gobar Gas,Artificial Intelligence
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bio gas,0.999999,0.65437,0.608777,0.710187,0.820622,0.984539,0.687438
Natural language processing (NLP),0.65437,1.0,0.764634,0.769396,0.641291,0.697867,0.896673
Web Development,0.608777,0.764634,1.0,0.816647,0.642386,0.659433,0.814347
process scheduling,0.710187,0.769396,0.816647,1.0,0.751493,0.746916,0.842014
Hybrid Vechicle,0.820622,0.641291,0.642386,0.751493,0.999999,0.838852,0.695665
Gobar Gas,0.984539,0.697867,0.659433,0.746916,0.838852,1.0,0.717275
Artificial Intelligence,0.687438,0.896673,0.814347,0.842014,0.695665,0.717275,1.0


In [26]:
#adding the similarity to the dataframe
add_similarity(similarities)
articles.head()

Unnamed: 0,title,id,content,similar_id,similar_per
0,Bio gas,G2q76a8fpjzXg3PAaGLi,biogas mixture gases primarily consisting meth...,"[L1oTvamoCndM7wLYvQ4S, MAVau2RZpWtDwC9YkpoH, Y...","[0.654, 0.609, 0.71, 0.821, 0.985, 0.687]"
1,Natural language processing (NLP),L1oTvamoCndM7wLYvQ4S,natural language processing nlp refers branch ...,"[G2q76a8fpjzXg3PAaGLi, MAVau2RZpWtDwC9YkpoH, Y...","[0.654, 0.765, 0.769, 0.641, 0.698, 0.897]"
2,Web Development,MAVau2RZpWtDwC9YkpoH,web development work involved developing websi...,"[G2q76a8fpjzXg3PAaGLi, L1oTvamoCndM7wLYvQ4S, Y...","[0.609, 0.765, 0.817, 0.642, 0.659, 0.814]"
3,process scheduling,Y4RYcntx6PUpP9uMAoBZ,process scheduling activity process manager ha...,"[G2q76a8fpjzXg3PAaGLi, L1oTvamoCndM7wLYvQ4S, M...","[0.71, 0.769, 0.817, 0.751, 0.747, 0.842]"
4,Hybrid Vechicle,aT99AvBvVW43U1bEXn9p,hybrid electric vehicles powered internal comb...,"[G2q76a8fpjzXg3PAaGLi, L1oTvamoCndM7wLYvQ4S, M...","[0.821, 0.641, 0.642, 0.751, 0.839, 0.696]"


In [27]:
# updating the article similarity in the database
update_article_similarity()

## Word embedding

## Generating the vocabulary

In [28]:
import tensorflow as tf
import string
import io

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Lambda,TextVectorization
from tensorflow import keras
import keras.backend as K

In [30]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  """Lowercase `input_data` and delete every `string.punctuation` character.

  Args:
    input_data: string tensor handed over by the TextVectorization layer.

  Returns:
    The lowercased tensor with punctuation characters removed.
  """
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  lowered = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowered, punctuation_pattern, '')


# Define the vocabulary size and the number of words in a sequence.
# vocab_size is passed as max_tokens, i.e. an upper bound on the vocabulary.
vocab_size = 4096
sequence_length = 10

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
# In the visible notebook this layer is only adapted and then queried for its
# vocabulary; the CBOW corpus itself is produced by the Keras `tokenizer`.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)


In [31]:
vectorize_layer.adapt(articles["content"])
vocab = vectorize_layer.get_vocabulary()

## CBOW implementation

In [32]:
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
# Prepare the data for the CBOW model
def generate_data_cbow(corpus, window_size, V):
    """Build (context, target) training pairs for a CBOW model.

    For every word of every sentence, the context is the `window_size` words
    on each side of it (the word itself excluded); positions that fall outside
    the sentence are filled with 0. The target is the one-hot encoding of the
    word over `V` classes.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        window_size: number of context positions taken on each side.
        V: number of classes used for the one-hot targets.

    Returns:
        Tuple (contexts, targets) of numpy arrays with one row per word in
        the corpus.
    """
    contexts = []
    targets = []

    for sentence in corpus:
        sentence_length = len(sentence)
        for position, target_word in enumerate(sentence):
            # Walk the window left to right, skipping the target position and
            # padding with 0 wherever the window runs past the sentence.
            neighbours = [
                sentence[offset] if 0 <= offset < sentence_length else 0
                for offset in range(position - window_size, position + window_size + 1)
                if offset != position
            ]
            contexts.append(neighbours)

            # One-hot encoding of the target word
            targets.append(to_categorical(target_word, V))

    return (np.array(contexts), np.array(targets))

In [35]:
# Parameters
# Number of context words taken on each side of the target word.
window_size = 2 
# NOTE(review): window_size_corpus is not referenced anywhere else in the
# visible notebook — confirm whether it is still needed.
window_size_corpus = 4

# Set numpy seed for reproducible results
# NOTE(review): only numpy is seeded here; TensorFlow's own random state is
# presumably unaffected, so model initialisation may still vary — TODO confirm.
np.random.seed(42)
# Number of output classes = size of the TextVectorization vocabulary.
V = len(vocab)
# NOTE(review): the corpus ids come from the Keras `tokenizer`, while V and
# `vocab` come from `vectorize_layer`. The two indexes are built
# independently, so id i is only assumed to mean vocab[i] and to be < V —
# verify before relying on the exported embeddings.
corpus = tokenizer.texts_to_sequences(articles["content"])

In [36]:
# Create the training data
X_cbow, y_cbow = generate_data_cbow(corpus, window_size, V)
X_cbow.shape, y_cbow.shape

((444, 4), (444, 238))

In [37]:

# Create the CBOW architecture
# One model is built per embedding size; they are kept in `cbow_models` in the
# same order as `dims`, so cbow_models[2] is the 300-dimensional model.
dims = [50, 150, 300]
cbow_models = []

for dim in dims:
    cbow = Sequential()

    # Add an Embedding layer
    cbow.add(Embedding(input_dim=V, 
                       output_dim=dim, 
                       input_length=window_size*2, # Note that we now have 2L words for each input entry
                       embeddings_initializer='glorot_uniform'))

    # Average the context-word embeddings along axis 1 into a single vector of
    # size `dim`.
    cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim, )))

    # Softmax over the V classes: one probability per candidate target word.
    cbow.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

    # categorical_crossentropy matches the one-hot targets from
    # generate_data_cbow().
    cbow.compile(optimizer=keras.optimizers.Adam(),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
    
    cbow.summary()
    print("")
    cbow_models.append(cbow)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 50)             11900     
                                                                 
 lambda (Lambda)             (None, 50)                0         
                                                                 
 dense (Dense)               (None, 238)               12138     
                                                                 
Total params: 24,038
Trainable params: 24,038
Non-trainable params: 0
_________________________________________________________________

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 150)            35700     
                                                                 
 lambda_1 (Lambda)           

In [38]:
# Train CBOW model
# Every model in `cbow_models` is trained on the same (X_cbow, y_cbow) data
# for 50 epochs with batches of 64; the blank print separates their logs.
for cbow in cbow_models:
    cbow.fit(X_cbow, y_cbow, batch_size=64, epochs=50, verbose=1)
    print("")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/

Exporting the weights of the best model

In [39]:
# Save embeddings for vectors of length 300 using cbow model
# cbow_models[2] is the model built with dims[2] == 300. get_weights()[0] is
# its first weight array — presumably the Embedding matrix, since Embedding is
# the first layer added; verify the shape is (V, 300).
weights = cbow_models[2].get_weights()[0]

In [40]:
# Export the embeddings as two parallel TSV files: one vector per line in
# vectors.tsv and the matching word on the same line of metadata.tsv.
# The `with` block closes both files even if a write or a lookup raises;
# previously the handles leaked whenever the loop failed part-way.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

In [41]:
# Offer the exported files as browser downloads when running in Google Colab.
# This stays best-effort (nothing here stops the notebook), but failures are
# now reported instead of being silently discarded.
try:
  from google.colab import files
except ImportError:
  # Not running in Colab: the files are already in the working directory.
  print("google.colab is unavailable; vectors.tsv and metadata.tsv were left in the working directory.")
else:
  for exported in ('vectors.tsv', 'metadata.tsv'):
    try:
      files.download(exported)
    except Exception as err:
      print("Could not download %s: %s" % (exported, err))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Storing the vocabulary and vectors in a dataframe

In [42]:
# Lookup table with one row per vocabulary word (the index is the word) and
# one column per embedding dimension; read by document_vector().
# NOTE(review): assumes row i of `weights` is the embedding of vocab[i] —
# verify, since the training ids were produced by a different tokenizer.
vocabulary = pd.DataFrame(weights, index = vocab)

In [43]:
def document_vector(doc):
    """Average the embeddings of the in-vocabulary words of one document.

    Args:
        doc: preprocessed article text with whitespace-separated words.

    Returns:
        A pandas Series with one value per embedding dimension: the mean of
        the rows of the module-level `vocabulary` frame for every word of
        `doc` found in its index. When no word is known (including an empty
        document), a zero vector of the same length is returned instead of a
        NaN vector, which cosine_similarity would reject.
    """
    # remove out-of-vocabulary words
    # split() without an argument never yields '' (split(" ") does for empty
    # or double-spaced text, and '' may match the padding entry of the
    # vocabulary). Membership is tested on the frame's index, which is a hash
    # lookup instead of a scan of the vocab list.
    known_words = [word for word in doc.split() if word in vocabulary.index]
    if not known_words:
        return pd.Series(0.0, index=vocabulary.columns)
    return vocabulary.loc[known_words].mean(axis=0)

In [44]:
vectors = [document_vector(article) for article in articles["content"]]

In [45]:
#Generating the similarity between the articles using pairwise cosine similarity 
from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(vectors).astype('float64')

In [46]:
visualize_similarity(similarities)

title,Bio gas,Natural language processing (NLP),Web Development,process scheduling,Hybrid Vechicle,Gobar Gas,Artificial Intelligence
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bio gas,1.0,0.156995,-0.14272,-0.051127,0.242414,0.942099,0.298123
Natural language processing (NLP),0.156995,1.0,-0.04469,-0.030622,0.194836,0.21369,0.463299
Web Development,-0.14272,-0.04469,1.0,0.022613,0.019174,-0.209064,-0.010305
process scheduling,-0.051127,-0.030622,0.022613,1.0,0.010521,-0.047312,0.156891
Hybrid Vechicle,0.242414,0.194836,0.019174,0.010521,1.0,0.251481,0.160158
Gobar Gas,0.942099,0.21369,-0.209064,-0.047312,0.251481,1.0,0.346066
Artificial Intelligence,0.298123,0.463299,-0.010305,0.156891,0.160158,0.346066,1.0


In [47]:
threshold_value = 0.4
#adding the similarity to the dataframe
add_similarity(similarities)
articles.head()

Unnamed: 0,title,id,content,similar_id,similar_per
0,Bio gas,G2q76a8fpjzXg3PAaGLi,biogas mixture gases primarily consisting meth...,[jDDngynd6QLl44LRO9BB],[0.942]
1,Natural language processing (NLP),L1oTvamoCndM7wLYvQ4S,natural language processing nlp refers branch ...,[mxRSMBi9Upvrx2L4oMSs],[0.463]
2,Web Development,MAVau2RZpWtDwC9YkpoH,web development work involved developing websi...,[],[]
3,process scheduling,Y4RYcntx6PUpP9uMAoBZ,process scheduling activity process manager ha...,[],[]
4,Hybrid Vechicle,aT99AvBvVW43U1bEXn9p,hybrid electric vehicles powered internal comb...,[],[]


In [48]:
#Best of all the three methods is CBOW implementation with custom neural network based word embeddings

## Adding the similarity to the articles dataframe

In [49]:
add_similarity(similarities)

## Updating the top 3 related articles for each article

In [50]:
update_article_similarity()