<a href="https://colab.research.google.com/github/dhanaabhirajk/readfire/blob/master/readfire.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#pip install firebase-admin

In [None]:
import firebase_admin
from firebase_admin import credentials,firestore

#connecting to db
cred = credentials.Certificate("./ServiceAccountKey.json")
# Re-running this cell must not fail: initialize_app() raises ValueError when
# the default app already exists in the kernel, so reuse it when present.
try:
    default_app = firebase_admin.get_app()
except ValueError:
    default_app = firebase_admin.initialize_app(cred)
db = firestore.client()

In [None]:
import nltk
#used to remove stop words
nltk.download('stopwords')
#used in word tokenize
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' rather than
# 'punkt'; without it word_tokenize raises LookupError there. Fetching both
# keeps the notebook working on old and new NLTK versions.
nltk.download('punkt_tab')

from nltk.corpus import stopwords
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Preprocessing

In [None]:
import unicodedata #to convert the sentence to unicode
import re #used to remove punctuations


#Converting unicode to ascii 
def unicode_to_ascii(s):
    """Return ``s`` with combining marks removed.

    The string is decomposed with NFD so that accented letters split into a
    base character plus combining marks; every character whose Unicode
    category is 'Mn' (nonspacing mark) is then dropped. Characters that have
    no such decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)
    
def preprocess(w):
  """Turn raw article text into a space-separated string of kept words.

  Steps, in order: lowercase and trim ``w`` and strip combining accents with
  unicode_to_ascii; blank out punctuation and every remaining non-letter
  character; split into tokens with word_tokenize; drop tokens present in
  ``stop_words``; join the survivors with single spaces.
  """
  text = unicode_to_ascii(w.lower().strip())

  # Cleanup patterns, applied one after another:
  #   1. sentence punctuation -> space
  #   2. runs of spaces / double quotes -> one space
  #   3. runs of anything that is not a letter (or listed punctuation) -> one space
  cleanup_steps = (
      (r"([?.!,¿])", r" "),
      (r'[" "]+', " "),
      (r"[^a-zA-Z?.!,¿]+", " "),
  )
  for pattern, replacement in cleanup_steps:
    text = re.sub(pattern, replacement, text)

  # Keep only the tokens that are not stop words.
  kept_words = []
  for token in word_tokenize(text):
    if token not in stop_words:
      kept_words.append(str(token))

  return ' '.join(kept_words)

In [None]:
import pandas as pd

In [None]:
# function to return the dataframe with the preprocessed articles
def get_articles(documents=None):
  """Build a dataframe of preprocessed articles.

  Args:
    documents: optional iterable of document snapshots exposing ``.id`` and
      ``.to_dict()`` (with a "content" key). When omitted, the notebook-level
      ``docs`` is used, so existing ``get_articles()`` calls keep working.
      If it is a one-shot iterator, a second call sees no documents.

  Returns:
    A DataFrame with the columns ``id``, ``content`` (preprocessed text),
    ``similar_id`` and ``similar_per``; each row starts with its own empty
    list in the last two columns. With no documents the frame is empty but
    still carries these columns.
  """
  if documents is None:
    documents = docs

  # Collect plain records and build the frame once: DataFrame.append was
  # removed in pandas 2.0, and appending row by row copies the frame each time.
  records = [
      {
          "id": doc.id,
          "content": preprocess(doc.to_dict()["content"]),
          "similar_id": [],
          "similar_per": [],
      }
      for doc in documents
  ]
  return pd.DataFrame(records, columns=["id", "content", "similar_id", "similar_per"])

In [None]:
# Open a stream over every document in the Firestore "articles" collection.
articles_collection = db.collection(u'articles')
docs = articles_collection.stream()

# get_articles() reads the module-level `docs` and returns one preprocessed
# row per streamed document.
articles = get_articles()

In [None]:
# Preview the first rows of the preprocessed articles frame.
articles.head()

Unnamed: 0,id,content,similar_id,similar_per
0,G2q76a8fpjzXg3PAaGLi,biogas mixture gases primarily consisting meth...,[],[]
1,L1oTvamoCndM7wLYvQ4S,natural language processing nlp refers branch ...,[],[]
2,MAVau2RZpWtDwC9YkpoH,web development work involved developing websi...,[],[]
3,Y4RYcntx6PUpP9uMAoBZ,process scheduling activity process manager ha...,[],[]
4,aT99AvBvVW43U1bEXn9p,hybrid electric vehicles powered internal comb...,[],[]


## Tokenizing and storing the vocabulary

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the vocabulary from the preprocessed article texts. Words outside the
# kept vocabulary are encoded with the '<UNK>' placeholder token.
tokenizer = Tokenizer(oov_token='<UNK>', num_words=10000)
tokenizer.fit_on_texts(articles["content"])

In [None]:
maxlen = 100

from tensorflow.keras.preprocessing.sequence import pad_sequences

#A function to return padded article
def get_sequences(tokenizer, article):
  """Encode a single article as one fixed-length row of token ids.

  The article text is converted with ``tokenizer.texts_to_sequences`` and the
  result is passed through ``pad_sequences`` so that it is padded and
  truncated at the end to the notebook-level ``maxlen``.
  """
  encoded = tokenizer.texts_to_sequences([article])
  return pad_sequences(
      encoded,
      maxlen=maxlen,
      padding='post',
      truncating='post',
  )

In [None]:
# Display the preprocessed text column of the articles frame.
articles["content"]

0    biogas mixture gases primarily consisting meth...
1    natural language processing nlp refers branch ...
2    web development work involved developing websi...
3    process scheduling activity process manager ha...
4    hybrid electric vehicles powered internal comb...
5    biogas compressed removal carbon dioxide hydro...
6    artificial intelligence simulation human intel...
Name: content, dtype: object

In [None]:

# import required libraries
import numpy as np
from numpy.linalg import norm
# define two lists or array
def get_cosine_similarity(l1,l2):
  """Return the cosine similarity of two vectors, rounded to 3 decimals.

  Args:
    l1, l2: 2-D array-likes whose first row is the vector to compare, e.g.
      the single-row output of get_sequences.

  Returns:
    A Python float. 0.0 is returned when either vector is all zeros, because
    the cosine is undefined there and the division would otherwise yield NaN.
  """
  # Work in float64: the rows are integer token ids, and an integer dot
  # product of large ids can silently overflow a 32-bit accumulator.
  v1 = np.asarray(l1[0], dtype=np.float64)
  v2 = np.asarray(l2[0], dtype=np.float64)

  denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
  if denominator == 0:
    return 0.0
  return round(float(v1.dot(v2) / denominator), 3)

## Adding the similarity scores to the `articles` dataframe

In [None]:
# Pad every article once up front instead of re-encoding both sides of each
# pair inside the nested loop.
padded_articles = [get_sequences(tokenizer, text) for text in articles["content"]]
article_ids = list(articles["id"])
length = len(articles)

# Build fresh lists and assign them as whole columns, so re-running this cell
# replaces the previous results instead of appending duplicates to them.
# NOTE(review): the vectors compared here are padded token-id sequences, so
# the score depends on which ids sit at which position; confirm this is the
# intended notion of similarity.
similar_ids = []
similar_pers = []
for i in range(length):
  ids_for_article = []
  pers_for_article = []
  for j in range(length):
    if i != j:
      similarity = get_cosine_similarity(padded_articles[i], padded_articles[j])
      pers_for_article.append(similarity)
      ids_for_article.append(article_ids[j])
  similar_ids.append(ids_for_article)
  similar_pers.append(pers_for_article)

# dtype=object keeps each cell as a Python list aligned to the frame's index.
articles["similar_id"] = pd.Series(similar_ids, index=articles.index, dtype=object)
articles["similar_per"] = pd.Series(similar_pers, index=articles.index, dtype=object)

In [None]:
# Preview the frame again now that the similarity columns have been filled in.
articles.head()

Unnamed: 0,id,content,similar_id,similar_per
0,G2q76a8fpjzXg3PAaGLi,biogas mixture gases primarily consisting meth...,"[L1oTvamoCndM7wLYvQ4S, MAVau2RZpWtDwC9YkpoH, Y...","[0.333, 0.544, 0.427, 0.519, 0.613, 0.232]"
1,L1oTvamoCndM7wLYvQ4S,natural language processing nlp refers branch ...,"[G2q76a8fpjzXg3PAaGLi, MAVau2RZpWtDwC9YkpoH, Y...","[0.333, 0.486, 0.624, 0.425, 0.324, 0.715]"
2,MAVau2RZpWtDwC9YkpoH,web development work involved developing websi...,"[G2q76a8fpjzXg3PAaGLi, L1oTvamoCndM7wLYvQ4S, Y...","[0.544, 0.486, 0.594, 0.646, 0.621, 0.48]"
3,Y4RYcntx6PUpP9uMAoBZ,process scheduling activity process manager ha...,"[G2q76a8fpjzXg3PAaGLi, L1oTvamoCndM7wLYvQ4S, M...","[0.427, 0.624, 0.594, 0.411, 0.454, 0.616]"
4,aT99AvBvVW43U1bEXn9p,hybrid electric vehicles powered internal comb...,"[G2q76a8fpjzXg3PAaGLi, L1oTvamoCndM7wLYvQ4S, M...","[0.519, 0.425, 0.646, 0.411, 0.503, 0.344]"


## Updating each article with its top 3 related articles

In [None]:
import heapq

for row in range(len(articles)):
  # Pair each score with its article id; tuples sort by score first, so the
  # three largest tuples are the three most similar articles.
  scored = zip(articles['similar_per'][row], articles['similar_id'][row])
  top_three = heapq.nlargest(3, scored)

  related = [{"id": doc_id, "per": score} for score, doc_id in top_three]

  # Write the ranking into the "related" field of this article's document.
  article_ref = db.collection(u"articles").document(articles['id'][row])
  article_ref.update({"related": related})