In [1]:
# Increasing recursion limit of python
import sys
sys.setrecursionlimit(5000)

In [2]:
class Node:
    def __init__(self, url):
        self.url = url
        self.title = None
        self.text = None

        self.inlinks = []
        self.outlinks = []

        self.labels = []
        self.label = None
        self.keyword = None
        self.tags = None
        self.features = None

In [None]:
# Loading NLP modules
import nltk
nltk.download("all") 
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer, PorterStemmer

nltk.download("stopwords")
stop = set(stopwords.words('english'))

In [4]:
from collections import Counter
from math import log

stoppers = set(stopwords.words('english'))
for x in [c for c in 'abcdefghijklmnopqrstuvwxyz']:stoppers.add(x)

# Processing words so that words they could be vectorized easily
def process_text(text, lem=False):

    wordnet=WordNetLemmatizer()
    sentences = nltk.sent_tokenize(text)
    corpus = []
    for i in range(len(sentences)):
        review = sentences[i]
        review = review.lower()
        review = review.split()
        review = [word for word in review if not word in stoppers]
        if lem:
            review = [wordnet.lemmatize(word) for word in review]
        review = ' '.join(review)
        corpus.append(review)    

    return corpus

# Calculating TFIDF score of a corpus by treating each sentence as a document
def tfidf(corpus):
    cnt_list = [Counter(doc.split()) for doc in corpus]
    tfidf_score = []

    for doc in cnt_list:
        if '.' in doc:
            doc.pop('.')
        s = sum([*doc.values()])
        for word in doc:
            tf = log(doc[word]/(s+1))
            idf = log(len(corpus)/(1+sum((word in d) for d in cnt_list)))

            tfidf_score.append([tf*idf, word])

    return {word: score for word, score in sorted(tfidf_score)[::-1]}

In [5]:
import pickle

with open('graph.txt','rb') as file:
    graph_dict = pickle.load(file)

graph = [*graph_dict.values()]

In [6]:
import numpy as np

for node in graph:
    text = node.text
    node.text = process_text(text)
    node.keyword = tfidf(process_text(text))

    # node.tags = tfidf(node.text, 5)
    
    if len(node.labels)>0:
        temp = [0,0,0,0]
        for x in node.labels:
            temp[int(x)]+=1
        node.labels = temp
        node.label = temp.index(max(temp))
    else:
        node.label = None
        node.labels = np.zeros(4)
    

In [7]:
graph[4].keyword

{-1.18164133370001: 'terms',
 -1.6620943476182115: 'expansion',
 -1.7778874154035074: 'expansion',
 -1.8770773617087897: 'terms',
 -1.9638338413573804: 'terms',
 -1.9839264708124529: 'expansion',
 -2.0992823467836748: 'products',
 -2.1733589754333065: 'expansion',
 -2.2455329753535485: 'products',
 -2.2845000312564268: 'expansion',
 -2.351102985779819: 'powers',
 -2.36328266740002: 'term',
 -2.6343572136354596: 'multiplied',
 -2.6831924662487343: 'sum',
 -2.69900011003203: 'polynomial',
 -2.745027162199949: 'products',
 -2.8178848839186164: 'sum',
 -2.9667616548783577: 'expanding',
 -2.975097229261034: 'multiplied',
 -3.1125960782457724: 'products',
 -3.324188695236423: 'also',
 -3.38581192303021: 'expression',
 -3.4446924766675435: 'obtained',
 -3.486602935836834: 'pascal',
 -3.5557748308070147: 'multiplication',
 -3.620846882437746: 'obtained',
 -3.65851029471775: 'coefficients',
 -3.7541547234175794: 'factors',
 -3.905949070844612: 'multiplied',
 -3.9276676827147607: 'coefficients',

In [8]:
graph_dict = {k:v for k,v in zip([*graph_dict.keys()], graph)}

with open('editedGraph.txt','wb') as file:
    pickle.dump(graph_dict, file)