In [30]:
import import_ipynb
from news_classes import *
import math
from textblob import TextBlob as tb
import os
import pickle
from collections import Counter
from textblob import TextBlob as tb
import numpy as np

In [160]:
class lexrank:
    """Score sentences by their centrality in a thresholded similarity graph.

    Typical use: call ``load_idf()`` (or assign ``idfblob`` directly), build the
    transition matrix with ``cosine_matrix(sentences)``, then pass it to
    ``power_method`` to obtain one score per sentence.

    Attributes:
        idf_path: path of the pickled idf object read by ``load_idf``.
        threshold: similarity at or above which two sentences are linked.
        tolerance: L1 change between iterations below which ``power_method``
            stops.
        idfblob: set by ``load_idf``; must expose ``size()`` and
            ``getIdf(word)``.
    """

    def __init__(self, threshold, tolerance):
        """Store the linking threshold and the convergence tolerance."""
        self.idf_path = 'idf_path.pkl'
        self.tolerance = tolerance
        self.threshold = threshold

    def load_idf(self):
        """Unpickle the object stored at ``self.idf_path`` into ``self.idfblob``."""
        # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
        # Uses self.idf_path: the bare name `idf_path` only resolved when a global
        # of that name happened to exist in the kernel.
        with open(self.idf_path, 'rb') as idf_file:
            self.idfblob = pickle.load(idf_file)

    def get_idf(self, word):
        """Return ``log(idfblob.size() / (1 + idfblob.getIdf(word)))``.

        NOTE(review): presumably ``size()`` is the corpus size and ``getIdf``
        a document frequency -- confirm against the idf class.
        """
        return math.log(self.idfblob.size() / (1 + self.idfblob.getIdf(word)))

    def cosine_distance(self, sentence1, sentence2):
        """Return the idf-weighted cosine similarity of two sentences.

        Both arguments must expose a ``.words`` iterable. Returns 0 when either
        sentence has a zero-length weighted vector (e.g. it has no words).
        """
        counts1 = Counter(sentence1.words)
        counts2 = Counter(sentence2.words)

        vocabulary = set(counts1)
        vocabulary.update(counts2)

        up = 0
        downright = 0
        downleft = 0
        for word in vocabulary:
            weight = self.get_idf(word)  # computed once per word
            cnt1 = counts1[word]  # Counter yields 0 for absent words
            cnt2 = counts2[word]
            up += cnt1 * cnt2 * weight * weight
            downright += cnt1 * weight * cnt1 * weight
            downleft += cnt2 * weight * cnt2 * weight

        denominator = math.sqrt(downleft) * math.sqrt(downright)
        if denominator == 0:
            # Explicit guard instead of a bare `except:` that hid every error.
            return 0
        return up / denominator

    def cosine_matrix(self, sentences):
        """Build the row-normalised adjacency matrix of the sentence graph.

        Two sentences are linked (entry 1) when their similarity is at least
        ``self.threshold``; each row is then divided by its sum. A row with no
        links at all is replaced by a uniform row so the result never contains
        NaN and every row still sums to 1.
        """
        n = len(sentences)
        dist_matrix = np.zeros(shape=(n, n))
        # The matrix is symmetric, so one triangle is enough. The argument
        # order (larger index first) matches the value the full double loop
        # would have written last.
        for i in range(n):
            for j in range(i + 1):
                similarity = self.cosine_distance(sentences[i], sentences[j])
                linked = 1 if similarity >= self.threshold else 0
                dist_matrix[i][j] = linked
                dist_matrix[j][i] = linked

        for i in range(n):
            row_total = dist_matrix[i].sum()
            if row_total > 0:
                dist_matrix[i] = dist_matrix[i] / row_total
            else:
                # Isolated sentence: avoid 0/0 by spreading weight uniformly.
                dist_matrix[i] = 1.0 / n

        return dist_matrix

    def power_method(self, cosine_matrix, max_iterations=10000):
        """Iterate ``v <- v @ cosine_matrix`` from a uniform start vector.

        Stops when the L1 change between successive vectors drops below
        ``self.tolerance`` or after ``max_iterations`` steps, whichever comes
        first, and returns the last vector computed.
        """
        n = len(cosine_matrix)
        if n == 0:
            return np.zeros(shape=(0))

        eg_vec = np.full(n, 1.0 / n)
        # Bounded loop: a NaN matrix or an unreachable tolerance can no longer
        # spin forever.
        for _ in range(max_iterations):
            new_vec = np.matmul(eg_vec, cosine_matrix)
            update = np.abs(new_vec - eg_vec).sum()
            eg_vec = new_vec
            if update < self.tolerance:
                break

        return eg_vec
    
    

In [162]:
# Demo: score every sentence of a sample paragraph with the lexrank class.
lx = lexrank(.1,.001)  # threshold=.1, tolerance=.001 (positional order of lexrank.__init__)
lx.load_idf()  # reads the pickled idf object; must run before any similarity call
para="Trump was born and raised in the New York City borough of Queens. He received an economics degree from the Wharton School of the University of Pennsylvania and was appointed president of his family's real estate business in 1971, renamed it The Trump Organization, and expanded it from Queens and Brooklyn into Manhattan. The company built or renovated skyscrapers, hotels, casinos, and golf courses. Trump later started various side ventures, including licensing his name for real estate and consumer products. He managed the company until his 2017 inauguration. He co-authored several books, including The Art of the Deal. He owned the Miss Universe and Miss USA beauty pageants from 1996 to 2015, and he produced and hosted the reality television show, The Apprentice, from 2003 to 2015. Forbes estimates his net worth to be $3.1 billion."
sents=tb(para).sentences  # TextBlob sentence objects
mat=lx.cosine_matrix(sents)  # square matrix, one row/column per sentence
vec=lx.power_method(mat)  # one score per sentence, in sentence order

print(vec)

0.19880952380952377
0.0408248299319727
0.01654491955512366
0.012713142720240045
0.008656856914892347
0.006179059462448164
0.00436208885082412
0.003087349093607389
0.0021838015589810827
0.0015449090348856193
0.0010928936812334283
0.0007731369073480931
[0.08508642 0.19769732 0.11270465 0.08508642 0.11270465 0.14086027
 0.14086027 0.125     ]


In [4]:
# Load the pickled idf object used by the module-level idf() helper.
idf_path='idf_path.pkl'
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
# The handle is named idf_file so the builtin input() is not rebound for the
# rest of the kernel session.
with open(idf_path, 'rb') as idf_file:
    idfblob = pickle.load(idf_file)

In [5]:
def idf(word):
    """Return log(idfblob.size() / (1 + idfblob.getIdf(word))) for ``word``.

    Reads the module-level ``idfblob``.
    NOTE(review): presumably size() is the corpus size and getIdf() a
    document frequency -- confirm against the idf class.
    """
    total = idfblob.size()
    stored = idfblob.getIdf(word)
    ratio = total / (1 + stored)
    return math.log(ratio)

In [6]:
def sent_cosine(sentence1, sentence2):
    """Return the idf-weighted cosine similarity of two sentences.

    Both arguments must expose a ``.words`` iterable. Word weights come from
    the module-level ``idf`` function. Returns 0 when either sentence has a
    zero-length weighted vector (e.g. it has no words).
    """
    count_sent1 = Counter(sentence1.words)
    count_sent2 = Counter(sentence2.words)

    words = set(count_sent1)
    words.update(count_sent2)

    up = 0
    downright = 0
    downleft = 0
    for word in words:
        weight = idf(word)  # computed once per word instead of six times
        cnt1 = count_sent1[word]  # Counter yields 0 for absent words
        cnt2 = count_sent2[word]
        up += cnt1 * cnt2 * weight * weight
        downright += cnt1 * weight * cnt1 * weight
        downleft += cnt2 * weight * cnt2 * weight

    denominator = math.sqrt(downleft) * math.sqrt(downright)
    if denominator == 0:
        # Explicit guard instead of a bare `except:` that hid every error.
        return 0
    return up / denominator

In [7]:
#w,h=len(sentences), len(sentences)
def sent_to_matrix(sentences):
    """Return the symmetric matrix of pairwise ``sent_cosine`` similarities.

    Entry [i][j] (and [j][i]) holds the similarity of sentences i and j.
    An empty input yields a 0x0 array.
    """
    n = len(sentences)
    dist_matrix = np.zeros(shape=(n, n))
    # Only one triangle is evaluated; each value is mirrored. The argument
    # order (larger index first) reproduces the value the original full
    # double loop left in each cell.
    for i in range(n):
        for j in range(i + 1):
            dist = sent_cosine(sentences[i], sentences[j])
            dist_matrix[i][j] = dist
            dist_matrix[j][i] = dist
    return dist_matrix

In [165]:
def get_top_ranked_sentences(sentences, threshold, number_of_summarized_sentences):
    """Pick the most connected sentences and return them in document order.

    A sentence's degree is the number of entries in its similarity row
    (computed by ``sent_to_matrix`` on the lower-cased sentences) that are
    strictly greater than ``threshold``.

    Args:
        sentences: sequence of sentence objects exposing ``.lower()``.
        threshold: similarity above which two sentences count as connected.
        number_of_summarized_sentences: how many sentences to keep; values
            larger than ``len(sentences)`` are clamped, values <= 0 give [].

    Returns:
        list of the selected original sentences, ordered as they appear in
        ``sentences``. Ties in degree favour the earlier sentence.
    """
    sentences = list(sentences)
    keep = min(number_of_summarized_sentences, len(sentences))
    if keep <= 0:
        return []

    lower_sentences = [sentence.lower() for sentence in sentences]
    matrix = sent_to_matrix(lower_sentences)
    degrees = np.sum(matrix > threshold, axis=1)

    # Rank positions, not sentence objects: sentences and their indices stay
    # paired (the original sorted them in opposite directions), and ties never
    # fall back to comparing sentence text. sorted() is stable even with
    # reverse=True, so equal degrees keep document order.
    ranked = sorted(range(len(sentences)), key=lambda idx: degrees[idx], reverse=True)

    chosen = sorted(ranked[:keep])  # restore document order
    return [sentences[idx] for idx in chosen]

In [166]:
def summarize(para, number_of_summarized_sentences, threshold=.03):
    """Return an extractive summary of ``para``.

    Args:
        para: text to summarise; split into sentences with TextBlob.
        number_of_summarized_sentences: maximum number of sentences to keep
            (clamped to the number of sentences found).
        threshold: similarity threshold forwarded to
            ``get_top_ranked_sentences``; defaults to the previously
            hard-coded .03.

    Returns:
        str: the selected sentences joined by single spaces.
    """
    sentences = tb(para).sentences
    number_of_summarized_sentences = min(number_of_summarized_sentences, len(sentences))

    top_sentences = get_top_ranked_sentences(sentences, threshold, number_of_summarized_sentences)

    # Join with a space: plain concatenation glued sentences together
    # ("...Queens.He received...").
    return " ".join(str(sentence) for sentence in top_sentences)

In [167]:
# Sample paragraph (eight sentences) used as input for the summarize() demo.
para="Trump was born and raised in the New York City borough of Queens. He received an economics degree from the Wharton School of the University of Pennsylvania and was appointed president of his family's real estate business in 1971, renamed it The Trump Organization, and expanded it from Queens and Brooklyn into Manhattan. The company built or renovated skyscrapers, hotels, casinos, and golf courses. Trump later started various side ventures, including licensing his name for real estate and consumer products. He managed the company until his 2017 inauguration. He co-authored several books, including The Art of the Deal. He owned the Miss Universe and Miss USA beauty pageants from 1996 to 2015, and he produced and hosted the reality television show, The Apprentice, from 2003 to 2015. Forbes estimates his net worth to be $3.1 billion."

In [169]:
# Summarise the sample paragraph, keeping at most two sentences.
summarize(para, 2)

'Trump was born and raised in the New York City borough of Queens.'