In [25]:
import import_ipynb
from news_classes import *
import math
from textblob import TextBlob as tb
import os
import pickle
from collections import Counter
from textblob import TextBlob as tb
import numpy as np

In [28]:
class lexrank:
    """Extractive summarizer that ranks sentences on a similarity graph.

    Sentences are compared with an idf-weighted cosine similarity, the
    similarities are thresholded into a 0/1 adjacency matrix, each row is
    normalised into a probability distribution, and the stationary vector
    obtained by power iteration is used as the sentence score.

    Attributes:
        idf_path: Path of the pickled idf object loaded by ``load_idf``.
        threshold: Minimum cosine similarity for two sentences to be linked.
        tolerance: L1 change below which the power iteration stops.
        idfblob: Unpickled object; this class only calls ``size()`` and
            ``getIdf(word)`` on it.
    """

    def __init__(self, threshold, tolerance):
        """Store the hyper-parameters and load the idf object from disk.

        Args:
            threshold: Similarity cut-off used by ``cosine_matrix``.
            tolerance: Convergence tolerance used by ``power_method``.
        """
        self.idf_path = "files/idf_today.pkl"
        self.tolerance = tolerance
        self.threshold = threshold
        self.load_idf()

    def load_idf(self):
        """(Re)load the pickled idf object from ``self.idf_path``."""
        # SECURITY: pickle.load can execute arbitrary code; only point
        # idf_path at files produced by a trusted process.
        with open(self.idf_path, 'rb') as idf_file:
            self.idfblob = pickle.load(idf_file)

    def get_idf(self, word):
        """Return ``log(size / (1 + getIdf(word)))`` for ``word``.

        NOTE(review): presumably ``size()`` is the number of documents and
        ``getIdf(word)`` the document frequency of ``word`` -- confirm
        against the class that produced the pickle.
        """
        return math.log(self.idfblob.size() / (1 + self.idfblob.getIdf(word)))

    def cosine_distance(self, sentence1, sentence2):
        """Return the idf-weighted cosine similarity of two sentences.

        Despite the name this is a similarity: larger means more alike.

        Args:
            sentence1: Object exposing an iterable ``words`` attribute.
            sentence2: Object exposing an iterable ``words`` attribute.

        Returns:
            The similarity, or 0 when either sentence has a zero-norm
            vector (for example when it contains no words).
        """
        count_sent1 = Counter(sentence1.words)
        count_sent2 = Counter(sentence2.words)

        dot = 0
        norm1_sq = 0
        norm2_sq = 0
        for word in set(count_sent1) | set(count_sent2):
            # Counter returns 0 for a word missing from one sentence.
            cnt1 = count_sent1[word]
            cnt2 = count_sent2[word]
            idf_sq = self.get_idf(word) ** 2  # looked up once per word

            dot += cnt1 * cnt2 * idf_sq
            norm1_sq += cnt1 * cnt1 * idf_sq
            norm2_sq += cnt2 * cnt2 * idf_sq

        denominator = math.sqrt(norm1_sq) * math.sqrt(norm2_sq)
        if denominator == 0:
            # Explicit guard instead of a bare ``except`` around the division.
            return 0
        return dot / denominator

    def cosine_matrix(self, sentences):
        """Build the row-normalised, thresholded similarity matrix.

        Args:
            sentences: Sequence of objects accepted by ``cosine_distance``.

        Returns:
            An ``(n, n)`` numpy array whose rows each sum to 1. Entry
            ``[i][j]`` is non-zero when the similarity of sentences ``i``
            and ``j`` is at least ``self.threshold``.
        """
        n = len(sentences)
        dist_matrix = np.zeros(shape=(n, n))
        for i in range(n):
            # The matrix is symmetric, so each pair is evaluated only once.
            for j in range(i, n):
                similarity = self.cosine_distance(sentences[i], sentences[j])
                linked = 1 if similarity >= self.threshold else 0
                dist_matrix[i][j] = linked
                dist_matrix[j][i] = linked

        row_sums = dist_matrix.sum(axis=1, keepdims=True)
        # A sentence linked to nothing (not even itself) would give 0/0 = NaN
        # and poison the power iteration; give such rows a uniform
        # distribution instead.
        dangling = row_sums[:, 0] == 0
        dist_matrix[dangling] = 1.0
        row_sums[dangling] = n

        return dist_matrix / row_sums

    def power_method(self, cosine_matrix, max_iterations=100):
        """Approximate the stationary vector of a row-stochastic matrix.

        Starts from the uniform vector and repeatedly left-multiplies it
        by ``cosine_matrix``.

        Args:
            cosine_matrix: Square matrix as produced by ``cosine_matrix``.
            max_iterations: Upper bound on the number of iterations.

        Returns:
            A 1-D numpy array with one score per row of the matrix.
        """
        n = len(cosine_matrix)
        # ones/n (rather than a scalar 1/n) keeps n == 0 from raising.
        eg_vec = np.ones(shape=(n)) / n

        for _ in range(max_iterations):
            new_vec = np.matmul(eg_vec, cosine_matrix)
            update = np.absolute(new_vec - eg_vec).sum()
            eg_vec = new_vec

            if update < self.tolerance:
                break

        return eg_vec

    def rank_sentences(self, sentences, number_of_summarized_sentences):
        """Pick the top-scoring sentences and return them in document order.

        The first sentence's score is pinned to 1, which places it at the
        top of the ranking. Ties between scores are broken in favour of
        the sentence that appears earlier.

        Args:
            sentences: Sequence of objects exposing ``lower()`` whose
                result is accepted by ``cosine_distance``.
            number_of_summarized_sentences: How many sentences to keep;
                clamped to ``[0, len(sentences)]``.

        Returns:
            A list of the selected items of ``sentences`` in their
            original relative order.
        """
        total = len(sentences)
        if total == 0:
            return []

        lower_sentences = [sentence.lower() for sentence in sentences]
        matrix = self.cosine_matrix(lower_sentences)

        scores = self.power_method(matrix)
        scores[0] = 1  # always rank the opening sentence first

        keep = max(0, min(number_of_summarized_sentences, total))

        # Rank positions, not (score, sentence) pairs: sorting sentences and
        # indices separately lets tied scores pair a sentence with the wrong
        # index.
        ranked = sorted(range(total), key=lambda idx: (-scores[idx], idx))
        chosen = sorted(ranked[:keep])
        return [sentences[idx] for idx in chosen]

    def summarize(self, para, number_of_summarized_sentences=5):
        """Summarise ``para`` by extracting its highest-ranked sentences.

        Args:
            para: Text to summarise; split into sentences with TextBlob.
            number_of_summarized_sentences: Maximum number of sentences
                to keep.

        Returns:
            The selected sentences in document order, each followed by a
            single space (empty string when ``para`` has no sentences).
        """
        sentences = tb(para).sentences

        number_of_summarized_sentences = min(number_of_summarized_sentences, len(sentences))

        sentences = self.rank_sentences(sentences, number_of_summarized_sentences)

        return "".join(str(sentence) + " " for sentence in sentences)

In [29]:
# Hyper-parameters for this demo run.
SIMILARITY_THRESHOLD = 0.1     # cosine similarity at or above this links two sentences
CONVERGENCE_TOLERANCE = 0.001  # power iteration stops once the L1 change drops below this

# The constructor already loads the idf pickle, so no separate load_idf() call is needed.
lx = lexrank(SIMILARITY_THRESHOLD, CONVERGENCE_TOLERANCE)

para="Trump was born and raised in the New York City borough of Queens. He received an economics degree from the Wharton School of the University of Pennsylvania and was appointed president of his family's real estate business in 1971, renamed it The Trump Organization, and expanded it from Queens and Brooklyn into Manhattan. The company built or renovated skyscrapers, hotels, casinos, and golf courses. Trump later started various side ventures, including licensing his name for real estate and consumer products. He managed the company until his 2017 inauguration. He co-authored several books, including The Art of the Deal. He owned the Miss Universe and Miss USA beauty pageants from 1996 to 2015, and he produced and hosted the reality television show, The Apprentice, from 2003 to 2015. Forbes estimates his net worth to be $3.1 billion."

# Last expression of the cell: the summary is rendered as the cell output.
lx.summarize(para)

"Trump was born and raised in the New York City borough of Queens. He received an economics degree from the Wharton School of the University of Pennsylvania and was appointed president of his family's real estate business in 1971, renamed it The Trump Organization, and expanded it from Queens and Brooklyn into Manhattan. The company built or renovated skyscrapers, hotels, casinos, and golf courses. "