In [2]:
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.cluster.util import cosine_distance
import networkx as nx

In [3]:
def read_article(file_name):
    """Read a text file and return its sentences as lists of word tokens.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one entry per sentence found by ``sent_tokenize``; each
        entry is the list of tokens ``word_tokenize`` produced for that
        sentence.
    """
    # The context manager closes the file handle even if reading fails.
    with open(file_name, 'r') as file:
        filedata = file.read()

    return [word_tokenize(sentence) for sentence in sent_tokenize(filedata)]
#     filedata = file.readlines()
#     article = filedata[0].split(". ")
#     sentences = []
#     for sentence in article:
#         sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
#     sentences.pop()
#     return sentences

In [4]:
def sentence_similarity(sent1, sent2, stop_words):
    """Return the cosine similarity of two tokenized sentences.

    Tokens are lower-cased, tokens found in ``stop_words`` are ignored, and
    the remaining tokens are counted into two bag-of-words vectors over the
    combined vocabulary of both sentences.

    Args:
        sent1: Iterable of word tokens (strings) for the first sentence.
        sent2: Iterable of word tokens (strings) for the second sentence.
        stop_words: Iterable of tokens to exclude from the comparison.

    Returns:
        The cosine similarity as a float. ``0.0`` is returned when either
        sentence has no countable tokens (empty, or stop words only), because
        the cosine is undefined for a zero vector and would otherwise be NaN.
    """
    # Set membership is O(1); the caller typically passes a list.
    stop_words = set(stop_words)
    tokens1 = [w.lower() for w in sent1]
    tokens2 = [w.lower() for w in sent2]

    # Fixed vector position for every distinct token (replaces list.index scans).
    position = {word: idx for idx, word in enumerate(set(tokens1 + tokens2))}

    vector1 = np.zeros(len(position))
    vector2 = np.zeros(len(position))
    for w in tokens1:
        if w not in stop_words:
            vector1[position[w]] += 1
    for w in tokens2:
        if w not in stop_words:
            vector2[position[w]] += 1

    norm_product = np.sqrt(np.dot(vector1, vector1)) * np.sqrt(np.dot(vector2, vector2))
    if norm_product == 0:
        # At least one vector is all zeros: avoid the 0/0 division.
        return 0.0

    return float(np.dot(vector1, vector2) / norm_product)

In [5]:
def gen_sim_matrix(sentences, stop_words):
    """Build the pairwise similarity matrix for a list of tokenized sentences.

    Args:
        sentences: List of sentences, each a list of word tokens.
        stop_words: Stop words forwarded to ``sentence_similarity``.

    Returns:
        A square numpy array with one row and column per sentence. Entry
        ``[i][j]`` holds ``sentence_similarity`` of sentences ``i`` and ``j``;
        the diagonal is left at zero.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(first, second, stop_words)

    return matrix

In [6]:
def generate_summary(file_name, top_n=5):
    """Print an extractive summary of the text file ``file_name``.

    The article is tokenized with ``read_article``, a sentence similarity
    matrix is built with ``gen_sim_matrix``, the matrix is turned into a graph
    and scored with ``nx.pagerank``, and the highest-scoring sentences are
    joined into the summary.

    Args:
        file_name: Path of the text file to summarize.
        top_n: Maximum number of sentences to include. If the article has
            fewer sentences, all of them are used.

    Returns:
        None. The ranked (score, tokens) pairs and the summary are printed.
    """
    stop_words = stopwords.words('english')
    sentences = read_article(file_name)
    sentence_similarity_matrix = gen_sim_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    print(ranked_sentences)

    # Slice instead of indexing so a short article cannot raise IndexError;
    # max() keeps a negative top_n selecting nothing, as range(top_n) did.
    top_ranked = ranked_sentences[:max(top_n, 0)]
    summarized_text = [" ".join(tokens) for _, tokens in top_ranked]
    print("Summary: \n", " ".join(summarized_text))

In [8]:
# Summarize the sample article, keeping at most the five top-ranked sentences.
generate_summary("SampleData//sample_text.txt", top_n=5)

[(0.057152823126770294, ['Research', 'in', 'AI', 'has', 'focused', 'chiefly', 'on', 'the', 'following', 'components', 'of', 'intelligence', ':', 'learning', ',', 'reasoning', ',', 'problem', 'solving', ',', 'perception', ',', 'and', 'using', 'language', '.']), (0.05212861292740548, ['Artificial', 'intelligence', 'systems', 'powered', 'by', 'machine', 'learning', 'enable', 'companies', 'to', 'leverage', 'large', 'amounts', 'of', 'available', 'data', 'to', 'uncover', 'insights', 'and', 'patterns', 'that', 'would', 'be', 'impossible', 'for', 'any', 'one', 'person', 'to', 'identify', ',', 'enabling', 'them', 'to', 'deliver', 'more', 'targeted', ',', 'personalized', 'communications', ',', 'predict', 'critical', 'care', 'events', ',', 'identify', 'likely', 'fraudulent', 'transactions', ',', 'and', 'more', '.']), (0.05110615621375527, ['When', 'the', 'female', 'wasp', 'returns', 'to', 'her', 'burrow', 'with', 'food', ',', 'she', 'first', 'deposits', 'it', 'on', 'the', 'threshold', ',', 'check