# Extractive Summarizer

In [85]:
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [84]:
# Module-level English stop-word list, used only by the ad-hoc test cells
# below; generate_summary builds its own copy. To be removed later.
stop_words = stopwords.words('english')

In [11]:
def read_func(filename):
    '''
    Read a UTF-8 text file and tokenize it into sentences of words.

    The lines of the file are joined with single spaces, split into
    sentences with nltk.sent_tokenize, and every sentence is then split
    on single spaces.

    Parameters:
        filename: path of the text file to read.

    Returns:
        A list with one entry per sentence; each entry is the list of
        that sentence's words. A single trailing '.', '!' or '?' is removed
        from the last word of every sentence.
    '''
    # 'with' guarantees the handle is closed, even if reading fails
    with open(filename, 'r', encoding='utf-8') as file:
        para = ' '.join(file.readlines()) # join all lines together

    # one list of words per sentence
    sentences = [line.split(" ") for line in nltk.sent_tokenize(para)]

    for words in sentences:
        # Only drop real sentence-ending punctuation. Chopping the last
        # character unconditionally would eat a letter whenever a sentence
        # (e.g. the last one in the file) has no terminator.
        # str.split(" ") never returns an empty list, so words[-1] is safe.
        if words[-1].endswith(('.', '!', '?')):
            words[-1] = words[-1][:-1]
    return sentences

In [14]:
# Tokenize the sample article into a list of word lists (one per sentence)
sentences = read_func('trump.txt')

In [15]:
# Display the tokenized sentences to inspect read_func's output
sentences

[['WASHINGTON',
  '-',
  'The',
  'Trump',
  'administration',
  'has',
  'ordered',
  'the',
  'military',
  'to',
  'start',
  'withdrawing',
  'roughly',
  '7,000',
  'troops',
  'from',
  'Afghanistan',
  'in',
  'the',
  'coming',
  'months,',
  'two',
  'defense',
  'officials',
  'said',
  'Thursday,',
  'an',
  'abrupt',
  'shift',
  'in',
  'the',
  '17-year-old',
  'war',
  'there',
  'and',
  'a',
  'decision',
  'that',
  'stunned',
  'Afghan',
  'officials,',
  'who',
  'said',
  'they',
  'had',
  'not',
  'been',
  'briefed',
  'on',
  'the',
  'plans'],
 ['President',
  'Trump',
  'made',
  'the',
  'decision',
  'to',
  'pull',
  'the',
  'troops',
  '-',
  'about',
  'half',
  'the',
  'number',
  'the',
  'United',
  'States',
  'has',
  'in',
  'Afghanistan',
  'now',
  '-',
  'at',
  'the',
  'same',
  'time',
  'he',
  'decided',
  'to',
  'pull',
  'American',
  'forces',
  'out',
  'of',
  'Syria,',
  'one',
  'official',
  'said.The',
  'announcement',
  'came'

In [66]:
def sentence_similarity(sent1, sent2, stopwords=None):
    '''
    Compute the cosine similarity between two tokenized sentences.

    Both sentences are lower-cased and stripped of stop words, turned into
    word-count vectors over their combined vocabulary, and compared with
    1 - cosine_distance.

    Parameters:
        sent1, sent2: sentences given as lists of words.
        stopwords: optional iterable of lower-case words to ignore
                   (e.g. the nltk stop-word list). None means no filtering.

    Returns:
        The cosine similarity of the two count vectors, or 0.0 when either
        sentence has no words left after stop-word removal.
    '''
    # A set gives O(1) membership tests; None replaces the mutable [] default
    stop_set = set(stopwords) if stopwords else set()

    # converted to lower case and removed stop words
    words1 = [word.lower() for word in sent1 if word.lower() not in stop_set]
    words2 = [word.lower() for word in sent2 if word.lower() not in stop_set]

    # An all-zero vector would make the cosine computation divide 0 by 0;
    # treat a sentence with no content words as similar to nothing.
    if not words1 or not words2:
        return 0.0

    # Map every distinct word to its vector position once, instead of a
    # linear list.index() search per word.
    positions = {word: i for i, word in enumerate(set(words1 + words2))}

    vector1 = [0] * len(positions)
    vector2 = [0] * len(positions)

    # build vector for first sentence
    for word in words1:
        vector1[positions[word]] += 1

    # build vector for second sentence
    for word in words2:
        vector2[positions[word]] += 1

    return 1 - cosine_distance(vector1, vector2)

In [68]:
# Spot check: similarity of the fourth and first sentences, ignoring stop words
sentence_similarity(sentences[3],sentences[0],stopwords=stop_words)

0.21437323142813602

In [82]:
def build_similarity_matrix(sentences, stopwords = None):
    '''
    Build the pairwise similarity matrix for a list of tokenized sentences.

    Parameters:
        sentences: list of sentences, each given as a list of words.
        stopwords: optional iterable of words to ignore when comparing
                   sentences. None means no filtering.

    Returns:
        A square numpy array where entry [i][j] is the value of
        sentence_similarity for sentences i and j. The diagonal stays 0.
    '''
    # Build the stop-word set once; None replaces the mutable [] default
    stop_set = set(stopwords) if stopwords is not None else set()

    count = len(sentences)
    similarity_matrix = np.zeros((count, count)) # starts as all zeros

    for index_1 in range(count):
        # Cosine similarity is symmetric, so compute each pair once (upper
        # triangle only) and mirror it; the diagonal is never visited.
        for index_2 in range(index_1 + 1, count):
            score = sentence_similarity(sentences[index_1], sentences[index_2], stop_set)
            similarity_matrix[index_1][index_2] = score
            similarity_matrix[index_2][index_1] = score

    return similarity_matrix

In [83]:
# Spot check: full pairwise similarity matrix for the sample article
build_similarity_matrix(sentences,stop_words)

array([[0.        , 0.18371173, 0.09095086, 0.21437323, 0.15990054,
        0.21740659, 0.08964215, 0.14173668],
       [0.18371173, 0.        , 0.0792118 , 0.14002801, 0.15666989,
        0.20655911, 0.13662601, 0.03086067],
       [0.09095086, 0.0792118 , 0.        , 0.04159452, 0.05170877,
        0.13421887, 0.        , 0.13750477],
       [0.21437323, 0.14002801, 0.04159452, 0.        , 0.03656362,
        0.16269784, 0.08199201, 0.12964074],
       [0.15990054, 0.15666989, 0.05170877, 0.03656362, 0.        ,
        0.08427498, 0.        , 0.        ],
       [0.21740659, 0.20655911, 0.13421887, 0.16269784, 0.08427498,
        0.        , 0.17008401, 0.23904572],
       [0.08964215, 0.13662601, 0.        , 0.08199201, 0.        ,
        0.17008401, 0.        , 0.13552619],
       [0.14173668, 0.03086067, 0.13750477, 0.12964074, 0.        ,
        0.23904572, 0.13552619, 0.        ]])

In [142]:
def generate_summary(file_name, num_of_lines = 5):
    '''
    Print an extractive summary of a text file.

    Sentences are read with read_func, compared pairwise with
    build_similarity_matrix, and ranked by running networkx PageRank on the
    graph built from that matrix. The top-ranked sentences are printed in
    rank order, highest score first.

    Parameters:
        file_name: path of the text file to summarize.
        num_of_lines: maximum number of sentences in the summary. If the
                      text has fewer sentences, all of them are used.

    Returns:
        None. The summary is printed after a 'Summarized Text:' header.
    '''
    stop_words = stopwords.words('english')

    # reads file and returns list of list of words of a sentence
    sentences = read_func(file_name)

    # builds a 2d matrix of cosine similarity between all sentences
    similarity_matrix = build_similarity_matrix(sentences,stop_words)

    # score every sentence (graph node) with PageRank
    sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # highest score first
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

    # Clamp the request so a short (or empty) text cannot index past the
    # end of the ranking; a negative request yields no sentences.
    line_count = max(0, min(num_of_lines, len(ranked_sentence)))
    summarize_text = [" ".join(words) for _, words in ranked_sentence[:line_count]]

    # the trailing empty item makes the join end the last sentence with ". "
    summarize_text.append('')

    print('Summarized Text:')
    summary = ". ".join(summarize_text)
    print(summary)

In [148]:
# Demo: summarize sample_text_1.txt with its 4 top-ranked sentences
generate_summary('sample_text_1.txt',4)

Summarized Text:
Ashwin said the news of Hemil’s death reached them on February 23. “We humbly request our government to talk to the Russian authorities and get the body of my son to his home town Surat. He left for Russia on December 14, 2023 and remained in contact with us regularly,” said his father. A family source said Hemil left studies after Class 12 and started a small embroidery business with his cousins. 


In [146]:
# Demo: summarize sample_text_2.txt with its 4 top-ranked sentences
generate_summary('sample_text_2.txt',4)

Summarized Text:
