In [1]:
# load main dictionaries for text summarization
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [2]:
# generate speech list
speech_list = ['John F. Kennedy_1961', 'John F. Kennedy_1962', 'John F. Kennedy_1963', 'Lyndon B. Johnson_1964', 'Lyndon B. Johnson_1965', 'Lyndon B. Johnson_1966', 'Lyndon B. Johnson_1967', 'Lyndon B. Johnson_1968', 'Lyndon B. Johnson_1969', 'Richard M. Nixon_1970', 'Richard M. Nixon_1971', 'Richard M. Nixon_1972', 'Richard M. Nixon_1974', 'Gerald Ford_1975', 'Gerald Ford_1976', 'Gerald Ford_1977', 'Jimmy Carter_1978', 'Jimmy Carter_1979', 'Jimmy Carter_1980', 'Ronald Reagan_1982', 'Ronald Reagan_1983', 'Ronald Reagan_1984', 'Ronald Reagan_1985', 'Ronald Reagan_1986', 'Ronald Reagan_1987', 'Ronald Reagan_1988', 'George H.W. Bush_1990', 'George H.W. Bush_1991', 'George H.W. Bush_1992', 'Bill Clinton_1994', 'Bill Clinton_1995', 'Bill Clinton_1996', 'Bill Clinton_1997', 'Bill Clinton_1998', 'Bill Clinton_1999', 'Bill Clinton_2000', 'George W. Bush_2002', 'George W. Bush_2003', 'George W. Bush_2004', 'George W. Bush_2005', 'George W. Bush_2006', 'George W. Bush_2007', 'George W. Bush_2008', 'Barack Obama_2010', 'Barack Obama_2011', 'Barack Obama_2012', 'Barack Obama_2013', 'Barack Obama_2014', 'Barack Obama_2015', 'Barack Obama_2016', 'Donald Trump_2018', 'Donald Trump_2019']

In [3]:
import pickle

# load all the transcripts again into a self-created dictionary called data 
data = {}
for i, c in enumerate(speech_list):
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [4]:
# look at the keys within data
data.keys()

dict_keys(['John F. Kennedy_1961', 'John F. Kennedy_1962', 'John F. Kennedy_1963', 'Lyndon B. Johnson_1964', 'Lyndon B. Johnson_1965', 'Lyndon B. Johnson_1966', 'Lyndon B. Johnson_1967', 'Lyndon B. Johnson_1968', 'Lyndon B. Johnson_1969', 'Richard M. Nixon_1970', 'Richard M. Nixon_1971', 'Richard M. Nixon_1972', 'Richard M. Nixon_1974', 'Gerald Ford_1975', 'Gerald Ford_1976', 'Gerald Ford_1977', 'Jimmy Carter_1978', 'Jimmy Carter_1979', 'Jimmy Carter_1980', 'Ronald Reagan_1982', 'Ronald Reagan_1983', 'Ronald Reagan_1984', 'Ronald Reagan_1985', 'Ronald Reagan_1986', 'Ronald Reagan_1987', 'Ronald Reagan_1988', 'George H.W. Bush_1990', 'George H.W. Bush_1991', 'George H.W. Bush_1992', 'Bill Clinton_1994', 'Bill Clinton_1995', 'Bill Clinton_1996', 'Bill Clinton_1997', 'Bill Clinton_1998', 'Bill Clinton_1999', 'Bill Clinton_2000', 'George W. Bush_2002', 'George W. Bush_2003', 'George W. Bush_2004', 'George W. Bush_2005', 'George W. Bush_2006', 'George W. Bush_2007', 'George W. Bush_2008', '

In [5]:
# Check the first 3 lines of George H.W. Bush 1990 speech to confirm everything is correct
data['George W. Bush_2002'][:3]

['Thank you very much. Mr. Speaker, Vice President Cheney, members of Congress, distinguished guests, fellow citizens:',
 'As we gather tonight, our nation is at war; our economy is in recession; and the civilized world faces unprecedented dangers. Yet, the state of our Union has never been stronger.',
 "We last met in an hour of shock and suffering. In four short months, our nation has comforted the victims, begun to rebuild New York and the Pentagon, rallied a great coalition, captured, arrested, and rid the world of thousands of terrorists, destroyed Afghanistan's terrorist training camps, saved a people from starvation, and freed a country from brutal oppression."]

In [6]:
##import re
##def clean_text_round2(text):
##    text = re.sub('[‘’“”…]', '', text)
##    text = re.sub('\n', '', text)
##    text = re.sub('\r','',text)
##    text = re.sub('\u2011','',text)
##    return text

## round2 = lambda x: clean_text_round2(x)

In [7]:
## data['Barack Obama_2014'] = clean_text_round2(str(data['Barack Obama_2014']))

In [8]:
# save files we'd like to summarize as txt format
file1 = open("myfile.txt","w") 
L = data['George W. Bush_2002']
  
# \n is placed to indicate EOL (End of Line) 
file1.writelines(L) 
file1.close() #to change file access modes 

In [9]:
def read_article(file_name):
    file = open(file_name, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    sentences = []

    for sentence in article:
        print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    
    return sentences

In [10]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

In [11]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

In [12]:
def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_article(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize texr
    print("Summarize Text: \n", ". ".join(summarize_text))

In [13]:
generate_summary('myfile.txt',2)

Thank you very much
Mr
Speaker, Vice President Cheney, members of Congress, distinguished guests, fellow citizens:As we gather tonight, our nation is at war; our economy is in recession; and the civilized world faces unprecedented dangers
Yet, the state of our Union has never been stronger.We last met in an hour of shock and suffering
In four short months, our nation has comforted the victims, begun to rebuild New York and the Pentagon, rallied a great coalition, captured, arrested, and rid the world of thousands of terrorists, destroyed Afghanistan's terrorist training camps, saved a people from starvation, and freed a country from brutal oppression.The American flag flies again over our embassy in Kabul
Terrorists who once occupied Afghanistan now occupy cells at Guantanamo Bay
And terrorist leaders who urged followers to sacrifice their lives are running for their own.America and Afghanistan are now allies against terror
We'll be partners in rebuilding that country
And this evening 

Summarize Text: 
 History has called America and our allies to action, and it is both our responsibility and our privilege to fight freedom's fight.Our first priority must always be the security of our nation, and that will be reflected in the budget I send to Congress. And make no mistake about it: If they do not act, America will.Our second goal is to prevent regimes that sponsor terror from threatening America or our friends and allies with weapons of mass destruction
