In [44]:
import pandas as pd
df = pd.read_pickle('corpus.pkl')

In [45]:
df = df.reset_index(drop = True)
df = df.iloc[:,[1,0]]
df.columns = ['article_id','article_text']

In [46]:
df.head()

Unnamed: 0,article_id,article_text
0,Barack Obama_2010,"Madam Speaker, Vice President Biden, Members o..."
1,Barack Obama_2011,"Mr. Speaker, Mr. Vice President, members of Co..."
2,Barack Obama_2012,"Mr. Speaker, Mr. Vice President, members of C..."
3,Barack Obama_2013,"\r\nMr. Speaker, Mr. Vice President, members o..."
4,Barack Obama_2014,"Mr. Speaker, Mr. Vice President, Members of Co..."


#### Summarize one single President Obama Speech

In [47]:
# select one single article from our list above
df = df[df['article_id'] == 'Barack Obama_2011']

In [5]:
from nltk.tokenize import sent_tokenize
sentences = []
for s in df['article_text']:
  sentences.append(sent_tokenize(s))

sentences = [y for x in sentences for y in x] # flatten list

In [6]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
# Extract word vectors
# download glove here https://nlp.stanford.edu/projects/glove/ if !wget doesn't work
import numpy as np
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [8]:
len(word_embeddings)

400000

In [9]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alanl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [12]:
# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [13]:
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [14]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [15]:
sentence_vectors = []
for i in clean_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

In [16]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [19]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [20]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [21]:
# Extract top 3 sentences as the summary
for i in range(3):
  print(ranked_sentences[i][1])

As Kathy said, “I hope it tells them to never give up.”

      If we take these steps -– if we raise expectations for every child, and give them the best possible chance at an education, from the day they are born until the last job they take –- we will reach the goal that I set two years ago:  By the end of the decade, America will once again have the highest proportion of college graduates in the world.
We’re a nation that says, “I might not have a lot of money, but I have this great idea for a new company.”  “I might not come from a family of college graduates, but I will be the first to get my degree.”  “I might not know those people in trouble, but I think I can help them, and I need to try.”  “I’m not sure how we’ll reach that better place beyond the horizon, but I know we’ll get there.
Thirty years ago, we couldn’t know that something called the Internet would lead to an economic revolution.


#### Summarize All President Trump State of Union Address speeches

In [80]:
df = pd.read_pickle('corpus.pkl')
df = df.reset_index(drop = True)
df = df.iloc[:,[1,0]]
df.columns = ['article_id','article_text']
df

Unnamed: 0,article_id,article_text
0,Barack Obama_2010,"Madam Speaker, Vice President Biden, Members o..."
1,Barack Obama_2011,"Mr. Speaker, Mr. Vice President, members of Co..."
2,Barack Obama_2012,"Mr. Speaker, Mr. Vice President, members of C..."
3,Barack Obama_2013,"\r\nMr. Speaker, Mr. Vice President, members o..."
4,Barack Obama_2014,"Mr. Speaker, Mr. Vice President, Members of Co..."
5,Barack Obama_2015,"Mr. Speaker, Mr. Vice President, Members of Co..."
6,Barack Obama_2016,"Mr. Speaker, Mr. Vice President, Members of Co..."
7,Bill Clinton_1994,"Thank you very much. Mr. Speaker, Mr. Presiden..."
8,Bill Clinton_1995,"Mr. President, Mr. Speaker, members of the 104..."
9,Bill Clinton_1996,"Mr. Speaker, Mr. Vice President, members of th..."


In [81]:
df = df[df['article_id'].str.contains("Donald Trump")]

In [82]:
df

Unnamed: 0,article_id,article_text
14,Donald Trump_2018,"Mr. Speaker, Mr. Vice President, Members of Co..."
15,Donald Trump_2019,"Madam Speaker, Mr. Vice President, Members of ..."


In [25]:
sentences = []
for s in df['article_text']:
  sentences.append(sent_tokenize(s))

sentences = [y for x in sentences for y in x] # flatten list

In [26]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [27]:
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [28]:
sentence_vectors = []
for i in clean_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

In [29]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [30]:
for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [31]:
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [32]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [33]:
# Extract top 3 sentences as the summary
for i in range(3):
  print(ranked_sentences[i][1])

Each day since, we have gone forward with a clear vision and a righteous mission—to make America great again for all Americans.
It is time to begin moving towards a merit-based immigration system—one that admits people who are skilled, who want to work, who will contribute to our society, and who will love and respect our country.
We do not know whether we will achieve an agreement—but we do know that after two decades of war, the hour has come to at least try for peace.


#### Turn this into a function

In [39]:
# build the text summarization function
def summarize_text(x_df):
    # extract the sentences
    sentences = []
    for s in x_df['article_text']:
        sentences.append(sent_tokenize(s))
    sentences = [y for x in sentences for y in x]
    
    # clean the data
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")
    clean_sentences = [s.lower() for s in clean_sentences]
    clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]    
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
        
    # similarity matrix
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

    # Extract top 3 sentences as the summary
    textbox = []
    for i in range(3):
        textbox.append(ranked_sentences[i][1])
    return(textbox)
    

In [40]:
summarize_text(df)

['Each day since, we have gone forward with a clear vision and a righteous mission—to make America great again for all Americans.',
 'It is time to begin moving towards a merit-based immigration system—one that admits people who are skilled, who want to work, who will contribute to our society, and who will love and respect our country.',
 'We do not know whether we will achieve an agreement—but we do know that after two decades of war, the hour has come to at least try for peace.']