In [1]:
import pandas as pd
df = pd.read_pickle('corpus.pkl')

In [2]:
df = df.reset_index(drop = True)
df = df.iloc[:,[1,0]]
df.columns = ['article_id','article_text']

In [3]:
df.head()

Unnamed: 0,article_id,article_text
0,Barack Obama_2010,"Madam Speaker, Vice President Biden, Members o..."
1,Barack Obama_2011,"Mr. Speaker, Mr. Vice President, members of Co..."
2,Barack Obama_2012,"Mr. Speaker, Mr. Vice President, members of C..."
3,Barack Obama_2013,"\r\nMr. Speaker, Mr. Vice President, members o..."
4,Barack Obama_2014,"Mr. Speaker, Mr. Vice President, Members of Co..."


In [19]:
# only summarize Obama's speeches in 2010, 2011 and 2012
df = df[:3]

In [22]:
df = df[df['article_id'] == 'Barack Obama_2011']

In [23]:
from nltk.tokenize import sent_tokenize
sentences = []
for s in df['article_text']:
  sentences.append(sent_tokenize(s))

sentences = [y for x in sentences for y in x] # flatten list

In [24]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [28]:
# Extract word vectors
# download glove here https://nlp.stanford.edu/projects/glove/ if !wget doesn't work
import numpy as np
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [29]:
len(word_embeddings)

400000

In [30]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [31]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alanl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [33]:
# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [34]:
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [35]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [36]:
sentence_vectors = []
for i in clean_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

In [37]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [40]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [41]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [42]:
# Extract top 10 sentences as the summary
for i in range(10):
  print(ranked_sentences[i][1])

As Kathy said, “I hope it tells them to never give up.”

      If we take these steps -– if we raise expectations for every child, and give them the best possible chance at an education, from the day they are born until the last job they take –- we will reach the goal that I set two years ago:  By the end of the decade, America will once again have the highest proportion of college graduates in the world.
We’re a nation that says, “I might not have a lot of money, but I have this great idea for a new company.”  “I might not come from a family of college graduates, but I will be the first to get my degree.”  “I might not know those people in trouble, but I think I can help them, and I need to try.”  “I’m not sure how we’ll reach that better place beyond the horizon, but I know we’ll get there.
Thirty years ago, we couldn’t know that something called the Internet would lead to an economic revolution.
Amid all the noise and passion and rancor of our public debate, Tucson reminded us that