In [7]:
import wikipedia
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import networkx as nx
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [153]:
frmWiki = wikipedia.WikipediaPage('History_of_Western_civilization')

In [154]:
wikiText = frmWiki.content

# Rank Sentences

In [155]:
# We want to use the standard english-language parser.
parser = spacy.load('en')

# Parsing text.
text = parser(wikiText)

# Dividing the text into sentences and storing them as a list of strings.
sentences=[]
for span in text.sents:
    # go from the start to the end of each span, returning each token in the sentence
    # combine each token using join()
    sent = ''.join(text[i].string for i in range(span.start, span.end)).strip()
    sentences.append(sent)

# Creating the tf-idf matrix.
counter = TfidfVectorizer(lowercase=False, 
                          stop_words=None,
                          ngram_range=(1, 1), 
                          analyzer=u'word', 
                          max_df=.5, 
                          min_df=1,
                          max_features=None, 
                          vocabulary=None, 
                          binary=False)

#Applying the vectorizer
data_counts=counter.fit_transform(sentences)

In [156]:
# Calculating similarity
similarity = data_counts * data_counts.T

# Identifying the sentence with the highest rank.
nx_graph = nx.from_scipy_sparse_matrix(similarity)
ranks=nx.pagerank(nx_graph, alpha=.85, tol=.00000001)

rankedSents = sorted(((ranks[i],s) for i,s in enumerate(sentences)),
                reverse=True)
print(rankedSents[:3])


[(0.0015664193893260758, 'In Asia, China was defeated by Britain in the Opium War and later Britain and France in the Arrow War, forcing it to open up to trade with the West.'), (0.0014732242104492047, "Canada was unique in the British Empire in that it had a French-speaking province, Quebec, which Britain had gained rule over in the Seven Years' War.\n\n\n="), (0.0014102954682504935, 'The major Western players in this New Imperialism were Britain, Russia, France, Germany, Italy, and the United States.')]


# Rank Words

In [165]:
# final list of tokens
words = []
# used to make sure there are not duplicates
wordTracker = []
# Removing stop words and punctuation, 
# then getting a list of all unique words in the text
for word in text: 
    if (
        word.is_stop==False 
        and (word.pos_=='NOUN' or word.pos_=='ADJ') 
        and (word.text not in wordTracker)
        and (len(word.text) > 3)
    ):
        wordTracker.append(word.text)
        words.append(word)

In [166]:
#Creating a grid indicating whether words are within 4 places of the target word
adjacency=pd.DataFrame(columns=words,index=words,data=0)

#Iterating through each word in the text and indicating which of the unique words are its neighbors
for i,word in enumerate(text): 
    word=word
    # Checking if any of the word's next four neighbors are in the word list 
    if any([word == item for item in text_filt]):
        # Making sure to stop at the end of the string, even if there are less than four words left after the target.
        end=max(0,len(text)-(len(text)-(i+5)))
        # The potential neighbors.
        nextwords=text[i+1:end]
        # Filtering the neighbors to select only those in the word list
        inset=[x in text_filt for x in nextwords]
        neighbors=[nextwords[i] for i in range(len(nextwords)) if inset[i]]
        # Adding 1 to the adjacency matrix for neighbors of the target word
        if neighbors:
            adjacency.loc[word,neighbors]=adjacency.loc[word,neighbors]+1

print('done!')

done!


In [167]:
# Running TextRank
nx_words = nx.from_numpy_matrix(adjacency.as_matrix())
#nx_words = nx.from_numpy_matrix(adjacency.values())
ranks=nx.pagerank(nx_words, alpha=.85, tol=.00000001)

# Identifying the five most highly ranked keywords
rankedWrds = sorted(((ranks[i],s) for i,s in enumerate(words)),
                reverse=True)
print(rankedWrds[:5])

[(0.00042936882782308326, reading), (0.00042936882782308326, References), (0.00042936882782308326, View), (0.00042936882782308326, civilizationMediaCivilization), (0.00042936882782308326, History)]


  


In [205]:
wordsTested=[]
maxSent = 20
for index,sentWithRank in enumerate(rankedSents):  
    if index == maxSent:
        break
    sentence=sentWithRank[1]
    for w in rankedWrds:
        rankedWord=str(w[1])
        
        if (' '+rankedWord+' ' in sentence) and (rankedWord not in wordsTested):
            blankSent = sentence.replace(' '+rankedWord+' ',' ____ ')
            print (blankSent)
            print ('Answer: {}\n'.format(rankedWord))
            break

In Asia, China was defeated by Britain in the Opium War and later Britain and France in the Arrow War, forcing it to ____ up to trade with the West.
Answer: open

Canada was unique in the British ____ in that it had a French-speaking province, Quebec, which Britain had gained rule over in the Seven Years' War.


=
Answer: Empire

The major Western ____ in this New Imperialism were Britain, Russia, France, Germany, Italy, and the United States.
Answer: players

Germany was also forced to give up the lands it had gained in the Franco-Prussian War to France, accept responsibility for the war, reduce its military and pay ____ to Britain and France.
Answer: reparations

Missionaries sent from Ireland by the Pope helped to convert England to Christianity in the 6th century as well, restoring that ____ as the dominant in Western Europe.
Answer: faith

The years ____ Britain's victory in the Napoleonic Wars were a period of expansion for the United Kingdom and its former American colonies, whi

In [174]:
wordsTested

[]

In [50]:
stop_words
wordTokens = word_tokenize(rankedSents[0][1])
filteredSent = [w for w in wordTokens if not w in stop_words]

NameError: name 'stop_words' is not defined

In [46]:
for wrd in str(rankedSents[0][1]).split():
    print (wrd)
    
    [item for item in a if ]

The
theory
is
that
in
feature
searches,
it
is
easy
to
spot
the
target,
or
if
it
is
absent,
because
of
the
difference
in
color
between
the
target
and
the
distractors.
