In [7]:
import wikipedia
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import networkx as nx
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [30]:
frmWiki = wikipedia.WikipediaPage('Congnition')

In [31]:
wikiText = frmWiki.content

# Rank Sentences

In [32]:
# We want to use the standard english-language parser.
parser = spacy.load('en')

# Parsing text.
text = parser(wikiText)

# Dividing the text into sentences and storing them as a list of strings.
sentences=[]
for span in text.sents:
    # go from the start to the end of each span, returning each token in the sentence
    # combine each token using join()
    sent = ''.join(text[i].string for i in range(span.start, span.end)).strip()
    sentences.append(sent)

# Creating the tf-idf matrix.
counter = TfidfVectorizer(lowercase=False, 
                          stop_words=None,
                          ngram_range=(1, 1), 
                          analyzer=u'word', 
                          max_df=.5, 
                          min_df=1,
                          max_features=None, 
                          vocabulary=None, 
                          binary=False)

#Applying the vectorizer
data_counts=counter.fit_transform(sentences)

In [42]:
# Calculating similarity
similarity = data_counts * data_counts.T

# Identifying the sentence with the highest rank.
nx_graph = nx.from_scipy_sparse_matrix(similarity)
ranks=nx.pagerank(nx_graph, alpha=.85, tol=.00000001)

rankedSents = sorted(((ranks[i],s) for i,s in enumerate(sentences)),
                reverse=True)
print(rankedSents[:3])


[(0.012856438739566694, 'The theory is that in feature searches, it is easy to spot the target, or if it is absent, because of the difference in color between the target and the distractors.'), (0.012558682305585501, 'The serial position experiment is meant to test a theory of memory that states that when information is given in a serial manner, we tend to remember information in the beginning of the sequence, called the primacy effect, and information in the end of the sequence, called the recency effect.'), (0.012547081947981806, 'In the Brown-Peterson experiment, participants are briefly presented with a trigram and in one particular version of the experiment, they are then given a distractor task, asking them to identify whether a sequence of words are in fact words, or non-words (due to being misspelled, etc.).')]


In [108]:
rankedSents[3][1]

'What is expected is that in the feature searches, reaction time, that is the time it takes for a participant to identify whether a green circle is present or not, should not change as the number of distractors increases.'

# Rank Words

In [107]:
# Removing stop words and punctuation, then getting a list of all unique words in the text
text_filt = [word for word in text if word.is_stop==False and (word.pos_=='NOUN' or word.pos_=='ADJ')]
words=set(text_filt)

#Creating a grid indicating whether words are within 4 places of the target word
adjacency=pd.DataFrame(columns=words,index=words,data=0)

#Iterating through each word in the text and indicating which of the unique words are its neighbors
for i,word in enumerate(text): 
    word=word
    # Checking if any of the word's next four neighbors are in the word list 
    if any([word == item for item in text_filt]):
        # Making sure to stop at the end of the string, even if there are less than four words left after the target.
        end=max(0,len(text)-(len(text)-(i+5)))
        # The potential neighbors.
        nextwords=text[i+1:end]
        # Filtering the neighbors to select only those in the word list
        inset=[x in text_filt for x in nextwords]
        neighbors=[nextwords[i] for i in range(len(nextwords)) if inset[i]]
        # Adding 1 to the adjacency matrix for neighbors of the target word
        if neighbors:
            adjacency.loc[word,neighbors]=adjacency.loc[word,neighbors]+1

print('done!')

done!


In [102]:
list(words)[3].__class__

spacy.tokens.token.Token

In [39]:
# Running TextRank
nx_words = nx.from_numpy_matrix(adjacency.as_matrix())
#nx_words = nx.from_numpy_matrix(adjacency.values())
ranks=nx.pagerank(nx_words, alpha=.85, tol=.00000001)

# Identifying the five most highly ranked keywords
rankedWrds = sorted(((ranks[i],s) for i,s in enumerate(words)),
                reverse=True)
print(rankedWrds[:5])

  


[(0.0019875515489837787, stronger), (0.0019540896304408506, leveling), (0.0019540896304408506, history), (0.0017955244414052931, study), (0.001795524441405293, particular)]


In [77]:
from collections import Counter
print (rankedSents[4][1])
wrdsInSent=[]
for w in rankedWrds:
    # build the list of ranked words that show up
    # in sentence;
    if str(w[1]) in str(rankedSents[4][1]):
        wrdsInSent.append(w[1])
counts = Counter(wrdsInSent)
print (wrdsInSent)
print (counts)

The memory span is projected to be shorter with letters that sound similar and with longer words.
[words, span, word, memory, words, words, longer, memory, words, longer, memory, memory, words, memory, words, letters, words, short, span, words, memory, short, memory, memory, similar, letters, span, memory, memory, memory, words, word, words, longer, word, memory, memory, memory, word, similar, letters, memory, words, word, memory, span, words, words, words, words, letter, word, shorter, words, letters, words, memory, word, letter, letter, memory]
Counter({words: 1, span: 1, word: 1, memory: 1, words: 1, words: 1, longer: 1, memory: 1, words: 1, longer: 1, memory: 1, memory: 1, words: 1, memory: 1, words: 1, letters: 1, words: 1, short: 1, span: 1, words: 1, memory: 1, short: 1, memory: 1, memory: 1, similar: 1, letters: 1, span: 1, memory: 1, memory: 1, memory: 1, words: 1, word: 1, words: 1, longer: 1, word: 1, memory: 1, memory: 1, memory: 1, word: 1, similar: 1, letters: 1, memory: 

In [78]:
rankedWrds

[(0.0019875515489837787, stronger),
 (0.0019540896304408506, leveling),
 (0.0019540896304408506, history),
 (0.0017955244414052931, study),
 (0.001795524441405293, particular),
 (0.001795524441405293, time),
 (0.001795524441405293, nonexistent),
 (0.001795524441405293, analysis),
 (0.0017866124345135442, search),
 (0.0017866124345135442, circles),
 (0.0017866124345135442, middle),
 (0.0017866124345135442, term),
 (0.0017866124345135442, recollection),
 (0.0017866124345135442, empirical),
 (0.0017866124345135442, studies),
 (0.0017644092632437551, conclusion),
 (0.001744855963058043, objects),
 (0.0017439676240588503, people),
 (0.0017439676240588503, information),
 (0.0017423215656569586, Latin),
 (0.0017407051538231703, study),
 (0.0017384081636770295, objective),
 (0.0017368360030244451, human),
 (0.0017368360030244447, words),
 (0.0017368360030244443, reasoning),
 (0.0017344126789491515, Attention),
 (0.001730624270554089, capacity),
 (0.0017245141698075397, span),
 (0.0017245141698

In [50]:
stop_words
wordTokens = word_tokenize(rankedSents[0][1])
filteredSent = [w for w in wordTokens if not w in stop_words]

NameError: name 'stop_words' is not defined

In [46]:
for wrd in str(rankedSents[0][1]).split():
    print (wrd)
    
    [item for item in a if ]

The
theory
is
that
in
feature
searches,
it
is
easy
to
spot
the
target,
or
if
it
is
absent,
because
of
the
difference
in
color
between
the
target
and
the
distractors.
