# Improving Signal

In [1]:
#importing package
import spacy

#loading pipeline

nlp = spacy.load('en')

In [2]:
#retrieving data from Wiki

import wikipedia

def pages_to_sentences(*pages):
    sentences = []
    for page in pages:
        p = wikipedia.page(page)
        doc = nlp(p.content)
        sentences += [sent.text for sent in doc.sents]
    return sentences

In [3]:
#getting the data

animal_sents = pages_to_sentences("Python(programming language)")
language_sents = pages_to_sentences('Reticulated python','Ball python')

documents = animal_sents + language_sents

In [4]:
#training the bag of words model

from sklearn.feature_extraction.text import CountVectorizer

bag_of_words = CountVectorizer()
bag_of_words.fit(documents)
bag_of_words.transform(documents)

<833x2730 sparse matrix of type '<class 'numpy.int64'>'
	with 8776 stored elements in Compressed Sparse Row format>

In [5]:
# creating the word count

word_counts = bag_of_words.transform(documents)
print(word_counts)

  (0, 219)	1
  (0, 1073)	1
  (0, 1170)	1
  (0, 1310)	1
  (0, 1327)	1
  (0, 1396)	1
  (0, 1431)	1
  (0, 1923)	1
  (0, 1944)	1
  (0, 1964)	1
  (1, 37)	1
  (1, 224)	1
  (1, 423)	1
  (1, 516)	1
  (1, 663)	1
  (1, 729)	1
  (1, 846)	1
  (1, 991)	1
  (1, 1126)	1
  (1, 1234)	1
  (1, 1344)	1
  (1, 1697)	1
  (1, 1722)	1
  (1, 1833)	1
  (1, 1964)	1
  :	:
  (825, 274)	1
  (825, 323)	1
  (825, 1829)	1
  (825, 1969)	1
  (826, 928)	1
  (826, 1829)	1
  (827, 323)	1
  (827, 449)	1
  (827, 1964)	2
  (827, 2033)	1
  (828, 2064)	1
  (829, 449)	1
  (829, 938)	1
  (829, 1964)	2
  (829, 2033)	1
  (829, 2122)	1
  (829, 2140)	1
  (829, 2487)	1
  (829, 2510)	1
  (830, 461)	1
  (831, 323)	1
  (831, 449)	1
  (831, 1964)	1
  (831, 2221)	1
  (832, 194)	1


In [6]:
#training the tfidf

from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer()
weights = tfidf.fit_transform(word_counts)
print(weights)

  (0, 1964)	0.1323802684885344
  (0, 1944)	0.4222847703750066
  (0, 1923)	0.2660125192587425
  (0, 1431)	0.3806664126941908
  (0, 1396)	0.2409306124340554
  (0, 1327)	0.16508710343184035
  (0, 1310)	0.39793959179012905
  (0, 1170)	0.39793959179012905
  (0, 1073)	0.35632123410931327
  (0, 219)	0.243933368024236
  (1, 2689)	0.13609411976303154
  (1, 2671)	0.2614763497862767
  (1, 2603)	0.2227801647688002
  (1, 2588)	0.18470529210827996
  (1, 2246)	0.2501265672755872
  (1, 2113)	0.2227801647688002
  (1, 2045)	0.21021636280120004
  (1, 2001)	0.2501265672755872
  (1, 1964)	0.08698382895855365
  (1, 1833)	0.2227801647688002
  (1, 1722)	0.10060722515467523
  (1, 1697)	0.2774729697823741
  (1, 1344)	0.1520907397591289
  (1, 1234)	0.09929201939125203
  (1, 1126)	0.24132299201455426
  :	:
  (825, 1969)	0.46636984727813474
  (825, 1829)	0.6617425768701939
  (825, 323)	0.4791666858634781
  (825, 274)	0.33910944349217925
  (826, 1829)	0.6695574161325341
  (826, 928)	0.7427603021849811
  (827, 2033)

In [7]:
#to improve signals, STOP_WORDS must be eliminated

from spacy.lang.en import STOP_WORDS
print(type(STOP_WORDS))
STOP_WORDS_python = STOP_WORDS.union({'python'})             #adding the word python to the set of unimportant words


<class 'set'>


In [10]:
#Lemmatizating through spaCy
print([word.lemma_ for word in nlp('run runs running ran')])
print([word.lemma_ for word in nlp('skip skips skipped skipping skeptic')])

['run', 'run', 'run', 'run']
['skip', 'skip', 'skip', 'skip', 'skeptic']


In [14]:
# Lemmatiing using SciKit Learn

from sklearn.feature_extraction.text import TfidfVectorizer

#defining function for that
def lemmatizer(text):
    return [word.lemma_ for word in nlp(text)]

stop_words_str = "".join(STOP_WORDS)
stop_words_lemma = set(word.lemma_ for word in nlp(stop_words_str))

#instantiating the vectorizer
tfidf_lemma = TfidfVectorizer(max_features = 100,
                       stop_words = stop_words_lemma.union({'Python'}),
                       tokenizer = lemmatizer)

tfidf_lemma.fit(documents)
print(tfidf_lemma.get_feature_names())

  'stop_words.' % sorted(inconsistent))


['\n', '\n\n', '\n\n\n', '"', "'s", '(', ')', ',', '-', '-PRON-', '.', '1', '2', '3', ':', ';', '<', '=', 'a', 'also', 'an', 'and', 'as', 'at', 'ball', 'be', 'block', 'but', 'by', 'c', 'can', 'captivity', 'class', 'code', 'design', 'development', 'division', 'do', 'eat', 'example', 'expression', 'feature', 'find', 'for', 'from', 'ft', 'function', 'have', 'implementation', 'in', 'include', 'into', 'java', 'language', 'large', 'length', 'library', 'like', 'list', 'long', 'm', 'many', 'module', 'more', 'most', 'name', 'not', 'object', 'of', 'on', 'one', 'operator', 'or', 'other', 'program', 'programming', 'python', 'reference', 'release', 'reticulate', 'small', 'snake', 'some', 'standard', 'statement', 'such', 'support', 'syntax', 'than', 'that', 'the', 'this', 'time', 'to', 'type', 'use', 'version', 'which', 'with', 'write']


In [15]:
lemmatizer('Dogs are running faster than homos can run quickly')

['dog', 'be', 'run', 'faster', 'than', 'homos', 'can', 'run', 'quickly']

In [17]:
# BiGrams Model

bigram_counter = CountVectorizer(max_features=20,         #for dispaying top 20 bigrams
                               ngram_range = (2,2),
                               stop_words = STOP_WORDS.union({'python'}))

bigram_counter.fit(documents)
bigram_counter.get_feature_names()

  'stop_words.' % sorted(inconsistent))


['23 ft',
 'auliya et',
 'ball pythons',
 'et al',
 'floating point',
 'ft length',
 'functional programming',
 'isbn 978',
 'list comprehensions',
 'number incremented',
 'object oriented',
 'oriented programming',
 'programming language',
 'programming languages',
 'reference implementation',
 'reticulated pythons',
 'scripting language',
 'standard library',
 'van rossum',
 'year old']