In [1]:
import spacy

In [3]:
import en_core_web_sm

nlp = en_core_web_sm.load()

In [4]:
text = nlp('coding is good and healthy for mind joging')
for sent in text:
    print(sent)

coding
is
good
and
healthy
for
mind
joging


In [8]:
print(text[:2])
print(text[:7])

coding is
coding is good and healthy for mind


In [12]:
doc = nlp('The brown fox has jumped over the lazy dog. Mr peanut wears a top hat')
tags = set()

for word in doc:
    tags.add(word.tag_)
    print((word.text,word.pos_,word.tag_))

print()
for tag in tags:
    print(tag, spacy.explain(tag))
    

('The', 'DET', 'DT')
('brown', 'ADJ', 'JJ')
('fox', 'NOUN', 'NN')
('has', 'AUX', 'VBZ')
('jumped', 'VERB', 'VBN')
('over', 'ADP', 'IN')
('the', 'DET', 'DT')
('lazy', 'ADJ', 'JJ')
('dog', 'NOUN', 'NN')
('.', 'PUNCT', '.')
('Mr', 'PROPN', 'NNP')
('peanut', 'NOUN', 'NN')
('wears', 'VERB', 'VBZ')
('a', 'DET', 'DT')
('top', 'ADJ', 'JJ')
('hat', 'NOUN', 'NN')

VBZ verb, 3rd person singular present
NN noun, singular or mass
VBN verb, past participle
. punctuation mark, sentence closer
NNP noun, proper singular
IN conjunction, subordinating or preposition
JJ adjective
DT determiner


In [13]:
import wikipedia

def pages_to_sentences(*pages):
    sentenses = []
    for page in pages:
        p = wikipedia.page(page)
        doc = nlp(p.content)
        sentenses += [sent.text for sent in doc.sents]
    return sentenses
animal_sents = pages_to_sentences('Reticulated python','Ball Python')
program_sents = pages_to_sentences('Python (programming language)')
documents = animal_sents + program_sents

print(animal_sents[:5])
print()
print(program_sents[:5])

['The reticulated python (Malayopython reticulatus) is a snake species in the family Pythonidae native to South and Southeast Asia.', "It is the world's longest snake and listed as least concern on the IUCN Red List because of its wide distribution.", 'In several range countries, it is hunted for its skin, for use in traditional medicine, and for sale as a pet.', 'It is an excellent swimmer, has been reported far out at sea and has colonized many small islands within its range.\n', 'It is among the three heaviest snakes.']

['Python is an interpreted, high-level, general-purpose programming language.', "Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace.", 'Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects.', 'Python is dynamically typed and garbage-collected.', 'It supports multiple programmin

In [19]:
type(documents)

list

In [22]:
documents[0]

'The reticulated python (Malayopython reticulatus) is a snake species in the family Pythonidae native to South and Southeast Asia.'

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

bag_of_words = CountVectorizer()
bag_of_words.fit(documents)
word_counts = bag_of_words.transform(documents)

print(word_counts)
word_counts

  (0, 228)	1
  (0, 279)	1
  (0, 987)	1
  (0, 1279)	1
  (0, 1371)	1
  (0, 1546)	1
  (0, 1699)	1
  (0, 2028)	1
  (0, 2031)	1
  (0, 2151)	1
  (0, 2153)	1
  (0, 2350)	1
  (0, 2372)	1
  (0, 2373)	1
  (0, 2383)	1
  (0, 2555)	2
  (0, 2592)	1
  (1, 228)	1
  (1, 278)	1
  (1, 344)	1
  (1, 600)	1
  (1, 797)	1
  (1, 1371)	1
  (1, 1376)	1
  (1, 1388)	1
  :	:
  (847, 1372)	1
  (848, 51)	1
  (848, 1571)	1
  (848, 2485)	1
  (849, 1279)	1
  (849, 1986)	1
  (849, 2028)	1
  (850, 71)	1
  (850, 845)	1
  (851, 162)	1
  (851, 1979)	1
  (851, 2751)	1
  (852, 77)	1
  (852, 114)	1
  (852, 130)	1
  (852, 1372)	1
  (854, 975)	1
  (854, 1492)	1
  (855, 293)	1
  (855, 689)	1
  (855, 1440)	1
  (855, 1781)	1
  (855, 1986)	1
  (855, 2028)	1
  (855, 2741)	1


<856x2828 sparse matrix of type '<class 'numpy.int64'>'
	with 9166 stored elements in Compressed Sparse Row format>

In [25]:
bag_of_words.get_feature_names()

['000',
 '10',
 '11',
 '111',
 '116',
 '12',
 '125',
 '13',
 '130',
 '14',
 '15',
 '1500',
 '158',
 '15806',
 '16',
 '165',
 '17',
 '18',
 '1801',
 '1802',
 '1803',
 '182',
 '1830',
 '1849',
 '19',
 '1910s',
 '1927',
 '1932',
 '1950s',
 '1956',
 '1960s',
 '1963',
 '1972',
 '1978',
 '1980',
 '1980s',
 '1989',
 '1991',
 '1992',
 '1993',
 '1995',
 '20',
 '200',
 '2000',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2015',
 '2017',
 '2018',
 '2019',
 '2020',
 '20th',
 '21',
 '22',
 '23',
 '2415',
 '25',
 '26',
 '28',
 '29',
 '2d',
 '2nd',
 '2to3',
 '30',
 '300',
 '31',
 '32',
 '321',
 '333',
 '35',
 '350',
 '356',
 '36',
 '3d',
 '3ds',
 '3rd',
 '40',
 '400',
 '42',
 '43',
 '4302',
 '44',
 '45',
 '47',
 '48',
 '4d',
 '50',
 '500',
 '51',
 '521',
 '53',
 '54',
 '55',
 '561',
 '59',
 '59059',
 '596',
 '5th',
 '60',
 '600',
 '604259',
 '61',
 '635',
 '67',
 '68056',
 '69',
 '70',
 '72',
 '72596',
 '74',
 '75',
 '78',
 '79',
 '80',

In [28]:
bag_of_words.vocabulary_

{'the': 2555,
 'reticulated': 2151,
 'python': 2028,
 'malayopython': 1546,
 'reticulatus': 2153,
 'is': 1371,
 'snake': 2350,
 'species': 2383,
 'in': 1279,
 'family': 987,
 'pythonidae': 2031,
 'native': 1699,
 'to': 2592,
 'south': 2372,
 'and': 228,
 'southeast': 2373,
 'asia': 279,
 'it': 1376,
 'world': 2794,
 'longest': 1518,
 'listed': 1496,
 'as': 278,
 'least': 1463,
 'concern': 600,
 'on': 1790,
 'iucn': 1390,
 'red': 2081,
 'list': 1495,
 'because': 344,
 'of': 1776,
 'its': 1388,
 'wide': 2771,
 'distribution': 797,
 'several': 2279,
 'range': 2047,
 'countries': 671,
 'hunted': 1234,
 'for': 1039,
 'skin': 2336,
 'use': 2677,
 'traditional': 2609,
 'medicine': 1602,
 'sale': 2207,
 'pet': 1886,
 'an': 223,
 'excellent': 936,
 'swimmer': 2502,
 'has': 1178,
 'been': 346,
 'reported': 2121,
 'far': 989,
 'out': 1827,
 'at': 293,
 'sea': 2242,
 'colonized': 543,
 'many': 1565,
 'small': 2346,
 'islands': 1374,
 'within': 2784,
 'among': 221,
 'three': 2579,
 'heaviest': 1192

In [32]:
counts_animal = bag_of_words.transform(animal_sents)
counts_program = bag_of_words.transform(program_sents)

ind_programming = bag_of_words.vocabulary_['programming']

print(counts_animal.sum(axis=0)[0, ind_programming])
print(counts_program.sum(axis=0)[0, ind_programming])

0
30


In [35]:
from sklearn.feature_extraction.text import HashingVectorizer

hashing_bag_of_words = HashingVectorizer(norm=None)
hashing_bag_of_words.fit(documents)
hashing_bag_of_words.transform(documents)

<856x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 9166 stored elements in Compressed Sparse Row format>

In [38]:
import time 

t_0 = time.time()
CountVectorizer().fit_transform(documents)
time_elapse = time.time()-t_0
print('count vectorizer time {}'.format(time_elapse))

t_0 = time.time()
HashingVectorizer(norm=None).fit_transform(documents)
time_elapse =time.time() - t_0
print('hash vectorizer time {}'.format(time_elapse))

count vectorizer time 0.12081050872802734
hash vectorizer time 0.051918745040893555


In [40]:
#word_counts = counts_animal + counts_program
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
tfidf_weights = tfidf.fit_transform(word_counts)
print(tfidf_weights)

  (0, 2592)	0.11586225203909407
  (0, 2555)	0.1981108895875376
  (0, 2383)	0.26885735503913727
  (0, 2373)	0.31533309122087
  (0, 2372)	0.3017015278652415
  (0, 2350)	0.20467339368465554
  (0, 2153)	0.27518462327317567
  (0, 2151)	0.1975879632720815
  (0, 2031)	0.33454570069134576
  (0, 2028)	0.10482194760429729
  (0, 1699)	0.27518462327317567
  (0, 1546)	0.2911280658791825
  (0, 1371)	0.1306013411949939
  (0, 1279)	0.11704988626970254
  (0, 987)	0.33454570069134576
  (0, 279)	0.3017015278652415
  (0, 228)	0.10408443660292804
  (1, 2794)	0.2632706988439691
  (1, 2771)	0.2564633278074306
  (1, 2555)	0.18463305621576495
  (1, 2350)	0.19074910158006575
  (1, 2081)	0.27132210988183536
  (1, 1790)	0.1705309634271401
  (1, 1776)	0.11091752516850925
  (1, 1518)	0.27132210988183536
  :	:
  (847, 65)	0.5460962275187211
  (848, 2485)	0.6409874803477728
  (848, 1571)	0.5577995622744075
  (848, 51)	0.5272520254715888
  (849, 2028)	0.3937897826095356
  (849, 1986)	0.8071986741004371
  (849, 1279)	0

In [42]:
top_tfidf_score = tfidf.idf_.argsort()[:-20:-1]
ind_feature = bag_of_words.get_feature_names()

for ind in top_tfidf_score:
    print(tfidf.idf_[ind],ind_feature[ind])

7.060290738037835 zope
7.060290738037835 handle
7.060290738037835 gripped
7.060290738037835 gripping
7.060290738037835 groovy
7.060290738037835 grow
7.060290738037835 growing
7.060290738037835 grumpy
7.060290738037835 guard
7.060290738037835 guide
7.060290738037835 guinea
7.060290738037835 gutted
7.060290738037835 göttingen
7.060290738037835 habitation
7.060290738037835 hall
7.060290738037835 hamilton
7.060290738037835 handled
7.060290738037835 functools
7.060290738037835 handling


In [45]:
from spacy.lang.en import STOP_WORDS
print(type(STOP_WORDS))
STOP_WORDS_python = STOP_WORDS.union({'python'})
STOP_WORDS_python

<class 'set'>


{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [49]:
print([word.lemma_ for word in nlp('run runs ran running runs')])
print([word.lemma_ for word in nlp('dance dancing dances')])
print([word.lemma_ for word in nlp('buy buys bought buying')])
print([word.lemma_ for word in nlp('sea saw see seen seeing')])

['run', 'run', 'run', 'run', 'run']
['dance', 'dance', 'dance']
['buy', 'buys', 'buy', 'buying']
['sea', 'see', 'see', 'see', 'see']


In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer

def lemmatizer(text):
    return [word.lemma_ for word in nlp(text)]
    
stop_word = ''.join(STOP_WORDS)
lemma_words = set(word.lemma_ for word in nlp(stop_word))
tfidf_vector = TfidfVectorizer(max_features=100,
                               stop_words = STOP_WORDS.union({'python'}),
                               tokenizer=lemmatizer)
tfidf_vector.fit(documents)
print(tfidf_vector.get_feature_names())

  'stop_words.' % sorted(inconsistent))


['\n', '\n\n', '\n\n\n', ' ', '"', "'", '(', ')', ',', '-', '-PRON-', '.', '1', '2', '3', ':', ';', '<', '=', 'allow', 'b', 'ball', 'block', 'body', 'breed', 'c', 'captivity', 'class', 'code', 'common', 'cpython', 'describe', 'design', 'development', 'division', 'e.g.', 'eat', 'egg', 'example', 'expression', 'feature', 'female', 'find', 'ft', 'function', 'generator', 'human', 'implementation', 'include', 'integer', 'island', 'java', 'kill', 'language', 'large', 'later', 'length', 'library', 'like', 'list', 'long', 'm', 'm.', 'male', 'measure', 'method', 'module', 'new', 'number', 'object', 'old', 'operator', 'pattern', 'pet', 'program', 'programming', 'propose', 'provide', 'r.', 'range', 'reference', 'release', 'report', 'reticulate', 'small', 'snake', 'standard', 'statement', 'string', 'support', 'syntax', 'system', 'time', 'type', 'use', 'value', 'variable', 'version', 'write', 'year']


In [69]:
bgrams_counter = CountVectorizer(max_features=20,ngram_range=(2,2),stop_words=STOP_WORDS.union({"python"}))
bgrams_counter.fit(documents)
bgrams_counter.get_feature_names()

  'stop_words.' % sorted(inconsistent))


['23 ft',
 'ball pythons',
 'block code',
 'floating point',
 'ft length',
 'guido van',
 'isbn 978',
 'list comprehensions',
 'new features',
 'object oriented',
 'oriented programming',
 'programming language',
 'programming languages',
 'reference implementation',
 'reticulated pythons',
 'scripting language',
 'spam eggs',
 'standard library',
 'van rossum',
 'year old']

In [79]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

documents = animal_sents + program_sents
label = ['animal']*len(animal_sents) + ['program']*len(program_sents)

stop_words_str = ''.join(STOP_WORDS)
set_lemma_word = set(word.lemma_ for word in nlp(stop_words_str))

tfidf_lemma = TfidfVectorizer(stop_words = set_lemma_word,tokenizer=lemmatizer, ngram_range=(1, 2))
pipe = Pipeline([('vectorizer',tfidf_lemma),('classifier',MultinomialNB())])
pipe.fit(documents, label)
print('Training accuracy: {}'.format(pipe.score(documents,label)))

Training accuracy: 0.9170560747663551


In [85]:
test_docs = ["My Python program is only 100 bytes long.",
             "A python's bite is not venomous but still hurts.",
             "I can't find the error in the python code.",
             "Where is my pet python; I can't find her!",
             "I use for and while loops when writing Python.",
             "The python will loop and wrap itself onto me.",
             "I use snake case for naming my variables.",
             "My python has grown to over 10 ft long!",
             "I use virtual environments to manage package versions.",
             "Pythons are the largest snakes in the environment."]
class_label = ['animal','language']
y_prob = pipe.predict_proba(test_docs)
predicted_indices = (y_prob[:, 1] > 0.5).astype(int)

for i, index in enumerate (predicted_indices):
    print(test_docs[i], '--> {} at {:g}%'.format(class_label[index], 100 * y_prob[i,index]))

My Python program is only 100 bytes long. --> language at 68.796%
A python's bite is not venomous but still hurts. --> language at 64.8024%
I can't find the error in the python code. --> language at 74.2181%
Where is my pet python; I can't find her! --> language at 62.5303%
I use for and while loops when writing Python. --> language at 86.4272%
The python will loop and wrap itself onto me. --> language at 70.8047%
I use snake case for naming my variables. --> language at 71.6721%
My python has grown to over 10 ft long! --> language at 51.7361%
I use virtual environments to manage package versions. --> language at 84.5939%
Pythons are the largest snakes in the environment. --> animal at 63.4977%


In [53]:
lemmatizer('Dogs are running quickly.')

['dog', 'be', 'run', 'quickly', '.']


In [59]:
TfidfVectorizer?

In [None]:
,