# Natural Language Processing with Python

In [1]:
corpus = ['To be, or not to be, that is the question:',
          'Whether \'tis nobler in the mind to suffer',
          'The slings and arrows of outrageous fortune']

print(corpus)

['To be, or not to be, that is the question:', "Whether 'tis nobler in the mind to suffer", 'The slings and arrows of outrageous fortune']


In [2]:
from sklearn.feature_extraction import text

In [3]:
vectorizor = text.CountVectorizer()
print(vectorizor)

CountVectorizer()


In [7]:
vectorizor.fit(corpus)

In [9]:
print('Vocabulary: ', vectorizor.vocabulary_)

Vocabulary:  {'to': 18, 'be': 2, 'or': 10, 'not': 8, 'that': 15, 'is': 5, 'the': 16, 'question': 12, 'whether': 19, 'tis': 17, 'nobler': 7, 'in': 4, 'mind': 6, 'suffer': 14, 'slings': 13, 'and': 0, 'arrows': 1, 'of': 9, 'outrageous': 11, 'fortune': 3}


In [11]:
print(vectorizor.get_feature_names_out())

['and' 'arrows' 'be' 'fortune' 'in' 'is' 'mind' 'nobler' 'not' 'of' 'or'
 'outrageous' 'question' 'slings' 'suffer' 'that' 'the' 'tis' 'to'
 'whether']


In [12]:
print(list(vectorizor.vocabulary_.keys()))

['to', 'be', 'or', 'not', 'that', 'is', 'the', 'question', 'whether', 'tis', 'nobler', 'in', 'mind', 'suffer', 'slings', 'and', 'arrows', 'of', 'outrageous', 'fortune']


In [14]:
token_count_matrix = vectorizor.transform(corpus)
print(token_count_matrix)

  (0, 2)	2
  (0, 5)	1
  (0, 8)	1
  (0, 10)	1
  (0, 12)	1
  (0, 15)	1
  (0, 16)	1
  (0, 18)	2
  (1, 4)	1
  (1, 6)	1
  (1, 7)	1
  (1, 14)	1
  (1, 16)	1
  (1, 17)	1
  (1, 18)	1
  (1, 19)	1
  (2, 0)	1
  (2, 1)	1
  (2, 3)	1
  (2, 9)	1
  (2, 11)	1
  (2, 13)	1
  (2, 16)	1


In [17]:
dense_tcm = token_count_matrix.toarray()
dense_tcm

array([[0, 0, 2, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 2, 0],
       [0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1],
       [1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0]],
      dtype=int64)

In [18]:
feature_names = vectorizor.get_feature_names_out()

for el in vectorizor.vocabulary_:
    print(el)

to
be
or
not
that
is
the
question
whether
tis
nobler
in
mind
suffer
slings
and
arrows
of
outrageous
fortune


In [19]:
import pandas as pd

pd.DataFrame(data=dense_tcm, index=['corpus_0', 'corpus_1', 'corpus_2'],
             columns=vectorizor.get_feature_names_out())

Unnamed: 0,and,arrows,be,fortune,in,is,mind,nobler,not,of,or,outrageous,question,slings,suffer,that,the,tis,to,whether
corpus_0,0,0,2,0,0,1,0,0,1,0,1,0,1,0,0,1,1,0,2,0
corpus_1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,1,1,1,1
corpus_2,1,1,0,1,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0


In [20]:
word = 'be'

i = 1
j = vectorizor.vocabulary_[word]

print('number of times "' + word+ '" occurs in:')

for i in range(len(corpus)):
    print('    "' + corpus[i] + '": ' + str(dense_tcm[i][j]))

number of times "be" occurs in:
    "To be, or not to be, that is the question:": 2
    "Whether 'tis nobler in the mind to suffer": 0
    "The slings and arrows of outrageous fortune": 0


In [21]:
txt = 'That is the question and it is nobler in the mind.'

vectorizor.transform([txt]).toarray()

array([[1, 0, 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 0, 0]],
      dtype=int64)

In [22]:
print(vectorizor.get_feature_names_out())

['and' 'arrows' 'be' 'fortune' 'in' 'is' 'mind' 'nobler' 'not' 'of' 'or'
 'outrageous' 'question' 'slings' 'suffer' 'that' 'the' 'tis' 'to'
 'whether']


In [23]:
print(vectorizor.vocabulary_)

{'to': 18, 'be': 2, 'or': 10, 'not': 8, 'that': 15, 'is': 5, 'the': 16, 'question': 12, 'whether': 19, 'tis': 17, 'nobler': 7, 'in': 4, 'mind': 6, 'suffer': 14, 'slings': 13, 'and': 0, 'arrows': 1, 'of': 9, 'outrageous': 11, 'fortune': 3}


In [24]:
from sklearn.feature_extraction import text

corpus = ['It does not matter what you are doing, just do it!',
          'Would you work if you won the lottery?',
          'You like Python, he likes Python, we like Python, everybody loves Python!',
          'You said: "I wish I were a Python programmer"',
          'You can stay here, if you want to. I would, if I were you.']

In [25]:
vectorizor = text.CountVectorizer()
vectorizor.fit(corpus)

token_count_matrix = vectorizor.transform(corpus)
print(token_count_matrix)

  (0, 0)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 9)	2
  (0, 10)	1
  (0, 15)	1
  (0, 16)	1
  (0, 26)	1
  (0, 31)	1
  (1, 8)	1
  (1, 13)	1
  (1, 21)	1
  (1, 28)	1
  (1, 29)	1
  (1, 30)	1
  (1, 31)	2
  (2, 5)	1
  (2, 6)	1
  (2, 11)	2
  (2, 12)	1
  (2, 14)	1
  (2, 18)	4
  (2, 24)	1
  (2, 31)	1
  (3, 17)	1
  (3, 18)	1
  (3, 19)	1
  (3, 25)	1
  (3, 27)	1
  (3, 31)	1
  (4, 1)	1
  (4, 7)	1
  (4, 8)	2
  (4, 20)	1
  (4, 22)	1
  (4, 23)	1
  (4, 25)	1
  (4, 30)	1
  (4, 31)	3


In [26]:
tf_idf = text.TfidfTransformer()

tf_idf.fit(token_count_matrix)

tf_idf.idf_

array([2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 1.69314718, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 1.69314718, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       1.69314718, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       1.69314718, 1.        ])

In [28]:
tf_idf.idf_[vectorizor.vocabulary_['python']]

1.6931471805599454

In [29]:
da = vectorizor.transform(corpus).toarray()
i = 0

word_ind = vectorizor.vocabulary_['would']
da[i][word_ind]
da[:, word_ind]

array([0, 1, 0, 0, 1], dtype=int64)

In [30]:
word_weight_list = list(zip(vectorizor.get_feature_names_out(), tf_idf.idf_))

word_weight_list.sort(key=lambda x:x[1])

for word, idf_weight in word_weight_list:
    print(f'{word:15s}: {idf_weight:4.3f}')

you            : 1.000
if             : 1.693
python         : 1.693
were           : 1.693
would          : 1.693
are            : 2.099
can            : 2.099
do             : 2.099
does           : 2.099
doing          : 2.099
everybody      : 2.099
he             : 2.099
here           : 2.099
it             : 2.099
just           : 2.099
like           : 2.099
likes          : 2.099
lottery        : 2.099
loves          : 2.099
matter         : 2.099
not            : 2.099
programmer     : 2.099
said           : 2.099
stay           : 2.099
the            : 2.099
to             : 2.099
want           : 2.099
we             : 2.099
what           : 2.099
wish           : 2.099
won            : 2.099
work           : 2.099


In [32]:
print(corpus)

['It does not matter what you are doing, just do it!', 'Would you work if you won the lottery?', 'You like Python, he likes Python, we like Python, everybody loves Python!', 'You said: "I wish I were a Python programmer"', 'You can stay here, if you want to. I would, if I were you.']


In [33]:
from numpy import log
from sklearn.feature_extraction import text

n = len(corpus)

vectorizor = text.CountVectorizer()
vectorizor.fit(corpus)
da = vectorizor.transform(corpus).toarray()

In [41]:
def tf(t, d, mode='raw'):
    if t in vectorizor.vocabulary_:
        word_ind = vectorizor.vocabulary_[t]
        t_occurences = da[d, word_ind]
    else:
        t_occurences = 0

    if mode == 'raw':
        result = t_occurences
    elif mode == 'length':
        all_terms = (da[d] > 0).sum() 
        result = t_occurences / all_terms
    elif mode == 'log':
        result = log(1 + t_occurences)
    elif mode == 'augfreq':
        result = 0.5 + 0.5 * t_occurences / da[d].max()

    return result

In [42]:
print('   raw    length  log    augmented freq')

for term in ['matter', 'python', 'would']:
    for docu_index in range(len(corpus)):
        d = corpus[docu_index]

        print(f"\n'{term}' in '{d}'")

        for mode in ['raw', 'length', 'log', 'augfreq']:
            x = tf(term, docu_index, mode=mode)
            print(f"{x:7.2f}", end="")

   raw    length  log    augmented freq

'matter' in 'It does not matter what you are doing, just do it!'
   1.00   0.10   0.69   0.75
'matter' in 'Would you work if you won the lottery?'
   0.00   0.00   0.00   0.50
'matter' in 'You like Python, he likes Python, we like Python, everybody loves Python!'
   0.00   0.00   0.00   0.50
'matter' in 'You said: "I wish I were a Python programmer"'
   0.00   0.00   0.00   0.50
'matter' in 'You can stay here, if you want to. I would, if I were you.'
   0.00   0.00   0.00   0.50
'python' in 'It does not matter what you are doing, just do it!'
   0.00   0.00   0.00   0.50
'python' in 'Would you work if you won the lottery?'
   0.00   0.00   0.00   0.50
'python' in 'You like Python, he likes Python, we like Python, everybody loves Python!'
   4.00   0.50   1.61   1.00
'python' in 'You said: "I wish I were a Python programmer"'
   1.00   0.17   0.69   1.00
'python' in 'You can stay here, if you want to. I would, if I were you.'
   0.00   0.00   0.0

In [43]:
def df(t):
    word_ind = vectorizor.vocabulary_[t]

    tf_in_docus = da[:, word_ind]
    existence_in_docus = tf_in_docus > 0

    return existence_in_docus.sum()

def idf(t, smooth_idf=True):
    if smooth_idf:
        return log((1 + n) / (1 + df(t))) + 1
    else:
        return log(n / df(t)) + 1
    
def tf_idf(t, d):
    return idf(t) * tf(t, d)


res_idf = []

for word in vectorizor.get_feature_names_out():
    tf_docus = []
    res_idf.append([word, idf(word)])

res_idf.sort(key=lambda x:x[1])

for item in res_idf:
    print(item)

['you', 1.0]
['if', 1.6931471805599454]
['python', 1.6931471805599454]
['were', 1.6931471805599454]
['would', 1.6931471805599454]
['are', 2.09861228866811]
['can', 2.09861228866811]
['do', 2.09861228866811]
['does', 2.09861228866811]
['doing', 2.09861228866811]
['everybody', 2.09861228866811]
['he', 2.09861228866811]
['here', 2.09861228866811]
['it', 2.09861228866811]
['just', 2.09861228866811]
['like', 2.09861228866811]
['likes', 2.09861228866811]
['lottery', 2.09861228866811]
['loves', 2.09861228866811]
['matter', 2.09861228866811]
['not', 2.09861228866811]
['programmer', 2.09861228866811]
['said', 2.09861228866811]
['stay', 2.09861228866811]
['the', 2.09861228866811]
['to', 2.09861228866811]
['want', 2.09861228866811]
['we', 2.09861228866811]
['what', 2.09861228866811]
['wish', 2.09861228866811]
['won', 2.09861228866811]
['work', 2.09861228866811]


In [44]:
corpus

['It does not matter what you are doing, just do it!',
 'Would you work if you won the lottery?',
 'You like Python, he likes Python, we like Python, everybody loves Python!',
 'You said: "I wish I were a Python programmer"',
 'You can stay here, if you want to. I would, if I were you.']

In [45]:
for word, word_index in vectorizor.vocabulary_.items():
    print(f"\n{word:12s}: ", end='')

    for d_index in range(len(corpus)):
        print(f"{d_index:1d} {tf_idf(word, d_index):3.2f}, ", end="")


it          : 0 4.20, 1 0.00, 2 0.00, 3 0.00, 4 0.00, 
does        : 0 2.10, 1 0.00, 2 0.00, 3 0.00, 4 0.00, 
not         : 0 2.10, 1 0.00, 2 0.00, 3 0.00, 4 0.00, 
matter      : 0 2.10, 1 0.00, 2 0.00, 3 0.00, 4 0.00, 
what        : 0 2.10, 1 0.00, 2 0.00, 3 0.00, 4 0.00, 
you         : 0 1.00, 1 2.00, 2 1.00, 3 1.00, 4 3.00, 
are         : 0 2.10, 1 0.00, 2 0.00, 3 0.00, 4 0.00, 
doing       : 0 2.10, 1 0.00, 2 0.00, 3 0.00, 4 0.00, 
just        : 0 2.10, 1 0.00, 2 0.00, 3 0.00, 4 0.00, 
do          : 0 2.10, 1 0.00, 2 0.00, 3 0.00, 4 0.00, 
would       : 0 0.00, 1 1.69, 2 0.00, 3 0.00, 4 1.69, 
work        : 0 0.00, 1 2.10, 2 0.00, 3 0.00, 4 0.00, 
if          : 0 0.00, 1 1.69, 2 0.00, 3 0.00, 4 3.39, 
won         : 0 0.00, 1 2.10, 2 0.00, 3 0.00, 4 0.00, 
the         : 0 0.00, 1 2.10, 2 0.00, 3 0.00, 4 0.00, 
lottery     : 0 0.00, 1 2.10, 2 0.00, 3 0.00, 4 0.00, 
like        : 0 0.00, 1 0.00, 2 4.20, 3 0.00, 4 0.00, 
python      : 0 0.00, 1 0.00, 2 6.77, 3 1.69, 4 0.00, 
he       

In [47]:
from sklearn.feature_extraction import text

word = 'Cold wind blows over the cornfields'.split()

corpus = []

for i in range(1, len(word) + 1):
    corpus.append(' '.join(word[:i]))

print(corpus)

['Cold', 'Cold wind', 'Cold wind blows', 'Cold wind blows over', 'Cold wind blows over the', 'Cold wind blows over the cornfields']


In [48]:
vectorizor = text.CountVectorizer()

vectorizor = vectorizor.fit(corpus)
vectorizor_text = vectorizor.transform(corpus)

In [49]:
tf_idf = text.TfidfTransformer()
tf_idf.fit(vectorizor_text)

tf_idf.idf_

array([1.33647224, 1.        , 2.25276297, 1.55961579, 1.84729786,
       1.15415068])

In [50]:
word_weight_list = list(zip(vectorizor.get_feature_names_out(),
                            tf_idf.idf_))
word_weight_list.sort(key=lambda x:x[1])

for word, idf_weight in word_weight_list:
    print(f"{word:15s}: {idf_weight:4.3f}")

cold           : 1.000
wind           : 1.154
blows          : 1.336
over           : 1.560
the            : 1.847
cornfields     : 2.253


In [51]:
TfidF = text.TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf = TfidF.fit_transform(vectorizor_text)

word_weight_list = list(zip(vectorizor.get_feature_names_out(),
                            tf_idf.idf_))
word_weight_list.sort(key=lambda x:x[1])

for word, idf_weight in word_weight_list:
    print(f"{word:15s}: {idf_weight:4.3f}")

cold           : 1.000
wind           : 1.154
blows          : 1.336
over           : 1.560
the            : 1.847
cornfields     : 2.253


In [52]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np

vectorizor = CountVectorizer()

newsgroups_data = fetch_20newsgroups()

In [53]:
print(newsgroups_data.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [54]:
print(newsgroups_data.data[200])

Subject: Re: "Proper gun control?" What is proper gun cont
From: kim39@scws8.harvard.edu (John Kim)
Organization: Harvard University Science Center
Nntp-Posting-Host: scws8.harvard.edu
Lines: 17

In article <C5JGz5.34J@SSD.intel.com> hays@ssd.intel.com (Kirk Hays) writes:
>I'd like to point out that I was in error - "Terminator" began posting only 
>six months before he purchased his first firearm, according to private email
>from him.
>I can't produce an archived posting of his earlier than January 1992,
>and he purchased his first firearm in March 1992.
>I guess it only seemed like years.
>Kirk Hays - NRA Life, seventh generation.

I first read and consulted rec.guns in the summer of 1991.  I
just purchased my first firearm in early March of this year.

 NOt for lack of desire for a firearm, you understand.  I could 
have purchased a rifle or shotgun but didn't want one.
-Case Kim





In [55]:
vectorizor.fit(newsgroups_data.data)

In [56]:
counter = 0
n = 10

for word, index in vectorizor.vocabulary_.items():
    print(word, index)
    
    counter += 1

    if counter > n:
        break

from 56979
lerxst 75358
wam 123162
umd 118280
edu 50527
where 124031
my 85354
thing 114688
subject 111322
what 123984
car 37780


In [57]:
a = vectorizor.transform([newsgroups_data.data[0]]).toarray()[0]
print(a)

[0 0 0 ... 0 0 0]


In [58]:
len(vectorizor.vocabulary_)

130107

In [59]:
newsgroups_data_cleaned = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))

In [60]:
print(newsgroups_data_cleaned.data[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [61]:
vectorizor_cleaned = vectorizor.fit(newsgroups_data_cleaned.data)
len(vectorizor_cleaned.vocabulary_)

101631

In [62]:
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [63]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

vectorizor = CountVectorizer()

In [64]:
train_data = vectorizor.fit_transform(newsgroups_train.data)

classifier = MultinomialNB(alpha=0.01)
classifier.fit(train_data, newsgroups_data.target)

test_data = vectorizor.transform(newsgroups_test.data)

predictions = classifier.predict(test_data)
accuracy_score = metrics.accuracy_score(newsgroups_test.target, predictions)

f1_Score = metrics.f1_score(newsgroups_test.target, predictions, average='macro')

print('Accuracy score: ', accuracy_score)
print('F1 score: ', f1_Score)

Accuracy score:  0.6460435475305364
F1 score:  0.6203806145034193


In [67]:
corpus = ["A horse, a horse, my kingdom for a horse!",
          "Horse sense is the thing a horse has which keeps it from betting on people."
          "I’ve often said there is nothing better for the inside of the man, than the outside of the horse.",
          "A man on a horse is spiritually, as well as physically, bigger then a man on foot.",
          "No heaven can heaven be, if my horse isn’t there to welcome me."]

cv = CountVectorizer(stop_words=["my", "for","the", "has", "than", "if", 
                                 "from", "on", "of", "it", "there", "ve",
                                 "as", "no", "be", "which", "isn", "to", 
                                 "me", "is", "can", "then"])
count_vector = cv.fit_transform(corpus)
count_vector.shape

cv.vocabulary_

{'horse': 5,
 'kingdom': 8,
 'sense': 16,
 'thing': 18,
 'keeps': 7,
 'betting': 1,
 'people': 13,
 'often': 11,
 'said': 15,
 'nothing': 10,
 'better': 0,
 'inside': 6,
 'man': 9,
 'outside': 12,
 'spiritually': 17,
 'well': 20,
 'physically': 14,
 'bigger': 2,
 'foot': 3,
 'heaven': 4,
 'welcome': 19}

In [69]:
n = 25

print(str(n) + " arbitrary words from ENGLISH_STOP_WORDS:")

counter = 0

for word in text.ENGLISH_STOP_WORDS:
    if counter == n - 1:
        print(word)
        break

    print(word, end=', ')
    counter += 1

25 arbitrary words from ENGLISH_STOP_WORDS:
across, always, somehow, any, sincere, become, there, from, something, whose, myself, if, next, put, each, yet, though, anyone, bill, thick, cry, them, were, its, everything


In [71]:
vectorizor = CountVectorizer(stop_words=list(text.ENGLISH_STOP_WORDS))

vectors = vectorizor.fit_transform(newsgroups_train.data)

classifier = MultinomialNB(alpha=0.01)
classifier.fit(vectors, newsgroups_train.target)

vectors_test = vectorizor.transform(newsgroups_test.data)

predictions = classifier.predict(vectors_test)
accuracy_score = metrics.accuracy_score(newsgroups_test.target, predictions)

f1_score = metrics.f1_score(newsgroups_test.target, predictions, average='macro')

print('accuracy score: ', accuracy_score)
print('f1_Score: ', f1_Score)

accuracy score:  0.6526818906001062
f1_Score:  0.6203806145034193


In [72]:
corpus = ["""People say you cannot live without love, 
             but I think oxygen is more important""",
          "Sometimes, when you close your eyes, you cannot see."
          "A horse, a horse, my kingdom for a horse!",
          """Horse sense is the thing a horse has which 
          keeps it from betting on people."""
          """I’ve often said there is nothing better for 
          the inside of the man, than the outside of the horse.""",
          """A man on a horse is spiritually, as well as physically, 
          bigger then a man on foot.""",
          """No heaven can heaven be, if my horse isn’t there 
          to welcome me."""]

cv = CountVectorizer(min_df=2)

count_vector = cv.fit_transform(corpus)
cv.vocabulary_

{'people': 7,
 'you': 9,
 'cannot': 0,
 'is': 3,
 'horse': 2,
 'my': 5,
 'for': 1,
 'on': 6,
 'there': 8,
 'man': 4}

In [73]:
cv.stop_words_

{'as',
 'be',
 'better',
 'betting',
 'bigger',
 'but',
 'can',
 'close',
 'eyes',
 'foot',
 'from',
 'has',
 'heaven',
 'if',
 'important',
 'inside',
 'isn',
 'it',
 'keeps',
 'kingdom',
 'live',
 'love',
 'me',
 'more',
 'no',
 'nothing',
 'of',
 'often',
 'outside',
 'oxygen',
 'physically',
 'said',
 'say',
 'see',
 'sense',
 'sometimes',
 'spiritually',
 'than',
 'the',
 'then',
 'thing',
 'think',
 'to',
 've',
 'welcome',
 'well',
 'when',
 'which',
 'without',
 'your'}

In [74]:
print('number of docus, size of vocabulary, stop_words list size')

for i in range(len(corpus)):
    cv = CountVectorizer(min_df=i)
    count_vector = cv.fit_transform(corpus)

    len_voc = len(cv.vocabulary_)
    len_stop_words = len(cv.stop_words_)

    print(f"{i:10d} {len_voc:15d} {len_stop_words: 19d}")

number of docus, size of vocabulary, stop_words list size
         0              60                   0
         1              60                   0
         2              10                  50
         3               2                  58
         4               1                  59


In [75]:
cv = CountVectorizer(max_df=0.20)

count_vector = cv.fit_transform(corpus)
cv.stop_words_

{'cannot', 'for', 'horse', 'is', 'man', 'my', 'on', 'people', 'there', 'you'}