In [32]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import numpy as np

In [11]:
line = "I saw Mr. Ajay at the party. He was very happy. His email is ajay@cb.lk."

In [14]:
sent_tokenize(line)

['I saw Mr. Ajay at the party.',
 'He was very happy.',
 'His email is ajay@cb.lk.']

In [15]:
sent_tokenize(line, language="french")

['I saw Mr.',
 'Ajay at the party.',
 'He was very happy.',
 'His email is ajay@cb.lk.']

In [16]:
word_tokenize(line)

['I',
 'saw',
 'Mr.',
 'Ajay',
 'at',
 'the',
 'party',
 '.',
 'He',
 'was',
 'very',
 'happy',
 '.',
 'His',
 'email',
 'is',
 'ajay',
 '@',
 'cb.lk',
 '.']

In [19]:
with open("../datasets/speeches/speech.txt", encoding="utf8") as f:
    text = f.read()

In [20]:
f.closed

True

In [21]:
words = word_tokenize(text)

In [22]:
len(words)

330273

In [23]:
len(set(words))

16030

In [24]:
freq = {}
for word in words:
    if word in freq:
        freq[word] += 1
    else:
        freq[word] = 1

In [25]:
len(freq)

16030

In [27]:
from string import punctuation

In [46]:
for punc in punctuation:
    if punc in freq:
        freq.pop(punc)

In [47]:
for stop in stopwords.words("english"):
    if stop in freq:
        freq.pop(stop)

In [65]:
arr = np.array(list(freq.items()))

In [66]:
# arr[arr[:, 1].astype(int).argsort()[300:350]]

In [70]:
arr = arr[arr[:, 1].astype(int) > 5]

In [71]:
len(arr)

2656

In [73]:
arr[:, 1] = np.arange(len(arr))

In [74]:
bag = dict(arr)

In [76]:
# bag

In [83]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

In [79]:
stemmer = PorterStemmer()

In [82]:
stemmer.stem("better")

'better'

In [84]:
lemet = WordNetLemmatizer()

In [85]:
lemet.lemmatize("better", "a")

'good'

In [86]:
lemet.lemmatize("running", "v")

'run'

In [88]:
stemmer.stem("bowling")

'bowl'

In [98]:
text_mod = text.replace("\n", ". ")

In [99]:
sents = sent_tokenize(text_mod)

In [100]:
sents[:10]

['26 8 2016, India.',
 'Niti Aayog.',
 'There was a time when development was believed to depend on the quantity of capital and labour.',
 'Today we know that it depends as much on the quality of institutions and ideas.',
 'Early last year, a new institution was created, namely, the National Institution for Transforming India or NITI.',
 'NITI was created as an evidence based think tank to guide India’s transformation.. One of NITI’s functions is:.',
 '- to mainstream external ideas into Government policies, through collaboration with national and international experts;.',
 '- to be the Government’s link to the outside world, outside experts and practitioners;.',
 '- to be the instrument through which ideas from outside are incorporated into policy-making..',
 'The Government of India and the State Governments have a long administrative tradition.']

In [91]:
len(sents)

17939

In [101]:
sent = sents[10]

In [102]:
sent

'This tradition combines indigenous and external ideas from India’s past.'

In [115]:
out = np.zeros((len(sents), len(bag)), dtype=int)

In [116]:
out.shape

(21374, 2656)

In [122]:
for index, sent in enumerate(sents):
    vec = out[index]
    for w in sent:
        if w in bag:
            vec[int(bag[w])] += 1

In [123]:
from sklearn.feature_extraction.text import CountVectorizer

In [129]:
model = CountVectorizer?

[0;31mInit signature:[0m
[0mCountVectorizer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m[[0m[0;34m"input='content'"[0m[0;34m,[0m [0;34m"encoding='utf-8'"[0m[0;34m,[0m [0;34m"decode_error='strict'"[0m[0;34m,[0m [0;34m'strip_accents=None'[0m[0;34m,[0m [0;34m'lowercase=True'[0m[0;34m,[0m [0;34m'preprocessor=None'[0m[0;34m,[0m [0;34m'tokenizer=None'[0m[0;34m,[0m [0;34m'stop_words=None'[0m[0;34m,[0m [0;34m"token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b'"[0m[0;34m,[0m [0;34m'ngram_range=(1, 1)'[0m[0;34m,[0m [0;34m"analyzer='word'"[0m[0;34m,[0m [0;34m'max_df=1.0'[0m[0;34m,[0m [0;34m'min_df=1'[0m[0;34m,[0m [0;34m'max_features=None'[0m[0;34m,[0m [0;34m'vocabulary=None'[0m[0;34m,[0m [0;34m'binary=False'[0m[0;34m,[0m [0;34m"dtype=<class 'numpy.int64'>"[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Convert a collection of text documents to a matrix of token coun

In [None]:
model = CountVectorizer

In [127]:
model.fit(sents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [128]:
model.vocabulary_

{'26': 179,
 '2016': 140,
 'india': 5792,
 'niti': 7766,
 'aayog': 333,
 'there': 11449,
 'was': 12391,
 'time': 11537,
 'when': 12479,
 'development': 3261,
 'believed': 1374,
 'to': 11559,
 'depend': 3159,
 'on': 7942,
 'the': 11431,
 'quantity': 9078,
 'of': 7906,
 'capital': 1879,
 'and': 796,
 'labour': 6514,
 'today': 11562,
 'we': 12423,
 'know': 6452,
 'that': 11427,
 'it': 6160,
 'depends': 3163,
 'as': 978,
 'much': 7511,
 'quality': 9075,
 'institutions': 5965,
 'ideas': 5611,
 'early': 3689,
 'last': 6575,
 'year': 12653,
 'new': 7727,
 'institution': 5962,
 'created': 2809,
 'namely': 7606,
 'national': 7628,
 'for': 4647,
 'transforming': 11676,
 'or': 7999,
 'an': 773,
 'evidence': 4134,
 'based': 1292,
 'think': 11464,
 'tank': 11271,
 'guide': 5148,
 'transformation': 11671,
 'one': 7945,
 'functions': 4811,
 'is': 6137,
 'mainstream': 6946,
 'external': 4290,
 'into': 6058,
 'government': 5028,
 'policies': 8550,
 'through': 11507,
 'collaboration': 2302,
 'with': 125