In [1]:
# Sample sentence that the tokenization cell below takes as its input.
sample_text = "Hola! My name is Gaurav. I am very pleased to meet you :)"

## Tokenize & Stop-words

In [7]:
# load libs
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK data packages this notebook relies on: 'punkt' for
# word_tokenize and 'stopwords' for nltk.corpus.stopwords.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource when
# tokenizing — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
# Size of NLTK's English stop-word list, together with its first ten entries.
len(stopwords.words('english')), stopwords.words('english')[:10]

(179,
 ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're"])

In [8]:
# Split the sample sentence into tokens; punctuation marks come out as
# separate tokens alongside the words.
word_tokens = word_tokenize(sample_text)
word_tokens

['Hola',
 '!',
 'My',
 'name',
 'is',
 'Gaurav',
 '.',
 'I',
 'am',
 'very',
 'pleased',
 'to',
 'meet',
 'you',
 ':',
 ')']

In [9]:
# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the comprehension condition rebuilt the whole list and scanned it linearly
# for every token; a set is created a single time and tests membership in O(1).
# NOTE: matching is case-sensitive and NLTK's entries are lowercase, so
# capitalised tokens such as 'My' and 'I' are kept. Compare word.lower()
# instead if those should be removed as well.
english_stop_words = set(stopwords.words('english'))
removed_stop_words = [
    word for word in word_tokens if word not in english_stop_words
]

In [10]:
removed_stop_words

['Hola', '!', 'My', 'name', 'Gaurav', '.', 'I', 'pleased', 'meet', ':', ')']

## Stemming

In [20]:
from nltk.stem.porter import PorterStemmer

In [22]:
# One Porter stemmer instance, reused for every token.
porter = PorterStemmer()
# Stem the tokens in their original order; the resulting list is the cell output.
list(map(porter.stem, word_tokens))

['hola',
 '!',
 'My',
 'name',
 'is',
 'gaurav',
 '.',
 'I',
 'am',
 'veri',
 'pleas',
 'to',
 'meet',
 'you',
 ':',
 ')']

## Bag-of-words

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# Learn the vocabulary and count term occurrences in a single step.
# NOTE(review): word_tokens is a list of individual tokens, so every token is
# vectorized as its own document (one matrix row per token). Pass
# [sample_text] instead if the whole sentence should form one document —
# confirm the intent.
c_vectorizer = CountVectorizer()
bag_of_words = c_vectorizer.fit_transform(word_tokens)

In [25]:
bag_of_words

<16x11 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [26]:
# Dense view of the counts: one row per input token, one column per
# vocabulary term.
bag_of_words.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [41]:
# List the stored (row, column) -> count entries of the bag-of-words matrix.
# bag_of_words is already a SciPy sparse matrix, so it is handed to csr_matrix
# directly instead of being expanded to a dense array and compressed again.
from scipy import sparse
print(sparse.csr_matrix(bag_of_words))

  (0, 2)	1
  (2, 5)	1
  (3, 6)	1
  (4, 3)	1
  (5, 1)	1
  (8, 0)	1
  (9, 9)	1
  (10, 7)	1
  (11, 8)	1
  (12, 4)	1
  (13, 10)	1


In [29]:
# Vocabulary terms in the column order of the bag-of-words matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() (scikit-learn >= 1.0) replaces it and returns a
# NumPy array, so .tolist() keeps the plain-list display.
c_vectorizer.get_feature_names_out().tolist()

['am',
 'gaurav',
 'hola',
 'is',
 'meet',
 'my',
 'name',
 'pleased',
 'to',
 'very',
 'you']

## Word Importance

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# Fit a TF-IDF vectorizer and transform the input in one step.
# NOTE(review): as word_tokens holds individual tokens, each token is weighted
# as its own document (one matrix row per token). Pass [sample_text] instead
# if the whole sentence should form one document — confirm the intent.
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(word_tokens)
feature_matrix

<16x11 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [32]:
feature_matrix.toarray()

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [34]:
# Mapping from each learned term to its column index in feature_matrix.
tfidf.vocabulary_

{'am': 0,
 'gaurav': 1,
 'hola': 2,
 'is': 3,
 'meet': 4,
 'my': 5,
 'name': 6,
 'pleased': 7,
 'to': 8,
 'very': 9,
 'you': 10}

*By Gaurav Kabra*