In [1]:
# Tokenize Text
from nltk.tokenize import word_tokenize, sent_tokenize

string = "The science of today is the technology of tomorrow. \
    Tomorrow is today."

# Tokenize words
tokenized_words = word_tokenize(string)
print(tokenized_words)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow', '.', 'Tomorrow', 'is', 'today', '.']


In [11]:
# Tokenize sentences
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

In [3]:
# Remove Stop Words
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suzuka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Load stop words
stop_words = stopwords.words('english')

# Show stop words
stops = stop_words[:]
print(len(stops))

179


In [9]:
# Remove stop words
nonstop_words = []
for word in tokenized_words:
    if word not in stop_words:
        nonstop_words.append(word)
print(nonstop_words)       

['The', 'science', 'today', 'technology', 'tomorrow', '.', 'Tomorrow', 'today', '.']


In [10]:
# Stemming
from nltk.stem.porter import PorterStemmer

# Create stemmer
porter = PorterStemmer()

# Apply stemmer
for word in tokenized_words:
    print(porter.stem(word))
    
#[porter.stem(word) for word in tokenized_words]

the
scienc
of
today
is
the
technolog
of
tomorrow
.
tomorrow
is
today
.


In [12]:
# Tag Parts Of Speech
from nltk import pos_tag
from nltk import word_tokenize

# Use pre-trained part of speech tagger
text_tagged = pos_tag(word_tokenize(string))
print(text_tagged)

[('The', 'DT'), ('science', 'NN'), ('of', 'IN'), ('today', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('technology', 'NN'), ('of', 'IN'), ('tomorrow', 'NN'), ('.', '.'), ('Tomorrow', 'NN'), ('is', 'VBZ'), ('today', 'NN'), ('.', '.')]


In [18]:
# Bag of Words model
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create text
text_data = np.array(['I love San Francisco!',
                      'New York is best',
                      'LA beats both'])

In [14]:
# Create the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

# Show feature matrix
bag_of_words.toarray()

array([[0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 0, 1, 0, 0, 0, 0]], dtype=int64)

In [15]:
# Get feature names
feature_names = count.get_feature_names()
print(feature_names)

['beats', 'best', 'both', 'francisco', 'is', 'la', 'love', 'new', 'san', 'york']


In [16]:
# Create data frame
df = pd.DataFrame(bag_of_words.toarray(), columns=feature_names)
print(df)

   beats  best  both  francisco  is  la  love  new  san  york
0      0     0     0          1   0   0     1    0    1     0
1      0     1     0          0   1   0     0    1    0     1
2      1     0     1          0   0   1     0    0    0     0


In [17]:
# Term Frequency – Inverse Document Frequency (TF – IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

obj = TfidfVectorizer()
corpus = ['This is sample document.', 'another random document.', 'third sample document text']

X = obj.fit_transform(corpus)
print(X)

  (0, 7)	0.5844829010200651
  (0, 2)	0.5844829010200651
  (0, 4)	0.444514311537431
  (0, 1)	0.34520501686496574
  (1, 1)	0.3853716274664007
  (1, 0)	0.652490884512534
  (1, 3)	0.652490884512534
  (2, 4)	0.444514311537431
  (2, 1)	0.34520501686496574
  (2, 6)	0.5844829010200651
  (2, 5)	0.5844829010200651


In [53]:
from gensim.models import Word2Vec

sentences = [['data', 'science'], 
             ['science', 'data', 'analytics'], 
             ['machine', 'learning'], 
             ['Woodbury', 'computer', 'science'],
             ['deep', 'learning']]

# train the model on your corpus  
model = Word2Vec(sentences, min_count = 1)

print(model.wv.similarity('data', 'science'))

-0.16685711


In [58]:
print(model.wv.similarity('Woodbury', 'science'))

0.041243795


In [59]:
print(model['Woodbury'])

[-2.8138672e-04 -3.8610934e-03  2.6190863e-03  4.8636068e-03
  2.2238060e-03  4.3003969e-03 -3.8194559e-03 -2.8702852e-03
 -3.7463487e-03 -2.2866086e-03  1.5371896e-03  2.7930783e-03
  1.2456301e-03 -2.4211605e-03  4.4885064e-03 -4.3097250e-03
  1.4070435e-03  2.6501480e-03  1.5569822e-03 -2.5650887e-03
 -3.2691783e-03  2.5493090e-04 -4.7438140e-03  1.0247465e-03
  8.3169230e-04 -3.2375345e-04 -4.2614643e-03  1.8137189e-05
  3.2401725e-03  9.1164029e-04 -1.6952838e-03  1.0598367e-03
 -6.7888475e-05  3.2858809e-03  1.5157023e-03  1.7273537e-03
 -4.4889948e-03 -2.7818333e-03  4.6497090e-03  1.7127949e-03
 -1.0339462e-03 -3.5879831e-03 -1.5833150e-03  4.5878594e-03
  2.2639406e-03 -3.2179763e-03  4.0261407e-04  4.9690693e-03
  4.7700628e-04  2.0386970e-03 -3.9804252e-04 -1.8492838e-03
  4.0697795e-03  2.5932947e-03 -2.4097192e-03  4.9152658e-03
  1.5523387e-03 -3.0126399e-03  3.0247825e-03 -8.9566811e-04
 -3.0556088e-03  4.2012050e-03  3.7428113e-03 -6.3232391e-04
 -3.7477021e-03 -7.01026