#### Related links

* https://www.kaggle.com/currie32/project-gutenbergs-top-20-books
* https://www.kaggle.com/PromptCloudHQ/imdb-data
* https://www.yelp.com/dataset
* http://vectors.nlpl.eu/repository/20/40.zip

# Putting documents into a bag of words

* A bag of words is the simplest way of representing text. We treat our text as a collection of documents, where documents are anything from sentences to book chapters to whole books. 


* Since we usually compare different documents to each other or use them in a larger context of other documents, typically, we work with a collection of documents, not just a single document.

In [8]:
import sys
sys.path.append('..')

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from Chapter01.dividing_into_sentences import read_text_file,\
preprocess_text, divide_into_sentences_nltk

In [12]:
#. Read in the text file, preprocess the text, 
# and divide it into sentences:

def get_sentences(filename):
    sherlock_holmes_text = read_text_file(filename)
    sherlock_holmes_text = preprocess_text(sherlock_holmes_text)
    senteces = divide_into_sentences_nltk(sherlock_holmes_text)
    return senteces


In [14]:
# Create a function that will return the vectorizer and final matrix

def create_vectorizer(sentences):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    return vectorizer, X

In [16]:
sentences = get_sentences('../Chapter01/sherlock_holmes_1.txt')
vectorizer, X = create_vectorizer(sentences)

In [23]:
# he resulting matrix is a scipy.sparse.csr.csr_matrix object, and the beginning of its printout looks like this:
print(X)

# first number document number, word number

  (0, 114)	1
  (0, 99)	1
  (0, 47)	1
  (0, 98)	1
  (0, 54)	1
  (0, 10)	1
  (0, 0)	1
  (0, 124)	1
  (1, 39)	1
  (1, 95)	1
  (1, 41)	1
  (1, 44)	1
  (1, 64)	1
  (1, 42)	1
  (1, 116)	1
  (1, 12)	1
  (1, 79)	1
  (1, 70)	1
  (2, 98)	1
  (2, 42)	1
  (2, 48)	1
  (2, 46)	1
  (2, 29)	1
  (2, 25)	1
  (2, 11)	1
  :	:
  (9, 57)	1
  (9, 15)	1
  (9, 67)	1
  (9, 21)	1
  (9, 107)	1
  (9, 103)	1
  (9, 71)	1
  (10, 114)	1
  (10, 124)	2
  (10, 44)	1
  (10, 11)	3
  (10, 109)	1
  (10, 76)	1
  (10, 119)	2
  (10, 108)	1
  (10, 53)	1
  (10, 4)	1
  (10, 77)	1
  (10, 16)	1
  (10, 127)	1
  (10, 110)	1
  (10, 56)	1
  (10, 24)	1
  (10, 89)	1
  (10, 61)	1


In [25]:
# It can also be turned into a numpy.matrixlib.defmatrix.matrix object, where each sentence is a vector. These sentence vectors can be used our machine learning algorithms later:
denseX = X.todense()
print(denseX)

[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]


In [26]:
print(len(sentences))
print(denseX.shape)

11
(11, 128)


In [28]:
print(len(vectorizer.get_feature_names()))
print(vectorizer.get_feature_names())

128
['_the_', 'abhorrent', 'actions', 'adjusted', 'adler', 'admirable', 'admirably', 'admit', 'akin', 'all', 'always', 'and', 'any', 'as', 'balanced', 'be', 'but', 'cold', 'crack', 'delicate', 'distracting', 'disturbing', 'doubt', 'drawing', 'dubious', 'eclipses', 'emotion', 'emotions', 'excellent', 'eyes', 'factor', 'false', 'felt', 'finely', 'for', 'from', 'gibe', 'grit', 'has', 'have', 'he', 'heard', 'her', 'high', 'him', 'himself', 'his', 'holmes', 'in', 'instrument', 'into', 'introduce', 'intrusions', 'irene', 'is', 'it', 'late', 'lenses', 'love', 'lover', 'machine', 'memory', 'men', 'mental', 'mention', 'might', 'mind', 'more', 'most', 'motives', 'name', 'nature', 'never', 'not', 'observer', 'observing', 'of', 'one', 'or', 'other', 'own', 'particularly', 'passions', 'perfect', 'placed', 'position', 'power', 'precise', 'predominates', 'questionable', 'reasoner', 'reasoning', 'results', 'save', 'seen', 'seldom', 'sensitive', 'sex', 'she', 'sherlock', 'sneer', 'softer', 'spoke', 'st

In [31]:
new_sentence = "I had seen little of Holmes lately."
new_sentece_vector = vectorizer.transform([new_sentence])

In [32]:
print(new_sentece_vector)
print(new_sentece_vector.todense())

  (0, 47)	1
  (0, 76)	1
  (0, 94)	1
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [41]:
vectorizer.get_feature_names()[75]

'observing'

In [42]:
vectorizer.vocabulary_

{'to': 114,
 'sherlock': 99,
 'holmes': 47,
 'she': 98,
 'is': 54,
 'always': 10,
 '_the_': 0,
 'woman': 124,
 'have': 39,
 'seldom': 95,
 'heard': 41,
 'him': 44,
 'mention': 64,
 'her': 42,
 'under': 116,
 'any': 12,
 'other': 79,
 'name': 70,
 'in': 48,
 'his': 46,
 'eyes': 29,
 'eclipses': 25,
 'and': 11,
 'predominates': 88,
 'the': 109,
 'whole': 122,
 'of': 76,
 'sex': 97,
 'it': 55,
 'was': 119,
 'not': 73,
 'that': 108,
 'he': 40,
 'felt': 32,
 'emotion': 26,
 'akin': 8,
 'love': 58,
 'for': 34,
 'irene': 53,
 'adler': 4,
 'all': 9,
 'emotions': 27,
 'one': 77,
 'particularly': 81,
 'were': 120,
 'abhorrent': 1,
 'cold': 17,
 'precise': 87,
 'but': 16,
 'admirably': 6,
 'balanced': 14,
 'mind': 66,
 'take': 105,
 'most': 68,
 'perfect': 83,
 'reasoning': 91,
 'observing': 75,
 'machine': 60,
 'world': 125,
 'has': 38,
 'seen': 94,
 'as': 13,
 'lover': 59,
 'would': 126,
 'placed': 84,
 'himself': 45,
 'false': 31,
 'position': 85,
 'never': 72,
 'spoke': 102,
 'softer': 101,
 

In [43]:
## Vectorizer with stopwords removal

vectorizer = CountVectorizer(stop_words='english')

In [44]:
X = vectorizer.fit_transform(sentences)

In [46]:
print(len(vectorizer.get_feature_names()))
print(vectorizer.get_feature_names())

79
['_the_', 'abhorrent', 'actions', 'adjusted', 'adler', 'admirable', 'admirably', 'admit', 'akin', 'balanced', 'cold', 'crack', 'delicate', 'distracting', 'disturbing', 'doubt', 'drawing', 'dubious', 'eclipses', 'emotion', 'emotions', 'excellent', 'eyes', 'factor', 'false', 'felt', 'finely', 'gibe', 'grit', 'heard', 'high', 'holmes', 'instrument', 'introduce', 'intrusions', 'irene', 'late', 'lenses', 'love', 'lover', 'machine', 'memory', 'men', 'mental', 'mention', 'mind', 'motives', 'nature', 'observer', 'observing', 'particularly', 'passions', 'perfect', 'placed', 'position', 'power', 'precise', 'predominates', 'questionable', 'reasoner', 'reasoning', 'results', 'save', 'seen', 'seldom', 'sensitive', 'sex', 'sherlock', 'sneer', 'softer', 'spoke', 'strong', 'temperament', 'things', 'throw', 'trained', 'veil', 'woman', 'world']


In [47]:
new_sentence = "And yet there was but one woman to him, and that woman was the late Irene Adler, of dubious and questionable memory."

In [49]:
new_sentence_vector = vectorizer.transform([new_sentence])
analyse = vectorizer.build_analyzer()
print(analyse(new_sentence))

['woman', 'woman', 'late', 'irene', 'adler', 'dubious', 'questionable', 'memory']


In [51]:
print(new_sentence_vector)

  (0, 4)	1
  (0, 17)	1
  (0, 35)	1
  (0, 36)	1
  (0, 41)	1
  (0, 58)	1
  (0, 77)	2


In [53]:
# if any word is present more than 80%  on the document we will ignore
vectorizer = CountVectorizer(max_df=0.8)

In [55]:
new_sentence_vector = vectorizer.fit_transform(sentences)

In [56]:
len(vectorizer.get_feature_names())

128

# Constructing the N-gram model

* Representing a document as a bag of words is useful, but semantics is about more than just words in isolation. 


* To capture word combinations, an n-gram model is useful. Its vocabulary consists not just of words, but word sequences, or n-grams.

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
from Chapter01.dividing_into_sentences import read_text_file, preprocess_text, divide_into_sentences_nltk
from Chapter03.bag_of_words_1 import get_sentences, get_new_sentences_vector

In [61]:
sentences = get_sentences('../Chapter01/sherlock_holmes_1.txt')

In [64]:
bigram_vectorizer = CountVectorizer(ngram_range = (1, 2))

In [67]:
X = bigram_vectorizer.fit_transform(sentences)
print(X)

  (0, 269)	1
  (0, 229)	1
  (0, 118)	1
  (0, 226)	1
  (0, 136)	1
  (0, 20)	1
  (0, 0)	1
  (0, 299)	1
  (0, 275)	1
  (0, 230)	1
  (0, 119)	1
  (0, 228)	1
  (0, 137)	1
  (0, 21)	1
  (0, 1)	1
  (1, 93)	1
  (1, 221)	1
  (1, 101)	1
  (1, 108)	1
  (1, 156)	1
  (1, 103)	1
  (1, 278)	1
  (1, 31)	1
  (1, 190)	1
  (1, 167)	1
  :	:
  (10, 307)	1
  (10, 261)	1
  (10, 141)	1
  (10, 60)	1
  (10, 210)	1
  (10, 151)	1
  (10, 30)	1
  (10, 308)	1
  (10, 262)	1
  (10, 285)	1
  (10, 45)	1
  (10, 187)	1
  (10, 300)	1
  (10, 271)	1
  (10, 109)	1
  (10, 251)	1
  (10, 301)	1
  (10, 288)	1
  (10, 253)	1
  (10, 142)	1
  (10, 8)	1
  (10, 180)	1
  (10, 61)	1
  (10, 27)	1
  (10, 211)	1


In [68]:
denseX = X.todense()
print(denseX)

[[1 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 1]]


In [69]:
print(bigram_vectorizer.get_feature_names())

['_the_', '_the_ woman', 'abhorrent', 'abhorrent to', 'actions', 'adjusted', 'adjusted temperament', 'adler', 'adler of', 'admirable', 'admirable things', 'admirably', 'admirably balanced', 'admit', 'admit such', 'akin', 'akin to', 'all', 'all emotions', 'all his', 'always', 'always _the_', 'and', 'and actions', 'and finely', 'and observing', 'and predominates', 'and questionable', 'and sneer', 'and that', 'and yet', 'any', 'any emotion', 'any other', 'as', 'as his', 'as lover', 'balanced', 'balanced mind', 'be', 'be more', 'but', 'but admirably', 'but as', 'but for', 'but one', 'cold', 'cold precise', 'crack', 'crack in', 'delicate', 'delicate and', 'distracting', 'distracting factor', 'disturbing', 'disturbing than', 'doubt', 'doubt upon', 'drawing', 'drawing the', 'dubious', 'dubious and', 'eclipses', 'eclipses and', 'emotion', 'emotion akin', 'emotion in', 'emotions', 'emotions and', 'excellent', 'excellent for', 'eyes', 'eyes she', 'factor', 'factor which', 'false', 'false positio

In [70]:
print(len(bigram_vectorizer.get_feature_names()))

309


In [71]:
new_sentence = "I had seen little of Holmes lately."
new_sentece_vector = bigram_vectorizer.transform([new_sentence])

In [72]:
print(new_sentence_vector)
print(new_sentece_vector.todense())

  (0, 114)	1
  (0, 99)	1
  (0, 47)	1
  (0, 98)	1
  (0, 54)	1
  (0, 10)	1
  (0, 0)	1
  (0, 124)	1
  (1, 39)	1
  (1, 95)	1
  (1, 41)	1
  (1, 44)	1
  (1, 64)	1
  (1, 42)	1
  (1, 116)	1
  (1, 12)	1
  (1, 79)	1
  (1, 70)	1
  (2, 98)	1
  (2, 42)	1
  (2, 48)	1
  (2, 46)	1
  (2, 29)	1
  (2, 25)	1
  (2, 11)	1
  :	:
  (9, 57)	1
  (9, 15)	1
  (9, 67)	1
  (9, 21)	1
  (9, 107)	1
  (9, 103)	1
  (9, 71)	1
  (10, 114)	1
  (10, 124)	2
  (10, 44)	1
  (10, 11)	3
  (10, 109)	1
  (10, 76)	1
  (10, 119)	2
  (10, 108)	1
  (10, 53)	1
  (10, 4)	1
  (10, 77)	1
  (10, 16)	1
  (10, 127)	1
  (10, 110)	1
  (10, 56)	1
  (10, 24)	1
  (10, 89)	1
  (10, 61)	1
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [73]:
new_sentence1 = " And yet there was but one woman to him, and that woman was the late Irene Adler, of dubious and questionable memory."
new_sentence_vector1 = vectorizer.transform([new_sentence])

In [74]:
print(new_sentence_vector1)
print(new_sentence_vector1.todense())

  (0, 47)	1
  (0, 76)	1
  (0, 94)	1
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


# Representing texts with TF-IDF

* TF-IDF stands for term frequency-inverse document frequency and gives more weight to words that are unique to a document than to words that are frequent,but repeated throughout most documents. 


* This allows us to give more weight to words uniquely characteristic to particular documents. 


* https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting.


* The TfidfVectorizer class allows for all the functionality of CountVectorizer, except that it uses the TF-IDF algorithm to count the words instead of direct counts. The other features of the class should be familiar. 

In [76]:
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer

In [77]:
stemmer = SnowballStemmer('english')

In [78]:
sentences = get_sentences('../Chapter01/sherlock_holmes_1.txt')

In [80]:
from nltk.corpus import stopwords

In [86]:
stop_words = stopwords.words('english')

In [89]:
def tokenize_and_stem(sentences):
    tokens = nltk.word_tokenize(sentences)
    filtered_token = [t for t in tokens if t not in string.punctuation]
    stems = [stemmer.stem(t) for t in filtered_token]
    return stems

In [90]:
all_stopwords = stop_words + [tokenize_and_stem(stop_word)[0] for stop_word in stop_words]

In [95]:
tfidf_vectorizer = TfidfVectorizer(max_df = 0.9, max_features = 200000,
                                  min_df = 0.05, stop_words=all_stopwords,
                                  use_idf=True, tokenizer=tokenize_and_stem,
                                  ngram_range=(1, 3))

In [96]:
tfidf_vectorizer = tfidf_vectorizer.fit(sentences)



In [97]:
tf_idf_matrix = tfidf_vectorizer.transform(sentences)
print(tf_idf_matrix)

  (0, 221)	0.24956605786128022
  (0, 187)	0.2919708551400885
  (0, 186)	0.2919708551400885
  (0, 185)	0.2919708551400885
  (0, 88)	0.2919708551400885
  (0, 87)	0.2919708551400885
  (0, 86)	0.2919708551400885
  (0, 25)	0.2919708551400885
  (0, 24)	0.2919708551400885
  (0, 23)	0.2919708551400885
  (0, 1)	0.2919708551400885
  (0, 0)	0.2919708551400885
  (1, 180)	0.3333333333333333
  (1, 179)	0.3333333333333333
  (1, 178)	0.3333333333333333
  (1, 127)	0.3333333333333333
  (1, 123)	0.3333333333333333
  (1, 122)	0.3333333333333333
  (1, 82)	0.3333333333333333
  (1, 81)	0.3333333333333333
  (1, 80)	0.3333333333333333
  (2, 220)	0.28867513459481287
  (2, 219)	0.28867513459481287
  (2, 184)	0.28867513459481287
  (2, 163)	0.28867513459481287
  :	:
  (10, 230)	0.19360286426407214
  (10, 229)	0.19360286426407214
  (10, 225)	0.19360286426407214
  (10, 224)	0.19360286426407214
  (10, 223)	0.19360286426407214
  (10, 222)	0.19360286426407214
  (10, 221)	0.33096936063604365
  (10, 165)	0.19360286426407

In [99]:
dense_matrix = tf_idf_matrix.todense()
dense_matrix

matrix([[0.29197086, 0.29197086, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [100]:
print(tfidf_vectorizer.get_feature_names())

['_the_', '_the_ woman', 'abhorr', 'abhorr cold', 'abhorr cold precis', 'action', 'adjust', 'adjust tempera', 'adjust tempera introduc', 'adler', 'adler dubious', 'adler dubious question', 'admir', 'admir balanc', 'admir balanc mind', 'admir thing', 'admir thing observer—excel', 'admit', 'admit intrus', 'admit intrus delic', 'akin', 'akin love', 'akin love iren', 'alway', 'alway _the_', 'alway _the_ woman', 'balanc', 'balanc mind', 'cold', 'cold precis', 'cold precis admir', 'crack', 'crack one', 'crack one high-pow', 'delic', 'delic fine', 'delic fine adjust', 'distract', 'distract factor', 'distract factor throw', 'disturb', 'disturb strong', 'disturb strong emot', 'doubt', 'doubt upon', 'doubt upon mental', 'draw', 'draw veil', 'draw veil men', 'dubious', 'dubious question', 'dubious question memori', 'eclips', 'eclips predomin', 'eclips predomin whole', 'emot', 'emot akin', 'emot akin love', 'emot natur', 'emot one', 'emot one particular', 'eye', 'eye eclips', 'eye eclips predomin'

In [101]:
analyze = tfidf_vectorizer.build_analyzer()
print(analyze("To Sherlock Holmes she is always _the_ woman."))

['sherlock', 'holm', 'alway', '_the_', 'woman', 'sherlock holm', 'holm alway', 'alway _the_', '_the_ woman', 'sherlock holm alway', 'holm alway _the_', 'alway _the_ woman']


### Char ngram

In [102]:
sentences = get_sentences("../Chapter01/sherlock_holmes_1.txt")

In [104]:
tfidf_char_vectorizer = TfidfVectorizer(analyzer = 'char_wb',
                                       max_df = 0.9, max_features=200000,
                                       min_df=0.05,use_idf=True,
                                       ngram_range=(1, 3))

tfidf_char_vectorizer = tfidf_char_vectorizer.fit(sentences)

In [106]:
tfidf_matrix = tfidf_char_vectorizer.transform(sentences)
print(tfidf_matrix)

  (0, 763)	0.12662434631923655
  (0, 762)	0.12662434631923655
  (0, 753)	0.05840470946313
  (0, 745)	0.10823388151187574
  (0, 744)	0.0850646359499111
  (0, 733)	0.12662434631923655
  (0, 731)	0.07679517427049085
  (0, 684)	0.07679517427049085
  (0, 683)	0.07679517427049085
  (0, 675)	0.05840470946313
  (0, 639)	0.21646776302375148
  (0, 638)	0.21646776302375148
  (0, 623)	0.16087778612557863
  (0, 602)	0.12662434631923655
  (0, 600)	0.09518563907785169
  (0, 521)	0.10823388151187574
  (0, 519)	0.07679517427049085
  (0, 518)	0.12662434631923655
  (0, 515)	0.09518563907785169
  (0, 507)	0.12662434631923655
  (0, 506)	0.12662434631923655
  (0, 503)	0.07679517427049085
  (0, 460)	0.10823388151187574
  (0, 459)	0.10823388151187574
  (0, 442)	0.12662434631923655
  :	:
  (10, 102)	0.10440492924682453
  (10, 101)	0.08607445743804548
  (10, 100)	0.08607445743804548
  (10, 99)	0.11564746590034126
  (10, 95)	0.11564746590034126
  (10, 88)	0.05220246462341226
  (10, 87)	0.10935865461197974
  (10,

In [108]:
dense_matrix = tfidf_matrix.todense()
print(dense_matrix)

[[0.12662435 0.12662435 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.07119069 ... 0.         0.         0.        ]
 [0.         0.         0.17252729 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [109]:
print(tfidf_char_vectorizer.get_feature_names())

[' _', ' _t', ' a ', ' ab', ' ac', ' ad', ' ak', ' al', ' an', ' as', ' b', ' ba', ' be', ' bu', ' c', ' co', ' cr', ' d', ' de', ' di', ' do', ' dr', ' du', ' e', ' ec', ' em', ' ey', ' f', ' fa', ' fe', ' fi', ' fo', ' fr', ' g', ' gi', ' gr', ' ha', ' he', ' hi', ' ho', ' i', ' i ', ' in', ' ir', ' is', ' it', ' l', ' la', ' le', ' lo', ' m', ' ma', ' me', ' mi', ' mo', ' n', ' na', ' ne', ' no', ' o', ' ob', ' of', ' on', ' or', ' ot', ' ow', ' p', ' pa', ' pe', ' pl', ' po', ' pr', ' q', ' qu', ' r', ' re', ' s', ' sa', ' se', ' sh', ' sn', ' so', ' sp', ' st', ' su', ' ta', ' te', ' th', ' to', ' tr', ' u', ' un', ' up', ' v', ' ve', ' wa', ' we', ' wh', ' wi', ' wo', ' y', ' ye', ',', ', ', '-', '-p', '-po', '_', '_ ', '_t', '_th', 'a ', 'ab', 'abh', 'abl', 'ac', 'ace', 'ach', 'ack', 'act', 'ad', 'adj', 'adl', 'adm', 'ai', 'ain', 'ak', 'ake', 'aki', 'al', 'al ', 'ala', 'all', 'als', 'alw', 'am', 'ame', 'an ', 'an.', 'anc', 'and', 'any', 'ar', 'ard', 'arl', 'art', 'as', 'as ', 'a

In [110]:
analyze = tfidf_char_vectorizer.build_analyzer()
print(analyze("To Sherlock Holmes she is always _the_ woman."))

[' ', 't', 'o', ' ', ' t', 'to', 'o ', ' to', 'to ', ' ', 's', 'h', 'e', 'r', 'l', 'o', 'c', 'k', ' ', ' s', 'sh', 'he', 'er', 'rl', 'lo', 'oc', 'ck', 'k ', ' sh', 'she', 'her', 'erl', 'rlo', 'loc', 'ock', 'ck ', ' ', 'h', 'o', 'l', 'm', 'e', 's', ' ', ' h', 'ho', 'ol', 'lm', 'me', 'es', 's ', ' ho', 'hol', 'olm', 'lme', 'mes', 'es ', ' ', 's', 'h', 'e', ' ', ' s', 'sh', 'he', 'e ', ' sh', 'she', 'he ', ' ', 'i', 's', ' ', ' i', 'is', 's ', ' is', 'is ', ' ', 'a', 'l', 'w', 'a', 'y', 's', ' ', ' a', 'al', 'lw', 'wa', 'ay', 'ys', 's ', ' al', 'alw', 'lwa', 'way', 'ays', 'ys ', ' ', '_', 't', 'h', 'e', '_', ' ', ' _', '_t', 'th', 'he', 'e_', '_ ', ' _t', '_th', 'the', 'he_', 'e_ ', ' ', 'w', 'o', 'm', 'a', 'n', '.', ' ', ' w', 'wo', 'om', 'ma', 'an', 'n.', '. ', ' wo', 'wom', 'oma', 'man', 'an.', 'n. ']


# Using word embeddings

* These are powerful because they are a result of training a neural network that predicts a word from all other words in the sentence. 


* The resulting vector embeddings are similar for words that occur in similar contexts. We will use the embeddings to show these similarities.

In [4]:
from gensim.models import KeyedVectors
import numpy as np

In [9]:
w2vec_model_path = "40/model.bin"

In [26]:
model = KeyedVectors.load_word2vec_format(w2vec_model_path, binary=True)

In [27]:
print(model['holmes'])

[-0.309647 -0.127936 -0.136244 -0.252969  0.410695  0.206325  0.119236
 -0.244745 -0.436801  0.058889  0.237439  0.247656  0.072103  0.044183
 -0.424878  0.367344  0.153287  0.343856  0.232269 -0.181432 -0.050021
  0.225756  0.71465  -0.564166 -0.168468 -0.153668  0.300445 -0.220122
 -0.021261  0.25779  -0.581744  0.320341 -0.236189  0.224906  0.029358
 -0.295143  0.483847 -0.05832   0.010784  0.050842 -0.034141  0.420114
  0.126926 -0.405974 -0.421415  0.006092 -0.137557  0.038477  0.100005
  0.151401  0.287163 -0.433263 -0.249083 -0.057834  0.367427 -0.181977
  0.31608   0.063203 -0.486009 -0.127354 -0.283149  0.028113 -0.150146
 -0.38704   0.033237  0.146932  0.470853 -0.151154  0.064424  0.146739
 -0.164267 -0.094909  0.443384 -0.055244  0.117268 -0.221496 -0.185951
  0.056249 -0.176986 -0.449508  0.345431 -0.096014 -0.19798   0.117698
 -0.162563 -0.181655 -0.18644  -0.158727  0.595464  0.161437 -0.382661
  0.148537  0.173535  0.370556 -0.346765  0.055452  0.024405 -0.002895
  0.08

In [28]:
print(model.most_similar(['holmes'], topn=15))

[('sherlock', 0.8416914939880371), ('parker', 0.8099909424781799), ('moriarty', 0.8039607405662537), ('sawyer', 0.8002702593803406), ('moore', 0.7932805418968201), ('wolfe', 0.7923581600189209), ('hale', 0.7910093069076538), ('doyle', 0.7906038761138916), ('holmes.the', 0.7895271182060242), ('watson', 0.7887691259384155), ('yates', 0.7882786393165588), ('stevenson', 0.7879441380500793), ('spencer', 0.7877693176269531), ('goodwin', 0.7866846323013306), ('baxter', 0.7864187359809875)]


In [29]:
sentence = "It was not that he felt any emotion akin to love for Irene Adler."

In [34]:
def get_word_vectors(sentecne, model):
    word_vectors = []
    for word in sentence:
        try:
            word_vector = model.get_vector(word.lower())
            word_vectors.append(word_vector)
        except KeyError:
            continue
    return word_vectors

In [35]:
def get_sentence_vector(word_vectors):
    matrix = np.array(word_vectors)
    centroid = np.mean(matrix[:,:],axis=0)
    return centroid

In [40]:
word_vectors = get_word_vectors(sentence, model)
sentence_vector = get_sentence_vector(word_vectors)
print(sentence_vector)

[ 0.09226871  0.14478634  0.23788658 -0.31754282  0.42911175 -0.05198449
  0.12572111  0.01170996 -0.01138579  0.05200932  0.15247145  0.34026343
  0.12961692  0.05010585 -0.09165132  0.3782767   0.08390289  0.30078036
 -0.24396846  0.42507184 -0.13556597  0.157348    0.19739327 -0.13114193
 -0.16301586  0.19061208 -0.17776786  0.00779739  0.22080304  0.00757292
  0.08214489  0.05292403 -0.26995075  0.00906517  0.18542539 -0.20518285
 -0.09054315  0.02091755  0.15495133 -0.03320368 -0.03254781  0.35649517
 -0.14889626  0.07488623  0.13680871  0.4443542  -0.14066774  0.10251798
 -0.18436027  0.11045676 -0.17975916 -0.02136871 -0.11026109 -0.18642433
 -0.05931851  0.01703786  0.3544097   0.17131186 -0.31452173 -0.12231107
 -0.08258836  0.15248556  0.12112819 -0.32618955  0.01297824 -0.04008434
  0.35412577 -0.13917081 -0.19634432 -0.03216437  0.30779663 -0.00925971
  0.2535734  -0.14927842 -0.2347377  -0.32309702 -0.41007644  0.42555934
 -0.05917206  0.07272248  0.05830745  0.23424557  0

In [41]:
words = ['banana', 'apple', 'computer', 'strawberry']

In [42]:
print(model.doesnt_match(words))

computer


In [43]:
word = 'cup'
words = ['glass', 'computer', 'pencil', 'watch']
print(model.most_similar_to_given(word, words))

glass


## Training your own embeddings model

In [66]:
import sys
sys.path.append('..')

In [67]:
import gensim
import pickle
from os import listdir
from os.path import isfile, join
from Chapter03.bag_of_words_1 import get_sentences
from Chapter01.tokenization import tokenize_nltk

In [68]:
word2vec_model_path = "word2vec.model"
books_dir = "1025_1853_bundle_archive"

In [87]:
def get_all_book_sentences(directory):
    text_files = [join(directory, f) for f in listdir(directory) if \
                 isfile(join(directory,f)) and '.txt' in f]
    all_sentences = []
    for text_file in text_files:
        sentences = get_sentences(text_file)
        all_sentences = all_sentences + sentences
    return all_sentences

In [125]:
def train_word2vec(words, word2vec_model_path):
    model = gensim.models.Word2Vec(words, window=5, 
                                   vector_size=200)
    model.train(words, total_examples=len(words), epochs=200)
    pickle.dump(model, open(word2vec_model_path, 'wb'))
    return model
    

In [115]:
sentences = get_all_book_sentences(books_dir)

In [116]:
from striprtf.striprtf import rtf_to_text

In [117]:
sentences = [rtf_to_text(s).strip() for s in sentences]

In [118]:
len(sentences)

89975

In [119]:
sentences[0]

"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan DoyleThis eBook is for the use of anyone anywhere at no cost and withalmost no restrictions whatsoever."

In [120]:
sentences = [tokenize_nltk(s.lower()) for s in sentences]

In [121]:
sentences[0]

['project',
 'gutenberg',
 "'s",
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 ',',
 'by',
 'arthur',
 'conan',
 'doylethis',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'withalmost',
 'no',
 'restrictions',
 'whatsoever',
 '.']

In [126]:
model = train_word2vec(sentences, word2vec_model_path)

In [130]:
w1 = "river"
words = model.wv.most_similar(w1, topn=10)
print(words)

[('illinois', 0.4451196789741516), ('road', 0.4412068724632263), ('mile', 0.4335242807865143), ('canoe', 0.4315584599971771), ('shore', 0.4308687448501587), ('passage', 0.41741570830345154), ('path', 0.4171934127807617), ('rhine', 0.4164977967739105), ('island', 0.4142218828201294), ('strand', 0.41371068358421326)]


In [133]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
import pickle

In [134]:
model = pickle.load(open(word2vec_model_path, 'rb'))

In [140]:
analogy_score, word_list = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))

In [141]:
analogy_score

0.14359892569382274

In [143]:
# word_list

In [146]:
pretrained_model_path = "40/model.bin"

In [147]:
pretrained_model = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)

In [148]:
analogy_score, word_list = pretrained_model.evaluate_word_analogies(datapath('questions-words.txt'))

In [149]:
print(analogy_score)

0.5867802524889665


# Representing phrases – phrase2vec

* Encoding words is useful, but usually, we deal with more complex units, such as phrases and sentences. 


* Phrases are important because they specify more detail than just words. For example, the phrase delicious fried rice is very different than just the word rice.

In [1]:
import sys
sys.path.append('..')

In [2]:
import nltk
import string
import csv
import json
import pandas as pd
import gensim
from langdetect import detect
import pickle
from nltk import FreqDist
from Chapter01.dividing_into_sentences import \
divide_into_sentences_nltk
from Chapter01.tokenization import tokenize_nltk
from nltk.corpus import stopwords

In [3]:
stop_words = stopwords.words('english')

In [13]:
yelp_reviews_file = "/Volumes/MAC-DISK/yelp_dataset/yelp_academic_dataset_review.json"

In [4]:
def get_yelp_review(filename):
    reader = pd.read_json(filename, orient='records',
                         lines=True, chunksize=1000)
    chunk = next(reader)
    text = ''
    for index, row in chunk.iterrows():
        row_text = row['text']
        lang = detect(row_text)
        if lang =='en':
            text = text + row_text.lower()
    return text

In [5]:
def get_phrases(text):
    words = nltk.tokenize.word_tokenize(text)
    phrases = {}
    current_phrase = []
    for word in words:
        if word in stop_words or word in string.punctuation:
            if len(current_phrase) > 1:
                phrases[" ".join(current_phrase)] = '_'.join(current_phrase)
                current_phrase = []
        else:
            current_phrase.append(word)
        
        if len(current_phrase) > 1:
            phrases[" ".join(current_phrase)] = '_'.join(current_phrase)
    
    return phrases

In [6]:
def replace_phrases(phrases_dict, text):
    for phrase in phrases_dict.keys():
        text = text.replace(phrase, phrases_dict[phrase])
    return text

In [7]:
def write_text_to_file(text, filename):
    text_file = open(filename, 'w', encoding='utf-8')
    text_file.write(text)
    text_file.close()

In [8]:
def create_and_save_frequency_list(word_list, filename):
    fdist = FreqDist(word_list)
    pickle.dump(fdist, open(filename, 'wb'))
    return fdist

In [25]:
text = get_yelp_review(yelp_reviews_file)

In [53]:
phrases = get_phrases(text)
text = replace_phrases(phrases, text)
write_text_to_file(text, 'all_text.txt')

KeyboardInterrupt: 

In [9]:
with open('all_text.txt','r') as f:
    text = f.read()

In [10]:
sentences = divide_into_sentences_nltk(text)
all_sentece_words = [tokenize_nltk(sentence.lower()) for sentence in sentences]
flat_word_list = [word.lower() for sentence in all_sentece_words for word in sentence]
fdist = create_and_save_frequency_list(flat_word_list, 'fdist.bin')

In [11]:
print(fdist.most_common()[:100])

[('.', 58177), ('the', 53315), (',', 39252), ('and', 37563), ('i', 29031), ('a', 27953), ('to', 25349), ('was', 18698), ('of', 15052), ('it', 14691), ('is', 13431), ('for', 12294), ('in', 11866), ('!', 11453), ('my', 9945), ('that', 9503), ('with', 8693), ('but', 8563), ('this', 7907), ('they', 7898), ('we', 7667), ('on', 7409), ('you', 6990), ('have', 6594), ('not', 6572), ('had', 6230), ('so', 5661), ('were', 5648), ('are', 5305), ('at', 5288), (')', 4795), ('be', 4618), ('(', 4436), ('as', 4283), ('very', 4156), ('there', 4079), ('me', 4079), ("'s", 3786), ('if', 3646), ('all', 3508), ('out', 3507), ('just', 3369), ('here', 3322), ('place', 3132), ('our', 3050), ('from', 3007), ('their', 2994), ('when', 2959), ('food', 2830), ('up', 2828), ('or', 2792), ('an', 2752), ('about', 2652), ('which', 2505), ('what', 2300), ('some', 2294), ('...', 2215), ('will', 2191), ('-', 2170), ('he', 2157), ('been', 2113), ('no', 2105), ('only', 2070), ('she', 2053), ('your', 2030), ("n't", 2012), ('m

In [12]:
def create_and_save_word2vec_model(words, filename):
    model = gensim.models.Word2Vec(words, min_count=1)
    model.train(words, total_examples=model.corpus_count, epochs=400)
    pickle.dump(model, open(filename, 'wb'))
    return model

In [13]:
model = create_and_save_word2vec_model(all_sentece_words, 'phrases.model')

In [14]:
words = model.wv.most_similar('highly_recommend', topn=10)
print(words)

[('would_highly_recommend', 0.6025195121765137), ('definitely_recommend', 0.5658879280090332), ('would_definitely_recommend', 0.5532544255256653), ('would_recommend', 0.4962897300720215), ('ate_around', 0.46942415833473206), ('love', 0.46003633737564087), ('thank', 0.4555864930152893), ('recommend', 0.44822776317596436), ('really_enjoyed', 0.42181792855262756), ('would_strongly_recommend', 0.41948580741882324)]


In [16]:
words = model.wv.most_similar("dim_sum", topn=10)
print(words)

[('comfort_food_choices', 0.4160696268081665), ('pricy', 0.3976624011993408), ('cuisine', 0.3934507668018341), ('food_court', 0.3922019302845001), ('citywalk', 0.38666999340057373), ('thai', 0.3826940357685089), ('classic', 0.3824933171272278), ('cool_drink', 0.3738446533679962), ('finally_came', 0.37134498357772827), ('overpriced', 0.37012335658073425)]


# Using BERT instead of word embeddings


* A recent development in the embeddings world is BERT, also known as Bidirectional Encoder Representations from Transformers, which, like word embeddings, gives a vector representation, but it takes context into account and can represent a whole sentence. 


* We can use the Hugging Face sentence_transformers package to represent sentences as vectors.

In [26]:
from sentence_transformers import SentenceTransformer
from Chapter01.dividing_into_sentences import read_text_file, divide_into_sentences_nltk

In [27]:
text = read_text_file("../Chapter01/sherlock_holmes.txt")
sentences = divide_into_sentences_nltk(text)

In [28]:
model = SentenceTransformer('bert-base-nli-mean-tokens') 

In [29]:
sentence_embeddings = model.encode(sentences)
sentence_embeddings

array([[-0.4108902 ,  1.1092619 ,  0.65330595, ..., -0.92320925,
         0.47286835,  0.36298928],
       [-0.16485526,  0.6998441 ,  0.70763886, ..., -0.40428248,
        -0.30385858, -0.3291513 ],
       [-0.37814918,  0.34770998, -0.09765318, ...,  0.13831207,
         0.36044088,  0.12381998],
       ...,
       [-0.251487  ,  0.57580566,  1.459657  , ...,  0.56890166,
        -0.6003895 , -0.02739903],
       [-0.64917624,  0.360967  ,  1.1350368 , ..., -0.0405464 ,
         0.07568653,  0.18094465],
       [-0.42418253,  0.4814613 ,  0.93001944, ...,  0.73677164,
        -0.09357806, -0.00368021]], dtype=float32)

In [30]:
sentence_embeddings = model.encode(['the beautiful lake'])
sentence_embeddings

array([[-7.61978924e-02, -5.74670076e-01,  1.08264256e+00,
         7.36554265e-01,  5.51345468e-01, -9.39117551e-01,
        -2.80430049e-01, -5.41625977e-01,  7.50948846e-01,
        -4.40971613e-01,  5.31526685e-01, -5.41883230e-01,
         1.92792654e-01,  3.44117582e-01,  1.50266385e+00,
        -6.26989722e-01, -2.42828563e-01, -3.66734564e-01,
         5.57459652e-01, -2.21802279e-01, -9.69591439e-01,
        -4.38950717e-01, -7.93552220e-01, -5.84923029e-01,
        -1.55690983e-01,  2.12004229e-01,  4.02013630e-01,
        -2.63063580e-01,  6.21910393e-01,  5.97237408e-01,
         9.78125483e-02,  7.20052421e-01, -4.66322839e-01,
         3.86450231e-01, -8.24903369e-01,  1.09985697e+00,
        -3.59134972e-01, -4.31918561e-01,  2.56565101e-02,
         5.73159695e-01,  2.40237087e-01, -7.67571330e-01,
         9.38899398e-01, -3.60024542e-01, -8.77114952e-01,
        -2.47680426e-01, -8.65839005e-01,  1.04203534e+00,
         3.65989566e-01, -6.47715852e-02, -7.04246700e-0

# Getting started with semantic search

In [None]:
! pip install whoosh