In [4]:
# ! pip install sklearn gensim langdetect  torch transformers sentence-transformers whoosh -U 

#### Related links

* https://www.kaggle.com/currie32/project-gutenbergs-top-20-books
* https://www.kaggle.com/PromptCloudHQ/imdb-data
* https://www.yelp.com/dataset
* http://vectors.nlpl.eu/repository/20/40.zip

# Putting documents into a bag of words

* A bag of words is the simplest way of representing text. We treat our text as a collection of documents, where documents are anything from sentences to book chapters to whole books. 


* Since we usually compare different documents to each other or use them in a larger context of other documents, typically, we work with a collection of documents, not just a single document.

In [8]:
import sys
sys.path.append('..')

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from Chapter01.dividing_into_sentences import read_text_file,\
preprocess_text, divide_into_sentences_nltk

In [12]:
#. Read in the text file, preprocess the text, 
# and divide it into sentences:

def get_sentences(filename):
    sherlock_holmes_text = read_text_file(filename)
    sherlock_holmes_text = preprocess_text(sherlock_holmes_text)
    senteces = divide_into_sentences_nltk(sherlock_holmes_text)
    return senteces


In [14]:
# Create a function that will return the vectorizer and final matrix

def create_vectorizer(sentences):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    return vectorizer, X

In [16]:
sentences = get_sentences('../Chapter01/sherlock_holmes_1.txt')
vectorizer, X = create_vectorizer(sentences)

In [23]:
# he resulting matrix is a scipy.sparse.csr.csr_matrix object, and the beginning of its printout looks like this:
print(X)

# first number document number, word number

  (0, 114)	1
  (0, 99)	1
  (0, 47)	1
  (0, 98)	1
  (0, 54)	1
  (0, 10)	1
  (0, 0)	1
  (0, 124)	1
  (1, 39)	1
  (1, 95)	1
  (1, 41)	1
  (1, 44)	1
  (1, 64)	1
  (1, 42)	1
  (1, 116)	1
  (1, 12)	1
  (1, 79)	1
  (1, 70)	1
  (2, 98)	1
  (2, 42)	1
  (2, 48)	1
  (2, 46)	1
  (2, 29)	1
  (2, 25)	1
  (2, 11)	1
  :	:
  (9, 57)	1
  (9, 15)	1
  (9, 67)	1
  (9, 21)	1
  (9, 107)	1
  (9, 103)	1
  (9, 71)	1
  (10, 114)	1
  (10, 124)	2
  (10, 44)	1
  (10, 11)	3
  (10, 109)	1
  (10, 76)	1
  (10, 119)	2
  (10, 108)	1
  (10, 53)	1
  (10, 4)	1
  (10, 77)	1
  (10, 16)	1
  (10, 127)	1
  (10, 110)	1
  (10, 56)	1
  (10, 24)	1
  (10, 89)	1
  (10, 61)	1


In [25]:
# It can also be turned into a numpy.matrixlib.defmatrix.matrix object, where each sentence is a vector. These sentence vectors can be used our machine learning algorithms later:
denseX = X.todense()
print(denseX)

[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]


In [26]:
print(len(sentences))
print(denseX.shape)

11
(11, 128)


In [28]:
print(len(vectorizer.get_feature_names()))
print(vectorizer.get_feature_names())

128
['_the_', 'abhorrent', 'actions', 'adjusted', 'adler', 'admirable', 'admirably', 'admit', 'akin', 'all', 'always', 'and', 'any', 'as', 'balanced', 'be', 'but', 'cold', 'crack', 'delicate', 'distracting', 'disturbing', 'doubt', 'drawing', 'dubious', 'eclipses', 'emotion', 'emotions', 'excellent', 'eyes', 'factor', 'false', 'felt', 'finely', 'for', 'from', 'gibe', 'grit', 'has', 'have', 'he', 'heard', 'her', 'high', 'him', 'himself', 'his', 'holmes', 'in', 'instrument', 'into', 'introduce', 'intrusions', 'irene', 'is', 'it', 'late', 'lenses', 'love', 'lover', 'machine', 'memory', 'men', 'mental', 'mention', 'might', 'mind', 'more', 'most', 'motives', 'name', 'nature', 'never', 'not', 'observer', 'observing', 'of', 'one', 'or', 'other', 'own', 'particularly', 'passions', 'perfect', 'placed', 'position', 'power', 'precise', 'predominates', 'questionable', 'reasoner', 'reasoning', 'results', 'save', 'seen', 'seldom', 'sensitive', 'sex', 'she', 'sherlock', 'sneer', 'softer', 'spoke', 'st

In [31]:
new_sentence = "I had seen little of Holmes lately."
new_sentece_vector = vectorizer.transform([new_sentence])

In [32]:
print(new_sentece_vector)
print(new_sentece_vector.todense())

  (0, 47)	1
  (0, 76)	1
  (0, 94)	1
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [41]:
vectorizer.get_feature_names()[75]

'observing'

In [42]:
vectorizer.vocabulary_

{'to': 114,
 'sherlock': 99,
 'holmes': 47,
 'she': 98,
 'is': 54,
 'always': 10,
 '_the_': 0,
 'woman': 124,
 'have': 39,
 'seldom': 95,
 'heard': 41,
 'him': 44,
 'mention': 64,
 'her': 42,
 'under': 116,
 'any': 12,
 'other': 79,
 'name': 70,
 'in': 48,
 'his': 46,
 'eyes': 29,
 'eclipses': 25,
 'and': 11,
 'predominates': 88,
 'the': 109,
 'whole': 122,
 'of': 76,
 'sex': 97,
 'it': 55,
 'was': 119,
 'not': 73,
 'that': 108,
 'he': 40,
 'felt': 32,
 'emotion': 26,
 'akin': 8,
 'love': 58,
 'for': 34,
 'irene': 53,
 'adler': 4,
 'all': 9,
 'emotions': 27,
 'one': 77,
 'particularly': 81,
 'were': 120,
 'abhorrent': 1,
 'cold': 17,
 'precise': 87,
 'but': 16,
 'admirably': 6,
 'balanced': 14,
 'mind': 66,
 'take': 105,
 'most': 68,
 'perfect': 83,
 'reasoning': 91,
 'observing': 75,
 'machine': 60,
 'world': 125,
 'has': 38,
 'seen': 94,
 'as': 13,
 'lover': 59,
 'would': 126,
 'placed': 84,
 'himself': 45,
 'false': 31,
 'position': 85,
 'never': 72,
 'spoke': 102,
 'softer': 101,
 

In [43]:
## Vectorizer with stopwords removal

vectorizer = CountVectorizer(stop_words='english')

In [44]:
X = vectorizer.fit_transform(sentences)

In [46]:
print(len(vectorizer.get_feature_names()))
print(vectorizer.get_feature_names())

79
['_the_', 'abhorrent', 'actions', 'adjusted', 'adler', 'admirable', 'admirably', 'admit', 'akin', 'balanced', 'cold', 'crack', 'delicate', 'distracting', 'disturbing', 'doubt', 'drawing', 'dubious', 'eclipses', 'emotion', 'emotions', 'excellent', 'eyes', 'factor', 'false', 'felt', 'finely', 'gibe', 'grit', 'heard', 'high', 'holmes', 'instrument', 'introduce', 'intrusions', 'irene', 'late', 'lenses', 'love', 'lover', 'machine', 'memory', 'men', 'mental', 'mention', 'mind', 'motives', 'nature', 'observer', 'observing', 'particularly', 'passions', 'perfect', 'placed', 'position', 'power', 'precise', 'predominates', 'questionable', 'reasoner', 'reasoning', 'results', 'save', 'seen', 'seldom', 'sensitive', 'sex', 'sherlock', 'sneer', 'softer', 'spoke', 'strong', 'temperament', 'things', 'throw', 'trained', 'veil', 'woman', 'world']


In [47]:
new_sentence = "And yet there was but one woman to him, and that woman was the late Irene Adler, of dubious and questionable memory."

In [49]:
new_sentence_vector = vectorizer.transform([new_sentence])
analyse = vectorizer.build_analyzer()
print(analyse(new_sentence))

['woman', 'woman', 'late', 'irene', 'adler', 'dubious', 'questionable', 'memory']


In [51]:
print(new_sentence_vector)

  (0, 4)	1
  (0, 17)	1
  (0, 35)	1
  (0, 36)	1
  (0, 41)	1
  (0, 58)	1
  (0, 77)	2


In [53]:
# if any word is present more than 80%  on the document we will ignore
vectorizer = CountVectorizer(max_df=0.8)

In [55]:
new_sentence_vector = vectorizer.fit_transform(sentences)

In [56]:
len(vectorizer.get_feature_names())

128

# Constructing the N-gram model

* Representing a document as a bag of words is useful, but semantics is about more than just words in isolation. 


* To capture word combinations, an n-gram model is useful. Its vocabulary consists not just of words, but word sequences, or n-grams.

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
from Chapter01.dividing_into_sentences import read_text_file, preprocess_text, divide_into_sentences_nltk
from Chapter03.bag_of_words_1 import get_sentences, get_new_sentences_vector

In [61]:
sentences = get_sentences('../Chapter01/sherlock_holmes_1.txt')

In [64]:
bigram_vectorizer = CountVectorizer(ngram_range = (1, 2))

In [67]:
X = bigram_vectorizer.fit_transform(sentences)
print(X)

  (0, 269)	1
  (0, 229)	1
  (0, 118)	1
  (0, 226)	1
  (0, 136)	1
  (0, 20)	1
  (0, 0)	1
  (0, 299)	1
  (0, 275)	1
  (0, 230)	1
  (0, 119)	1
  (0, 228)	1
  (0, 137)	1
  (0, 21)	1
  (0, 1)	1
  (1, 93)	1
  (1, 221)	1
  (1, 101)	1
  (1, 108)	1
  (1, 156)	1
  (1, 103)	1
  (1, 278)	1
  (1, 31)	1
  (1, 190)	1
  (1, 167)	1
  :	:
  (10, 307)	1
  (10, 261)	1
  (10, 141)	1
  (10, 60)	1
  (10, 210)	1
  (10, 151)	1
  (10, 30)	1
  (10, 308)	1
  (10, 262)	1
  (10, 285)	1
  (10, 45)	1
  (10, 187)	1
  (10, 300)	1
  (10, 271)	1
  (10, 109)	1
  (10, 251)	1
  (10, 301)	1
  (10, 288)	1
  (10, 253)	1
  (10, 142)	1
  (10, 8)	1
  (10, 180)	1
  (10, 61)	1
  (10, 27)	1
  (10, 211)	1


In [68]:
denseX = X.todense()
print(denseX)

[[1 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 1]]


In [69]:
print(bigram_vectorizer.get_feature_names())

['_the_', '_the_ woman', 'abhorrent', 'abhorrent to', 'actions', 'adjusted', 'adjusted temperament', 'adler', 'adler of', 'admirable', 'admirable things', 'admirably', 'admirably balanced', 'admit', 'admit such', 'akin', 'akin to', 'all', 'all emotions', 'all his', 'always', 'always _the_', 'and', 'and actions', 'and finely', 'and observing', 'and predominates', 'and questionable', 'and sneer', 'and that', 'and yet', 'any', 'any emotion', 'any other', 'as', 'as his', 'as lover', 'balanced', 'balanced mind', 'be', 'be more', 'but', 'but admirably', 'but as', 'but for', 'but one', 'cold', 'cold precise', 'crack', 'crack in', 'delicate', 'delicate and', 'distracting', 'distracting factor', 'disturbing', 'disturbing than', 'doubt', 'doubt upon', 'drawing', 'drawing the', 'dubious', 'dubious and', 'eclipses', 'eclipses and', 'emotion', 'emotion akin', 'emotion in', 'emotions', 'emotions and', 'excellent', 'excellent for', 'eyes', 'eyes she', 'factor', 'factor which', 'false', 'false positio

In [70]:
print(len(bigram_vectorizer.get_feature_names()))

309


In [71]:
new_sentence = "I had seen little of Holmes lately."
new_sentece_vector = bigram_vectorizer.transform([new_sentence])

In [72]:
print(new_sentence_vector)
print(new_sentece_vector.todense())

  (0, 114)	1
  (0, 99)	1
  (0, 47)	1
  (0, 98)	1
  (0, 54)	1
  (0, 10)	1
  (0, 0)	1
  (0, 124)	1
  (1, 39)	1
  (1, 95)	1
  (1, 41)	1
  (1, 44)	1
  (1, 64)	1
  (1, 42)	1
  (1, 116)	1
  (1, 12)	1
  (1, 79)	1
  (1, 70)	1
  (2, 98)	1
  (2, 42)	1
  (2, 48)	1
  (2, 46)	1
  (2, 29)	1
  (2, 25)	1
  (2, 11)	1
  :	:
  (9, 57)	1
  (9, 15)	1
  (9, 67)	1
  (9, 21)	1
  (9, 107)	1
  (9, 103)	1
  (9, 71)	1
  (10, 114)	1
  (10, 124)	2
  (10, 44)	1
  (10, 11)	3
  (10, 109)	1
  (10, 76)	1
  (10, 119)	2
  (10, 108)	1
  (10, 53)	1
  (10, 4)	1
  (10, 77)	1
  (10, 16)	1
  (10, 127)	1
  (10, 110)	1
  (10, 56)	1
  (10, 24)	1
  (10, 89)	1
  (10, 61)	1
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [73]:
new_sentence1 = " And yet there was but one woman to him, and that woman was the late Irene Adler, of dubious and questionable memory."
new_sentence_vector1 = vectorizer.transform([new_sentence])

In [74]:
print(new_sentence_vector1)
print(new_sentence_vector1.todense())

  (0, 47)	1
  (0, 76)	1
  (0, 94)	1
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


# Representing texts with TF-IDF

* TF-IDF stands for term frequency-inverse document frequency and gives more weight to words that are unique to a document than to words that are frequent,but repeated throughout most documents. 


* This allows us to give more weight to words uniquely characteristic to particular documents. 


* https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting.


* The TfidfVectorizer class allows for all the functionality of CountVectorizer, except that it uses the TF-IDF algorithm to count the words instead of direct counts. The other features of the class should be familiar. 

In [76]:
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer

In [77]:
stemmer = SnowballStemmer('english')

In [78]:
sentences = get_sentences('../Chapter01/sherlock_holmes_1.txt')

In [80]:
from nltk.corpus import stopwords

In [86]:
stop_words = stopwords.words('english')

In [89]:
def tokenize_and_stem(sentences):
    tokens = nltk.word_tokenize(sentences)
    filtered_token = [t for t in tokens if t not in string.punctuation]
    stems = [stemmer.stem(t) for t in filtered_token]
    return stems

In [90]:
all_stopwords = stop_words + [tokenize_and_stem(stop_word)[0] for stop_word in stop_words]

In [95]:
tfidf_vectorizer = TfidfVectorizer(max_df = 0.9, max_features = 200000,
                                  min_df = 0.05, stop_words=all_stopwords,
                                  use_idf=True, tokenizer=tokenize_and_stem,
                                  ngram_range=(1, 3))

In [96]:
tfidf_vectorizer = tfidf_vectorizer.fit(sentences)



In [97]:
tf_idf_matrix = tfidf_vectorizer.transform(sentences)
print(tf_idf_matrix)

  (0, 221)	0.24956605786128022
  (0, 187)	0.2919708551400885
  (0, 186)	0.2919708551400885
  (0, 185)	0.2919708551400885
  (0, 88)	0.2919708551400885
  (0, 87)	0.2919708551400885
  (0, 86)	0.2919708551400885
  (0, 25)	0.2919708551400885
  (0, 24)	0.2919708551400885
  (0, 23)	0.2919708551400885
  (0, 1)	0.2919708551400885
  (0, 0)	0.2919708551400885
  (1, 180)	0.3333333333333333
  (1, 179)	0.3333333333333333
  (1, 178)	0.3333333333333333
  (1, 127)	0.3333333333333333
  (1, 123)	0.3333333333333333
  (1, 122)	0.3333333333333333
  (1, 82)	0.3333333333333333
  (1, 81)	0.3333333333333333
  (1, 80)	0.3333333333333333
  (2, 220)	0.28867513459481287
  (2, 219)	0.28867513459481287
  (2, 184)	0.28867513459481287
  (2, 163)	0.28867513459481287
  :	:
  (10, 230)	0.19360286426407214
  (10, 229)	0.19360286426407214
  (10, 225)	0.19360286426407214
  (10, 224)	0.19360286426407214
  (10, 223)	0.19360286426407214
  (10, 222)	0.19360286426407214
  (10, 221)	0.33096936063604365
  (10, 165)	0.19360286426407

In [99]:
dense_matrix = tf_idf_matrix.todense()
dense_matrix

matrix([[0.29197086, 0.29197086, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [100]:
print(tfidf_vectorizer.get_feature_names())

['_the_', '_the_ woman', 'abhorr', 'abhorr cold', 'abhorr cold precis', 'action', 'adjust', 'adjust tempera', 'adjust tempera introduc', 'adler', 'adler dubious', 'adler dubious question', 'admir', 'admir balanc', 'admir balanc mind', 'admir thing', 'admir thing observer—excel', 'admit', 'admit intrus', 'admit intrus delic', 'akin', 'akin love', 'akin love iren', 'alway', 'alway _the_', 'alway _the_ woman', 'balanc', 'balanc mind', 'cold', 'cold precis', 'cold precis admir', 'crack', 'crack one', 'crack one high-pow', 'delic', 'delic fine', 'delic fine adjust', 'distract', 'distract factor', 'distract factor throw', 'disturb', 'disturb strong', 'disturb strong emot', 'doubt', 'doubt upon', 'doubt upon mental', 'draw', 'draw veil', 'draw veil men', 'dubious', 'dubious question', 'dubious question memori', 'eclips', 'eclips predomin', 'eclips predomin whole', 'emot', 'emot akin', 'emot akin love', 'emot natur', 'emot one', 'emot one particular', 'eye', 'eye eclips', 'eye eclips predomin'

In [101]:
analyze = tfidf_vectorizer.build_analyzer()
print(analyze("To Sherlock Holmes she is always _the_ woman."))

['sherlock', 'holm', 'alway', '_the_', 'woman', 'sherlock holm', 'holm alway', 'alway _the_', '_the_ woman', 'sherlock holm alway', 'holm alway _the_', 'alway _the_ woman']


### Char ngram

In [102]:
sentences = get_sentences("../Chapter01/sherlock_holmes_1.txt")

In [104]:
tfidf_char_vectorizer = TfidfVectorizer(analyzer = 'char_wb',
                                       max_df = 0.9, max_features=200000,
                                       min_df=0.05,use_idf=True,
                                       ngram_range=(1, 3))

tfidf_char_vectorizer = tfidf_char_vectorizer.fit(sentences)

In [106]:
tfidf_matrix = tfidf_char_vectorizer.transform(sentences)
print(tfidf_matrix)

  (0, 763)	0.12662434631923655
  (0, 762)	0.12662434631923655
  (0, 753)	0.05840470946313
  (0, 745)	0.10823388151187574
  (0, 744)	0.0850646359499111
  (0, 733)	0.12662434631923655
  (0, 731)	0.07679517427049085
  (0, 684)	0.07679517427049085
  (0, 683)	0.07679517427049085
  (0, 675)	0.05840470946313
  (0, 639)	0.21646776302375148
  (0, 638)	0.21646776302375148
  (0, 623)	0.16087778612557863
  (0, 602)	0.12662434631923655
  (0, 600)	0.09518563907785169
  (0, 521)	0.10823388151187574
  (0, 519)	0.07679517427049085
  (0, 518)	0.12662434631923655
  (0, 515)	0.09518563907785169
  (0, 507)	0.12662434631923655
  (0, 506)	0.12662434631923655
  (0, 503)	0.07679517427049085
  (0, 460)	0.10823388151187574
  (0, 459)	0.10823388151187574
  (0, 442)	0.12662434631923655
  :	:
  (10, 102)	0.10440492924682453
  (10, 101)	0.08607445743804548
  (10, 100)	0.08607445743804548
  (10, 99)	0.11564746590034126
  (10, 95)	0.11564746590034126
  (10, 88)	0.05220246462341226
  (10, 87)	0.10935865461197974
  (10,

In [108]:
dense_matrix = tfidf_matrix.todense()
print(dense_matrix)

[[0.12662435 0.12662435 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.07119069 ... 0.         0.         0.        ]
 [0.         0.         0.17252729 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [109]:
print(tfidf_char_vectorizer.get_feature_names())

[' _', ' _t', ' a ', ' ab', ' ac', ' ad', ' ak', ' al', ' an', ' as', ' b', ' ba', ' be', ' bu', ' c', ' co', ' cr', ' d', ' de', ' di', ' do', ' dr', ' du', ' e', ' ec', ' em', ' ey', ' f', ' fa', ' fe', ' fi', ' fo', ' fr', ' g', ' gi', ' gr', ' ha', ' he', ' hi', ' ho', ' i', ' i ', ' in', ' ir', ' is', ' it', ' l', ' la', ' le', ' lo', ' m', ' ma', ' me', ' mi', ' mo', ' n', ' na', ' ne', ' no', ' o', ' ob', ' of', ' on', ' or', ' ot', ' ow', ' p', ' pa', ' pe', ' pl', ' po', ' pr', ' q', ' qu', ' r', ' re', ' s', ' sa', ' se', ' sh', ' sn', ' so', ' sp', ' st', ' su', ' ta', ' te', ' th', ' to', ' tr', ' u', ' un', ' up', ' v', ' ve', ' wa', ' we', ' wh', ' wi', ' wo', ' y', ' ye', ',', ', ', '-', '-p', '-po', '_', '_ ', '_t', '_th', 'a ', 'ab', 'abh', 'abl', 'ac', 'ace', 'ach', 'ack', 'act', 'ad', 'adj', 'adl', 'adm', 'ai', 'ain', 'ak', 'ake', 'aki', 'al', 'al ', 'ala', 'all', 'als', 'alw', 'am', 'ame', 'an ', 'an.', 'anc', 'and', 'any', 'ar', 'ard', 'arl', 'art', 'as', 'as ', 'a

In [110]:
analyze = tfidf_char_vectorizer.build_analyzer()
print(analyze("To Sherlock Holmes she is always _the_ woman."))

[' ', 't', 'o', ' ', ' t', 'to', 'o ', ' to', 'to ', ' ', 's', 'h', 'e', 'r', 'l', 'o', 'c', 'k', ' ', ' s', 'sh', 'he', 'er', 'rl', 'lo', 'oc', 'ck', 'k ', ' sh', 'she', 'her', 'erl', 'rlo', 'loc', 'ock', 'ck ', ' ', 'h', 'o', 'l', 'm', 'e', 's', ' ', ' h', 'ho', 'ol', 'lm', 'me', 'es', 's ', ' ho', 'hol', 'olm', 'lme', 'mes', 'es ', ' ', 's', 'h', 'e', ' ', ' s', 'sh', 'he', 'e ', ' sh', 'she', 'he ', ' ', 'i', 's', ' ', ' i', 'is', 's ', ' is', 'is ', ' ', 'a', 'l', 'w', 'a', 'y', 's', ' ', ' a', 'al', 'lw', 'wa', 'ay', 'ys', 's ', ' al', 'alw', 'lwa', 'way', 'ays', 'ys ', ' ', '_', 't', 'h', 'e', '_', ' ', ' _', '_t', 'th', 'he', 'e_', '_ ', ' _t', '_th', 'the', 'he_', 'e_ ', ' ', 'w', 'o', 'm', 'a', 'n', '.', ' ', ' w', 'wo', 'om', 'ma', 'an', 'n.', '. ', ' wo', 'wom', 'oma', 'man', 'an.', 'n. ']
