In [44]:
import time
import random
from math import *
import operator
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 10000)

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from matplotlib import style
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

# load make_blobs to simulate data
from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification


# import the ML algorithm
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB


# For text processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# pre-processing
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
# QuantileTransformer is exported by the public sklearn.preprocessing package.
# The private sklearn.preprocessing.data module was deprecated in scikit-learn 0.22
# and removed in 0.24, so importing from it fails on current releases.
from sklearn.preprocessing import QuantileTransformer


# import libraries for model validation

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


# import libraries for metrics and reporting
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import adjusted_rand_score

In [45]:
# Toy corpus: seven short documents sharing a small, overlapping vocabulary.
texts = [
    "Jose bought dark blue mouses. !!",
    "Jose bought dark blue and white mouse.",
    "The cat ate a mouse at the store.",
    "Jessy went to the store. Jessy ate a bug. Jessy saw a mouse.",
    "It meowed once at the bug, it is still meowing at the bug and the mouse",
    "The cat is at the mouse store. The cat is white. The cat is meowing at the mouse.",
    "Jessy has a cat",
]

In [46]:
# TfidfVectorizer defaults used here:
# - n-grams: unigrams only (ngram_range=(1, 1))
# - stop-word filtering: off (stop_words=None)
# - lowercasing: on (lowercase=True)

# instantiate the TF-IDF vectorizer with its default settings
vect_tfidf = TfidfVectorizer()

In [47]:
# fit the vectorizer: learn the vocabulary and the IDF weights from the corpus
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [48]:
# Inspect the fitted vocabulary: its size, the terms in column order,
# and the term -> column-index mapping.
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases;
# .tolist() keeps feature_names a plain list of str, exactly as before.
if hasattr(vect_tfidf, "get_feature_names_out"):
    feature_names = vect_tfidf.get_feature_names_out().tolist()
else:
    feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 25
['and', 'at', 'ate', 'blue', 'bought', 'bug', 'cat', 'dark', 'has', 'is', 'it', 'jessy', 'jose', 'meowed', 'meowing', 'mouse', 'mouses', 'once', 'saw', 'still', 'store', 'the', 'to', 'went', 'white']
Vocabulary content:
 {'jose': 12, 'bought': 4, 'dark': 7, 'blue': 3, 'mouses': 16, 'and': 0, 'white': 24, 'mouse': 15, 'the': 21, 'cat': 6, 'ate': 2, 'at': 1, 'store': 20, 'jessy': 11, 'went': 23, 'to': 22, 'bug': 5, 'saw': 18, 'it': 10, 'meowed': 13, 'once': 17, 'is': 9, 'still': 19, 'meowing': 14, 'has': 8}


In [49]:
# build the document-term matrix (DTM): TF-IDF weights of each vocabulary term per document
X_train_tfidf_dtm = vect_tfidf.transform(texts)

In [50]:
# transform() returns a SciPy sparse matrix rather than a dense array
print(type(X_train_tfidf_dtm))


<class 'scipy.sparse.csr.csr_matrix'>


In [51]:
# convert to a dense NumPy array to see the actual TF-IDF weights (one row per document)
print(X_train_tfidf_dtm.toarray())

[[0.         0.         0.         0.42830228 0.42830228 0.
  0.         0.42830228 0.         0.         0.         0.
  0.42830228 0.         0.         0.         0.51597346 0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.39458881 0.         0.         0.39458881 0.39458881 0.
  0.         0.39458881 0.         0.         0.         0.
  0.39458881 0.         0.         0.25651123 0.         0.
  0.         0.         0.         0.         0.         0.
  0.39458881]
 [0.         0.35439102 0.41460548 0.         0.         0.
  0.35439102 0.         0.         0.         0.         0.
  0.         0.         0.         0.26952351 0.         0.
  0.         0.         0.35439102 0.61537012 0.         0.
  0.        ]
 [0.         0.         0.2421242  0.         0.         0.2421242
  0.         0.         0.         0.         0.         0.7263726
  0.         0.         0.         0.15739822 0.         0.
  0.29168572 0.         0.20695974 0.1796840

In [52]:
# show the DTM as a DataFrame: one row per document, one column per vocabulary term
pd.DataFrame(X_train_tfidf_dtm.toarray(), columns=feature_names)

Unnamed: 0,and,at,ate,blue,bought,bug,cat,dark,has,is,it,jessy,jose,meowed,meowing,mouse,mouses,once,saw,still,store,the,to,went,white
0,0.0,0.0,0.0,0.428302,0.428302,0.0,0.0,0.428302,0.0,0.0,0.0,0.0,0.428302,0.0,0.0,0.0,0.515973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.394589,0.0,0.0,0.394589,0.394589,0.0,0.0,0.394589,0.0,0.0,0.0,0.0,0.394589,0.0,0.0,0.256511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.394589
2,0.0,0.354391,0.414605,0.0,0.0,0.0,0.354391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.269524,0.0,0.0,0.0,0.0,0.354391,0.61537,0.0,0.0,0.0
3,0.0,0.0,0.242124,0.0,0.0,0.242124,0.0,0.0,0.0,0.0,0.0,0.726373,0.0,0.0,0.0,0.157398,0.0,0.0,0.291686,0.0,0.20696,0.179684,0.291686,0.291686,0.0
4,0.198182,0.338799,0.0,0.0,0.0,0.396364,0.0,0.0,0.0,0.198182,0.477498,0.0,0.0,0.238749,0.198182,0.128833,0.0,0.238749,0.0,0.238749,0.0,0.441222,0.0,0.0,0.0
5,0.0,0.28224,0.0,0.0,0.0,0.0,0.42336,0.0,0.0,0.495293,0.0,0.0,0.0,0.0,0.165098,0.214651,0.0,0.0,0.0,0.0,0.14112,0.612607,0.0,0.0,0.165098
6,0.0,0.0,0.0,0.0,0.0,0.0,0.479185,0.0,0.675356,0.0,0.0,0.560603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# N-grams (sequences of consecutive words), up to N=2
# instantiate the TF-IDF vectorizer so that it extracts unigrams and bigrams
vect_tfidf = TfidfVectorizer(ngram_range=(1, 2))

In [54]:
# fit the vectorizer: learn the unigram + bigram vocabulary and the IDF weights
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [55]:
# Inspect the fitted vocabulary: its size, the terms in column order,
# and the term -> column-index mapping.
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases;
# .tolist() keeps feature_names a plain list of str, exactly as before.
if hasattr(vect_tfidf, "get_feature_names_out"):
    feature_names = vect_tfidf.get_feature_names_out().tolist()
else:
    feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 68
['and', 'and the', 'and white', 'at', 'at the', 'ate', 'ate bug', 'ate mouse', 'blue', 'blue and', 'blue mouses', 'bought', 'bought dark', 'bug', 'bug and', 'bug it', 'bug jessy', 'cat', 'cat ate', 'cat is', 'dark', 'dark blue', 'has', 'has cat', 'is', 'is at', 'is meowing', 'is still', 'is white', 'it', 'it is', 'it meowed', 'jessy', 'jessy ate', 'jessy has', 'jessy saw', 'jessy went', 'jose', 'jose bought', 'meowed', 'meowed once', 'meowing', 'meowing at', 'mouse', 'mouse at', 'mouse store', 'mouses', 'once', 'once at', 'saw', 'saw mouse', 'still', 'still meowing', 'store', 'store jessy', 'store the', 'the', 'the bug', 'the cat', 'the mouse', 'the store', 'to', 'to the', 'went', 'went to', 'white', 'white mouse', 'white the']
Vocabulary content:
 {'jose': 37, 'bought': 11, 'dark': 20, 'blue': 8, 'mouses': 46, 'jose bought': 38, 'bought dark': 12, 'dark blue': 21, 'blue mouses': 10, 'and': 0, 'white': 65, 'mouse': 43, 'blue and': 9, 'and white': 2, 'white mouse': 6

In [56]:
# build the document-term matrix (DTM) over the unigram + bigram vocabulary
X_train_tfidf_dtm = vect_tfidf.transform(texts)

In [57]:
# show the DTM as a DataFrame: one row per document, one column per unigram/bigram
pd.DataFrame(X_train_tfidf_dtm.toarray(), columns=feature_names)

Unnamed: 0,and,and the,and white,at,at the,ate,ate bug,ate mouse,blue,blue and,blue mouses,bought,bought dark,bug,bug and,bug it,bug jessy,cat,cat ate,cat is,dark,dark blue,has,has cat,is,is at,is meowing,is still,is white,it,it is,it meowed,jessy,jessy ate,jessy has,jessy saw,jessy went,jose,jose bought,meowed,meowed once,meowing,meowing at,mouse,mouse at,mouse store,mouses,once,once at,saw,saw mouse,still,still meowing,store,store jessy,store the,the,the bug,the cat,the mouse,the store,to,to the,went,went to,white,white mouse,white the
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.317779,0.0,0.382827,0.317779,0.317779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.317779,0.317779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.317779,0.317779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.269421,0.0,0.32457,0.0,0.0,0.0,0.0,0.0,0.269421,0.32457,0.0,0.269421,0.269421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.269421,0.269421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.269421,0.269421,0.0,0.0,0.0,0.0,0.175143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.269421,0.32457,0.0
2,0.0,0.0,0.0,0.237969,0.237969,0.278402,0.0,0.33539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237969,0.33539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180982,0.33539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237969,0.0,0.0,0.413213,0.0,0.278402,0.0,0.278402,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.17926,0.215954,0.0,0.0,0.0,0.0,0.0,0.0,0.17926,0.0,0.0,0.215954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.537781,0.215954,0.0,0.215954,0.215954,0.0,0.0,0.0,0.0,0.0,0.0,0.116532,0.0,0.0,0.0,0.0,0.0,0.215954,0.215954,0.0,0.0,0.153226,0.215954,0.0,0.133032,0.0,0.0,0.0,0.17926,0.215954,0.215954,0.215954,0.215954,0.0,0.0,0.0
4,0.142494,0.171662,0.0,0.243598,0.243598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284988,0.171662,0.171662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142494,0.0,0.0,0.171662,0.0,0.343324,0.171662,0.171662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171662,0.171662,0.142494,0.142494,0.092631,0.0,0.0,0.0,0.171662,0.171662,0.0,0.0,0.171662,0.171662,0.0,0.0,0.0,0.317241,0.343324,0.0,0.142494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.196903,0.196903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.295354,0.0,0.416267,0.0,0.0,0.0,0.0,0.345537,0.138756,0.138756,0.0,0.138756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115179,0.115179,0.149749,0.0,0.138756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098451,0.0,0.138756,0.427381,0.0,0.345537,0.230358,0.0,0.0,0.0,0.0,0.0,0.115179,0.0,0.138756
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.346526,0.0,0.0,0.0,0.0,0.488388,0.488388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.405404,0.0,0.488388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
# N-grams up to N=3
# instantiate the TF-IDF vectorizer so that it extracts unigrams, bigrams and trigrams
vect_tfidf = TfidfVectorizer(ngram_range=(1, 3))

In [59]:
# fit the vectorizer: learn the 1- to 3-gram vocabulary and the IDF weights
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [60]:
# Inspect the fitted vocabulary: its size, the terms in column order,
# and the term -> column-index mapping.
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases;
# .tolist() keeps feature_names a plain list of str, exactly as before.
if hasattr(vect_tfidf, "get_feature_names_out"):
    feature_names = vect_tfidf.get_feature_names_out().tolist()
else:
    feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 114
['and', 'and the', 'and the mouse', 'and white', 'and white mouse', 'at', 'at the', 'at the bug', 'at the mouse', 'at the store', 'ate', 'ate bug', 'ate bug jessy', 'ate mouse', 'ate mouse at', 'blue', 'blue and', 'blue and white', 'blue mouses', 'bought', 'bought dark', 'bought dark blue', 'bug', 'bug and', 'bug and the', 'bug it', 'bug it is', 'bug jessy', 'bug jessy saw', 'cat', 'cat ate', 'cat ate mouse', 'cat is', 'cat is at', 'cat is meowing', 'cat is white', 'dark', 'dark blue', 'dark blue and', 'dark blue mouses', 'has', 'has cat', 'is', 'is at', 'is at the', 'is meowing', 'is meowing at', 'is still', 'is still meowing', 'is white', 'is white the', 'it', 'it is', 'it is still', 'it meowed', 'it meowed once', 'jessy', 'jessy ate', 'jessy ate bug', 'jessy has', 'jessy has cat', 'jessy saw', 'jessy saw mouse', 'jessy went', 'jessy went to', 'jose', 'jose bought', 'jose bought dark', 'meowed', 'meowed once', 'meowed once at', 'meowing', 'meowing at', 'meowing a

In [61]:
# build the document-term matrix (DTM) over the 1- to 3-gram vocabulary
X_train_tfidf_dtm = vect_tfidf.transform(texts)

In [62]:
# show the DTM as a DataFrame: one row per document, one column per n-gram
pd.DataFrame(X_train_tfidf_dtm.toarray(), columns=feature_names)

Unnamed: 0,and,and the,and the mouse,and white,and white mouse,at,at the,at the bug,at the mouse,at the store,ate,ate bug,ate bug jessy,ate mouse,ate mouse at,blue,blue and,blue and white,blue mouses,bought,bought dark,bought dark blue,bug,bug and,bug and the,bug it,bug it is,bug jessy,bug jessy saw,cat,cat ate,cat ate mouse,cat is,cat is at,cat is meowing,cat is white,dark,dark blue,dark blue and,dark blue mouses,has,has cat,is,is at,is at the,is meowing,is meowing at,is still,is still meowing,is white,is white the,it,it is,it is still,it meowed,it meowed once,jessy,jessy ate,jessy ate bug,jessy has,jessy has cat,jessy saw,jessy saw mouse,jessy went,jessy went to,jose,jose bought,jose bought dark,meowed,meowed once,meowed once at,meowing,meowing at,meowing at the,mouse,mouse at,mouse at the,mouse store,mouse store the,mouses,once,once at,once at the,saw,saw mouse,still,still meowing,still meowing at,store,store jessy,store jessy ate,store the,store the cat,the,the bug,the bug and,the bug it,the cat,the cat ate,the cat is,the mouse,the mouse store,the store,the store jessy,to,to the,to the store,went,went to,went to the,white,white mouse,white the,white the cat
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.273651,0.0,0.0,0.329665,0.273651,0.273651,0.273651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.273651,0.273651,0.0,0.329665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.273651,0.273651,0.273651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.222882,0.0,0.0,0.268504,0.268504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222882,0.268504,0.268504,0.0,0.222882,0.222882,0.222882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222882,0.222882,0.268504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222882,0.222882,0.222882,0.0,0.0,0.0,0.0,0.0,0.0,0.144889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222882,0.268504,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.190379,0.190379,0.0,0.0,0.268318,0.222727,0.0,0.0,0.268318,0.268318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190379,0.268318,0.268318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144789,0.268318,0.268318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190379,0.0,0.0,0.0,0.0,0.330578,0.0,0.0,0.0,0.222727,0.268318,0.0,0.0,0.0,0.222727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150447,0.181242,0.181242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150447,0.0,0.0,0.0,0.0,0.181242,0.181242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.45134,0.181242,0.181242,0.0,0.0,0.181242,0.181242,0.181242,0.181242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.097801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181242,0.181242,0.0,0.0,0.0,0.128597,0.181242,0.181242,0.0,0.0,0.111649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150447,0.181242,0.181242,0.181242,0.181242,0.181242,0.181242,0.181242,0.0,0.0,0.0,0.0
4,0.117835,0.141956,0.141956,0.0,0.0,0.201443,0.201443,0.283911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235671,0.141956,0.141956,0.141956,0.141956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117835,0.0,0.0,0.0,0.0,0.141956,0.141956,0.0,0.0,0.283911,0.141956,0.141956,0.141956,0.141956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141956,0.141956,0.141956,0.117835,0.117835,0.117835,0.076601,0.0,0.0,0.0,0.0,0.0,0.141956,0.141956,0.141956,0.0,0.0,0.141956,0.141956,0.141956,0.0,0.0,0.0,0.0,0.0,0.262342,0.283911,0.141956,0.141956,0.0,0.0,0.0,0.117835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.163176,0.163176,0.0,0.229978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244765,0.0,0.0,0.344967,0.114989,0.114989,0.114989,0.0,0.0,0.0,0.0,0.0,0.0,0.286353,0.114989,0.114989,0.114989,0.114989,0.0,0.0,0.114989,0.114989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095451,0.095451,0.095451,0.1241,0.0,0.0,0.114989,0.114989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081588,0.0,0.0,0.114989,0.114989,0.354178,0.0,0.0,0.0,0.286353,0.0,0.344967,0.190902,0.114989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095451,0.0,0.114989,0.114989
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.438847,0.438847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.36428,0.0,0.0,0.438847,0.438847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Min_df

# Min_df ignores terms that have a document frequency (the share of documents that contain the term)
# strictly lower than the given threshold. 
# For example, Min_df=0.66 requires that a term appear in at least 66% of the documents for it
# to be considered part of the vocabulary.

In [64]:
# Sometimes min_df is used to limit the vocabulary size, so it learns only those terms that appear 
# in at least 10%, 20%, etc. of the documents.

In [65]:
# TF-IDF vectorizer with a lower document-frequency bound only.
# A float min_df is a proportion of documents: with the 7 texts above, 0.2 keeps
# only terms that occur in at least 2 documents. max_df=1.0 imposes no upper bound.
vect_tfidf = TfidfVectorizer(
    min_df=0.2,
    max_df=1.0,
    ngram_range=(1, 1),
    max_features=None,
)

In [66]:
# fit the vectorizer: learn the min_df-filtered vocabulary and the IDF weights
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0.2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [67]:
# Inspect the fitted vocabulary: its size, the terms in column order,
# and the term -> column-index mapping.
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases;
# .tolist() keeps feature_names a plain list of str, exactly as before.
if hasattr(vect_tfidf, "get_feature_names_out"):
    feature_names = vect_tfidf.get_feature_names_out().tolist()
else:
    feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 16
['and', 'at', 'ate', 'blue', 'bought', 'bug', 'cat', 'dark', 'is', 'jessy', 'jose', 'meowing', 'mouse', 'store', 'the', 'white']
Vocabulary content:
 {'jose': 10, 'bought': 4, 'dark': 7, 'blue': 3, 'and': 0, 'white': 15, 'mouse': 12, 'the': 14, 'cat': 6, 'ate': 2, 'at': 1, 'store': 13, 'jessy': 9, 'bug': 5, 'is': 8, 'meowing': 11}


In [68]:
# Max_df

# When building the vocabulary, it ignores terms that have a document frequency strictly higher 
# than the given threshold. 

# This could be used to exclude terms that are too frequent and are 
# unlikely to help predict the label. 

# For example, by analyzing reviews on the movie Lion King, 
# the term 'Lion' might appear in 90% of the reviews (documents), in which case, we could 
# consider establishing Max_df=0.89

In [69]:
# TF-IDF vectorizer bounded on both sides of the document frequency:
# min_df=0.2 drops terms that are too rare, max_df=0.5 drops terms that occur
# in more than half of the documents.
vect_tfidfcv = TfidfVectorizer(
    min_df=0.2,
    max_df=0.5,
    ngram_range=(1, 1),
    max_features=None,
)

In [70]:
# fit the vectorizer: learn the min_df/max_df-filtered vocabulary and the IDF weights
vect_tfidfcv.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=0.2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [71]:
# Inspect the fitted vocabulary: its size, the terms in column order,
# and the term -> column-index mapping.
print("Vocabulary size: {}".format(len(vect_tfidfcv.vocabulary_)))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases;
# .tolist() keeps feature_names a plain list of str, exactly as before.
if hasattr(vect_tfidfcv, "get_feature_names_out"):
    feature_names = vect_tfidfcv.get_feature_names_out().tolist()
else:
    feature_names = vect_tfidfcv.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidfcv.vocabulary_))

Vocabulary size: 14
['and', 'at', 'ate', 'blue', 'bought', 'bug', 'cat', 'dark', 'is', 'jessy', 'jose', 'meowing', 'store', 'white']
Vocabulary content:
 {'jose': 10, 'bought': 4, 'dark': 7, 'blue': 3, 'and': 0, 'white': 13, 'cat': 6, 'ate': 2, 'at': 1, 'store': 12, 'jessy': 9, 'bug': 5, 'is': 8, 'meowing': 11}


In [72]:
# Max_features

# Limit the number of features (vocabulary terms) that the vectorizer will learn

In [73]:
# TF-IDF vectorizer with the same document-frequency bounds as before, plus a cap
# on vocabulary size: at most 6 terms are kept.
vect_tfidf = TfidfVectorizer(
    max_features=6,
    min_df=0.2,
    max_df=0.5,
    ngram_range=(1, 1),
)

In [74]:
# fit the vectorizer: learn the size-capped vocabulary and the IDF weights
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=6,
                min_df=0.2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [75]:
# Inspect the fitted vocabulary: its size, the terms in column order,
# and the term -> column-index mapping.
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases;
# .tolist() keeps feature_names a plain list of str, exactly as before.
if hasattr(vect_tfidf, "get_feature_names_out"):
    feature_names = vect_tfidf.get_feature_names_out().tolist()
else:
    feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 6
['at', 'bug', 'cat', 'is', 'jessy', 'store']
Vocabulary content:
 {'cat': 2, 'at': 0, 'store': 5, 'jessy': 4, 'bug': 1, 'is': 3}


In [76]:
# stopwords

# TF-IDF vectorizer that drops scikit-learn's built-in English stop words
vect_tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    max_features=None,
)

In [77]:
# fit the vectorizer: learn the stop-word-filtered vocabulary and the IDF weights
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [78]:
# Inspect the fitted vocabulary: its size, the terms in column order,
# and the term -> column-index mapping.
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases;
# .tolist() keeps feature_names a plain list of str, exactly as before.
if hasattr(vect_tfidf, "get_feature_names_out"):
    feature_names = vect_tfidf.get_feature_names_out().tolist()
else:
    feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 16
['ate', 'blue', 'bought', 'bug', 'cat', 'dark', 'jessy', 'jose', 'meowed', 'meowing', 'mouse', 'mouses', 'saw', 'store', 'went', 'white']
Vocabulary content:
 {'jose': 7, 'bought': 2, 'dark': 5, 'blue': 1, 'mouses': 11, 'white': 15, 'mouse': 10, 'cat': 4, 'ate': 0, 'store': 13, 'jessy': 6, 'went': 14, 'bug': 3, 'saw': 12, 'meowed': 8, 'meowing': 9}


In [79]:
# notice the lack of stemming: 'mouse' and 'mouses', 'meowed' and 'meowing' are separate features

# Out of the box, TfidfVectorizer (like CountVectorizer) can
# - lowercase letters, 
# - disregard punctuation and 
# - remove stopwords

# but it can't LEMMATIZE or STEM on its own

In [80]:
# build one shared Porter stemmer and print the stem of a few inflected forms
porter_stemmer = PorterStemmer()
print(
    *(porter_stemmer.stem(sample) for sample in ("mouse", "mouses", "meowed", "meowing")),
    sep="\n"
)

mous
mous
meow
meow


In [81]:
# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input, stemmer=None):
    """Split text into lowercase, stemmed tokens.

    Every character other than an ASCII letter, digit or hyphen is replaced by
    a space; the text is then lowercased and split on whitespace, and each
    resulting token is reduced to its stem.

    Parameters
    ----------
    str_input : str
        Raw document text.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to every token. Defaults to the notebook-level
        ``porter_stemmer``, so the function can still be passed as a
        one-argument ``tokenizer=`` callback.

    Returns
    -------
    list of str
        Stemmed tokens in order of appearance; an empty list when the input
        contains no letters, digits or hyphens.
    """
    if stemmer is None:
        # Looked up at call time, so the global only has to exist when tokenizing.
        stemmer = porter_stemmer
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower()
    return [stemmer.stem(token) for token in cleaned.split()]

In [82]:
# TF-IDF vectorizer that tokenizes with the Porter-stemming tokenizer.
# NOTE: the English stop-word list is matched against the already-stemmed tokens,
# so stems of stop words (e.g. 'ha', 'onc') can survive in the vocabulary;
# scikit-learn warns about this inconsistency when the vectorizer is fitted.
vect_tfidf = TfidfVectorizer(
    tokenizer=stemming_tokenizer,
    stop_words='english',
    ngram_range=(1, 1),
    max_features=None,
)

In [83]:
# fit the vectorizer: learn the stemmed vocabulary and the IDF weights
# (scikit-learn emits a stop_words consistency warning here)
vect_tfidf.fit(texts)

  'stop_words.' % sorted(inconsistent))


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function stemming_tokenizer at 0x000001E0194BE9D8>,
                use_idf=True, vocabulary=None)

In [84]:
# Inspect the fitted vocabulary: its size, the terms in column order,
# and the term -> column-index mapping.
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases;
# .tolist() keeps feature_names a plain list of str, exactly as before.
if hasattr(vect_tfidf, "get_feature_names_out"):
    feature_names = vect_tfidf.get_feature_names_out().tolist()
else:
    feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 16
['ate', 'blue', 'bought', 'bug', 'cat', 'dark', 'ha', 'jessi', 'jose', 'meow', 'mous', 'onc', 'saw', 'store', 'went', 'white']
Vocabulary content:
 {'jose': 8, 'bought': 2, 'dark': 5, 'blue': 1, 'mous': 10, 'white': 15, 'cat': 4, 'ate': 0, 'store': 13, 'jessi': 7, 'went': 14, 'bug': 3, 'saw': 12, 'meow': 9, 'onc': 11, 'ha': 6}


In [85]:
# build the document-term matrix (DTM) over the stemmed vocabulary
X_train_tfidf_dtm = vect_tfidf.transform(texts)

In [86]:
# show the DTM as a DataFrame: one row per document, one column per stemmed term
pd.DataFrame(X_train_tfidf_dtm.toarray(), columns=feature_names)

Unnamed: 0,ate,blue,bought,bug,cat,dark,ha,jessi,jose,meow,mous,onc,saw,store,went,white
0,0.0,0.48071,0.48071,0.0,0.0,0.48071,0.0,0.0,0.48071,0.0,0.275087,0.0,0.0,0.0,0.0,0.0
1,0.0,0.433251,0.433251,0.0,0.0,0.433251,0.0,0.0,0.433251,0.0,0.247928,0.0,0.0,0.0,0.0,0.433251
2,0.598821,0.0,0.0,0.0,0.511853,0.0,0.0,0.0,0.0,0.0,0.342676,0.0,0.0,0.511853,0.0,0.0
3,0.258538,0.0,0.0,0.258538,0.0,0.0,0.0,0.775614,0.0,0.0,0.147949,0.0,0.311459,0.22099,0.311459,0.0
4,0.0,0.0,0.0,0.63957,0.0,0.0,0.0,0.0,0.0,0.63957,0.182997,0.385243,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.787019,0.0,0.0,0.0,0.0,0.306914,0.351263,0.0,0.0,0.26234,0.0,0.306914
6,0.0,0.0,0.0,0.0,0.479185,0.0,0.675356,0.560603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We can use the document-term matrix above as input for further analysis.