In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
from sklearn import metrics

import scipy.sparse as sp
import numpy as np
import nltk
import gensim
import pandas as pd

In [2]:
def bow_extractor(corpus, ngram_range=(1,1)):
    """Fit a bag-of-words model on `corpus`.

    A CountVectorizer is created with min_df=1 and the requested
    `ngram_range`, fitted on `corpus`, and returned together with the
    matrix produced by its fit_transform call.

    Returns:
        (vectorizer, features) -- the fitted vectorizer and the count features.
    """
    count_model = CountVectorizer(min_df=1, ngram_range=ngram_range)
    return count_model, count_model.fit_transform(corpus)


def tfidf_transformer(bow_matrix):
    """Fit a TfidfTransformer on an existing bag-of-words matrix.

    The transformer is configured with L2 normalisation, smoothed idf and
    idf weighting enabled.

    Returns:
        (transformer, tfidf_matrix) -- the fitted transformer and the result
        of its fit_transform call on `bow_matrix`.
    """
    settings = dict(norm='l2', smooth_idf=True, use_idf=True)
    weigher = TfidfTransformer(**settings)
    return weigher, weigher.fit_transform(bow_matrix)


def tfidf_extractor(corpus, ngram_range=(1,1)):
    """Fit a TfidfVectorizer directly on raw text.

    The vectorizer uses min_df=1, L2 normalisation, smoothed idf, idf
    weighting enabled, and the requested `ngram_range`.

    Returns:
        (vectorizer, features) -- the fitted vectorizer and the result of its
        fit_transform call on `corpus`.
    """
    settings = dict(min_df=1,
                    norm='l2',
                    smooth_idf=True,
                    use_idf=True,
                    ngram_range=ngram_range)
    tfidf_model = TfidfVectorizer(**settings)
    return tfidf_model, tfidf_model.fit_transform(corpus)


def average_word_vectors(words, model, vocabulary, num_features):
    """Average the word vectors of the tokens in `words`.

    Only tokens present in `vocabulary` contribute; each occurrence counts.
    Vectors are read from `model.wv[token]`.

    Returns:
        A float64 array of length `num_features`; all zeros when no token of
        `words` is in `vocabulary`.
    """
    total = np.zeros((num_features,), dtype="float64")
    known_tokens = [token for token in words if token in vocabulary]

    for token in known_tokens:
        total = np.add(total, model.wv[token])

    if not known_tokens:
        # Nothing to average: hand back the zero vector unchanged.
        return total
    return np.divide(total, float(len(known_tokens)))


def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word vector per tokenized document in `corpus`.

    The known-word set is taken once from `model.wv.index_to_key`, then each
    document is delegated to `average_word_vectors`.

    Returns:
        np.array built from the per-document vectors, in corpus order.
    """
    known_words = set(model.wv.index_to_key)
    rows = []
    for tokens in corpus:
        rows.append(average_word_vectors(tokens, model, known_words, num_features))
    return np.array(rows)


def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, num_features):
    """Return the tf-idf weighted average of the word vectors of `words`.

    Args:
        words: tokens of one document (each occurrence contributes).
        tfidf_vector: the document's tf-idf row, indexed as
            ``tfidf_vector[0, column]``.
        tfidf_vocabulary: mapping from token to its column in `tfidf_vector`.
        model: object exposing ``model.wv[token]`` and
            ``model.wv.index_to_key``.
        num_features: length of the returned vector.

    Returns:
        A float64 array of length `num_features`. It is all zeros when no
        token is known to the model or when the accumulated weight is zero.
    """
    # Materialise once so that a one-shot iterable can be walked twice.
    words = list(words)

    # Look up each distinct token's tf-idf weight a single time.  The column
    # must be compared against None: column 0 is a valid index, and a plain
    # truthiness test would silently give that term a weight of zero.
    word_tfidf_map = {}
    for word in words:
        if word in word_tfidf_map:
            continue
        column = tfidf_vocabulary.get(word)
        word_tfidf_map[word] = tfidf_vector[0, column] if column is not None else 0

    feature_vector = np.zeros((num_features,), dtype="float64")
    vocabulary = set(model.wv.index_to_key)
    wts = 0.
    for word in words:
        if word in vocabulary:
            weight = word_tfidf_map[word]
            feature_vector = np.add(feature_vector, weight * model.wv[word])
            wts = wts + weight
    if wts:
        # Normalise by the total weight; skipped when it is zero.
        feature_vector = np.divide(feature_vector, wts)

    return feature_vector


def tfidf_weighted_averaged_word_vectorizer(corpus, tfidf_vectors,
                                   tfidf_vocabulary, model, num_features):
    """Build one tf-idf weighted word vector per document.

    Documents in `corpus` are paired positionally with the rows obtained by
    iterating `tfidf_vectors`; each pair is delegated to
    `tfidf_wtd_avg_word_vectors`.

    Returns:
        np.array built from the per-document vectors, in corpus order.
    """
    rows = []
    for tokens, doc_weights in zip(corpus, tfidf_vectors):
        rows.append(tfidf_wtd_avg_word_vectors(tokens, doc_weights, tfidf_vocabulary,
                                               model, num_features))
    return np.array(rows)

In [3]:
# Start with a 'toy' corpus: four short, lower-case sentences that serve as
# the training documents for every feature extractor below
CORPUS = [
    'the sky is blue',
    'sky is blue and sky is beautiful',
    'the beautiful sky is so blue',
    'i love blue cheese'
]

# Use new_doc as test dataset: a single sentence that is not part of CORPUS
new_doc = ['loving this blue sky today']

In [4]:
def display_features(features, feature_names):
    """Print `features` as a table whose columns are labelled `feature_names`."""
    print(pd.DataFrame(features, columns=feature_names))

In [5]:
# Pass CORPUS to the simplest bow extractor
bow_vectorizer, bow_features = bow_extractor(CORPUS)
# The count matrix is sparse, so densify it for printing.  toarray() gives a
# plain ndarray; todense() would give np.matrix, which NumPy no longer recommends.
features = bow_features.toarray()
print(features)

[[0 0 1 0 1 0 1 0 1]
 [1 1 1 0 2 0 2 0 0]
 [0 1 1 0 1 0 1 1 1]
 [0 0 1 1 0 1 0 0 0]]


In [6]:
# Need to extract the same features from our test data too!
# transform() (not fit_transform) reuses the vocabulary learned from CORPUS.
new_doc_features = bow_vectorizer.transform(new_doc)
# Densify to a plain ndarray rather than np.matrix: this value is later passed
# to tfidf_trans.transform(), and recent scikit-learn versions reject np.matrix.
new_doc_features = new_doc_features.toarray()
print(new_doc_features)

[[0 0 1 0 0 0 1 0 0]]


In [7]:
# See which words/tokens these counts are for: the vectorizer's feature names,
# one per column of the count matrices printed above
feature_names = bow_vectorizer.get_feature_names_out()
print(feature_names)

['and' 'beautiful' 'blue' 'cheese' 'is' 'love' 'sky' 'so' 'the']


In [8]:
# Print both the feature names and counts together as a labelled table

# for the training data (`features` is the dense count matrix of CORPUS)
display_features(features, feature_names)

   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   1     0    1   0    1
1    1          1     1       0   2     0    2   0    0
2    0          1     1       0   1     0    1   1    1
3    0          0     1       1   0     1    0   0    0


In [9]:
# for the test data (dense count row of new_doc, same columns as the training table)
display_features(new_doc_features, feature_names)

   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   0     0    1   0    0


In [10]:
# Try the same with tf-idf instead of frequency counts

# Column labels still come from the bow vectorizer, because the tf-idf
# transformer below is fitted on its count matrix
feature_names = bow_vectorizer.get_feature_names_out()

# Use the tfidf_transformer function we defined, then convert to the dense
# form (rounded to 2 decimals) to print the values out.
# NOTE: `features` is rebound here from the raw counts to the rounded tf-idf values.
tfidf_trans, tdidf_features = tfidf_transformer(bow_features)
features = np.round(tdidf_features.todense(), 2)
display_features(features, feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [11]:
# Do the same for the test document: apply the transformer fitted on the
# training counts to the dense count row of new_doc
nd_tfidf = tfidf_trans.transform(new_doc_features)
# Densify and round to 2 decimals for display
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0




In [12]:
# Compute tf-idf scores/vectors ourselves from scratch
# - without using sklearn's TfidfTransformer class
# Column labels for the tables below come from the bow vectorizer.
feature_names = bow_vectorizer.get_feature_names_out()

In [13]:
# Term frequencies are just the bag-of-words counts, held as a dense
# float64 ndarray
tf = bow_features.toarray().astype('float64')

In [14]:
# Check if the term frequencies are as expected (same counts as the bow table, now as floats)
display_features(tf, feature_names)

   and  beautiful  blue  cheese   is  love  sky   so  the
0  0.0        0.0   1.0     0.0  1.0   0.0  1.0  0.0  1.0
1  1.0        1.0   1.0     0.0  2.0   0.0  2.0  0.0  0.0
2  0.0        1.0   1.0     0.0  1.0   0.0  1.0  1.0  1.0
3  0.0        0.0   1.0     1.0  0.0   1.0  0.0  0.0  0.0


In [15]:
# Build the document frequency vector (one entry per vocabulary term).
# In CSC layout the differences between consecutive indptr entries are the
# number of stored entries per column; for this count matrix that is the
# number of documents containing the term.  One is added to every entry,
# matching the 1 added to the document total when the idf is computed.
df = 1 + np.diff(bow_features.tocsc().indptr)

In [16]:
# Check if the document frequencies are as expected
# (df is wrapped in a list so that it is shown as a single table row)
display_features([df], feature_names)

   and  beautiful  blue  cheese  is  love  sky  so  the
0    2          3     5       2   4     2    4   2    3


In [17]:
# Inverse document frequencies: idf = 1 + ln(total_docs / df), where the
# document total carries the same +1 that was added to df
total_docs = len(CORPUS) + 1
idf = np.log(total_docs / df) + 1.0

In [18]:
# Check if the inverse document frequencies are as expected
# (rounded to 2 decimals and wrapped in a list to show a single table row)
display_features([np.round(idf, 2)], feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  1.92       1.51   1.0    1.92  1.22  1.92  1.22  1.92  1.51


In [19]:
# Compute the idf diagonal matrix: a square (total_features x total_features)
# sparse matrix with the idf values on the main diagonal
total_features = bow_features.shape[1]
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
# NOTE(review): `idf` is rebound here from the 1-D idf vector to the dense
# result of todense().  Later cells multiply by this dense `idf`, and
# re-running this cell on its own would pass the dense matrix back into
# spdiags instead of the vector -- recompute the idf vector first.
idf = idf_diag.todense()

In [20]:
# Is the idf diagonal matrix as expected?
# (idf values on the diagonal, zeros elsewhere; rounded to 2 decimals)
print(np.round(idf, 2))

[[1.92 0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   1.51 0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   1.92 0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   1.22 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   1.92 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   1.22 0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   1.92 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   1.51]]


In [21]:
# Compute the full tfidf feature matrix: (documents x terms) times the
# (terms x terms) idf diagonal matrix.
# Use the explicit matrix-product operator: `*` is a matrix product only when
# one operand is np.matrix, and would silently mean element-wise
# multiplication for plain ndarrays.
tfidf = tf @ idf

In [22]:
# Is the tfidf feature matrix what we expected?
# (un-normalised tf * idf values, rounded to 2 decimals)
display_features(np.round(tfidf, 2), feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00   1.0    0.00  1.22  0.00  1.22  0.00  1.51
1  1.92       1.51   1.0    0.00  2.45  0.00  2.45  0.00  0.00
2  0.00       1.51   1.0    0.00  1.22  0.00  1.22  1.92  1.51
3  0.00       0.00   1.0    1.92  0.00  1.92  0.00  0.00  0.00


In [23]:
# Compute the L2 norms: axis=1 gives one norm per row, i.e. per document
norms = norm(tfidf, axis=1)

In [24]:
# Display the L2 norms for each document (rounded to 2 decimals)
print(np.round(norms, 2))

[2.5  4.35 3.5  2.89]


In [25]:
# Compute the 'normalized' tfidf: norms[:, None] reshapes the norms into a
# column so that every row is divided by its own L2 norm
norm_tfidf = tfidf / norms[:, None]

In [26]:
# Check if the final tfidf feature matrix is as expected
# Is it the same as what we got using the TfidfTransformer class of sklearn?
# (compare with the table printed after applying tfidf_transformer to bow_features)
display_features(np.round(norm_tfidf, 2), feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [27]:
# Repeat the manual computation for the test document new_doc.
# Its term frequencies are the bow counts, copied into a float64 ndarray.
nd_tf = np.array(new_doc_features, dtype='float64')

In [28]:
# Next compute tfidf using idf matrix from the train corpus.
# `@` is the explicit matrix product; `*` would mean element-wise
# multiplication unless one operand happens to be np.matrix.
nd_tfidf = nd_tf @ idf
nd_norms = norm(nd_tfidf, axis=1)
# A document with no known terms has norm 0; divide by 1 instead so that its
# row stays all zeros rather than becoming NaN.
nd_norms = np.where(nd_norms == 0, 1.0, nd_norms)
norm_nd_tfidf = nd_tfidf / nd_norms[:, None]

In [29]:
# Check the new_doc tfidf feature vector (L2-normalised, rounded to 2 decimals)
display_features(np.round(norm_nd_tfidf, 2), feature_names)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


In [30]:
# sklearn's TfidfVectorizer provides a transformer to extract tfidf scores directly
# from raw data - avoiding the need for CountVectorizer based bow scores
tfidf_vectorizer, tdidf_features = tfidf_extractor(CORPUS)
# Label the columns with this vectorizer's own feature names, so the table
# stays correct even if its settings ever differ from the bow vectorizer's.
display_features(np.round(tdidf_features.toarray(), 2),
                 tfidf_vectorizer.get_feature_names_out())

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [31]:
# Apply the fitted TfidfVectorizer to the test document (transform only, no
# refit) and label the columns with that vectorizer's own feature names.
nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.toarray(), 2),
                 tfidf_vectorizer.get_feature_names_out())

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


In [32]:
# Move on to word-vector models built with Google's word2vec algorithm via
# the gensim python package.  Both the training corpus and the test document
# are first split into token lists.
TOKENIZED_CORPUS = list(map(nltk.word_tokenize, CORPUS))
tokenized_new_doc = list(map(nltk.word_tokenize, new_doc))

In [33]:
# Model parameters for the NN-based word2vec 'word embeddings':

model = gensim.models.Word2Vec(TOKENIZED_CORPUS, 
                               vector_size=10, # dimension of the word vectors (tens to thousands)
                               window=10, # window size to consider the context of a word
                               min_count=2, # minimum frequency of a word in the whole corpus to be included in vocabulary
                               sample=1e-3, # used to downsample the effects of the occurrence of frequent words
                               seed=1, # explicit RNG seed so the run does not depend on an implicit default
                               workers=1) # a single worker thread removes thread-scheduling jitter between runs

In [34]:
# Averaging word vectors of a document: one row per training document.
# num_features must equal the vector_size the word2vec model was built with (10).
avg_word_vec_features = averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,
                                                 model=model,
                                                 num_features=10)
print(np.round(avg_word_vec_features, 3))

[[ 0.011  0.022 -0.004  0.03  -0.02  -0.032  0.002  0.063 -0.04  -0.02 ]
 [ 0.036  0.011  0.032  0.027 -0.014 -0.032  0.031  0.028 -0.061 -0.059]
 [-0.006  0.016  0.016  0.009 -0.02  -0.029  0.018  0.039 -0.032 -0.025]
 [-0.005  0.002  0.051  0.09  -0.093 -0.071  0.065  0.09  -0.05  -0.038]]


In [35]:
# Same averaging for the tokenized test document, using the model trained on
# the training corpus (tokens unknown to the model are skipped by the helper)
nd_avg_word_vec_features = averaged_word_vectorizer(corpus=tokenized_new_doc,
                                                    model=model,
                                                    num_features=10)
print(np.round(nd_avg_word_vec_features, 3))

[[ 0.034  0.027  0.059  0.049 -0.015 -0.053  0.028  0.074 -0.063 -0.038]]


In [36]:
# Using tfidf weighted average of word vectors in a document.
# The weights are the tf-idf rows produced by tfidf_extractor(CORPUS), and
# `vocab` maps each token to its column in those rows.
corpus_tfidf = tdidf_features
vocab = tfidf_vectorizer.vocabulary_
wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,
                                                                     tfidf_vectors=corpus_tfidf,
                                                                     tfidf_vocabulary=vocab,
                                                                     model=model, 
                                                                     num_features=10)
print(np.round(wt_tfidf_word_vec_features, 3))

[[ 0.006  0.024 -0.011  0.023 -0.015 -0.028 -0.006  0.064 -0.034 -0.012]
 [ 0.049  0.013  0.025  0.027 -0.005 -0.029  0.023  0.027 -0.067 -0.062]
 [-0.013  0.017  0.014  0.    -0.017 -0.026  0.015  0.035 -0.026 -0.021]
 [-0.005  0.002  0.051  0.09  -0.093 -0.071  0.065  0.09  -0.05  -0.038]]


In [37]:
# Same tf-idf weighted averaging for the test document.
# NOTE: `nd_tfidf` is assigned in several earlier cells; this call uses
# whichever value was assigned last (in notebook order, the output of
# tfidf_vectorizer.transform(new_doc)).
nd_wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_new_doc,
                                                                     tfidf_vectors=nd_tfidf,
                                                                     tfidf_vocabulary=vocab,
                                                                     model=model, 
                                                                     num_features=10)
print(np.round(nd_wt_tfidf_word_vec_features, 3))

[[ 0.038  0.029  0.06   0.045 -0.007 -0.051  0.024  0.072 -0.064 -0.039]]
