In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
#### CORPUS

CORPUS = [
    'The sky is blue!',
    'Sky is blue and sky is beautiful',
    'The beautiful sky is so blue today',
    'I love blue shoes'
]

In [3]:
def bow_extractor(corpus, ngram_range=(1,1)):
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [4]:
import pandas as pd

def display_features(features, feature_names):
    df = pd.DataFrame(data=features, columns=feature_names)
    return df

In [5]:
bow_vectorizer, bow_features = bow_extractor(CORPUS)

In [6]:
bow_feature_names = bow_vectorizer.get_feature_names()#
bow_dense_features = bow_features.todense()

In [7]:
display_features(bow_dense_features, bow_feature_names)

Unnamed: 0,and,beautiful,blue,is,love,shoes,sky,so,the,today
0,0,0,1,1,0,0,1,0,1,0
1,1,1,1,2,0,0,2,0,0,0
2,0,1,1,1,0,0,1,1,1,1
3,0,0,1,0,1,1,0,0,0,0


In [8]:
### TF_IDF

def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2', smooth_idf=True, use_idf=True)
    tf_idf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tf_idf_matrix

In [9]:
import numpy as np

feature_names = bow_vectorizer.get_feature_names()
tf_idf_transformer, tf_idf_matrix = tfidf_transformer(bow_features) #from before
features = np.round(tf_idf_matrix.todense(), 2)
display_features(features, feature_names)

Unnamed: 0,and,beautiful,blue,is,love,shoes,sky,so,the,today
0,0.0,0.0,0.4,0.49,0.0,0.0,0.49,0.0,0.6,0.0
1,0.44,0.35,0.23,0.56,0.0,0.0,0.56,0.0,0.0,0.0
2,0.0,0.38,0.25,0.31,0.0,0.0,0.31,0.48,0.38,0.48
3,0.0,0.0,0.35,0.0,0.66,0.66,0.0,0.0,0.0,0.0


In [10]:
### How it works
import scipy.sparse as sp
from numpy.linalg import norm

In [11]:
feature_names = bow_vectorizer.get_feature_names()
feature_names

['and',
 'beautiful',
 'blue',
 'is',
 'love',
 'shoes',
 'sky',
 'so',
 'the',
 'today']

In [12]:
## compute term frequency
tf = bow_features.todense()
print(tf)

[[0 0 1 1 0 0 1 0 1 0]
 [1 1 1 2 0 0 2 0 0 0]
 [0 1 1 1 0 0 1 1 1 1]
 [0 0 1 0 1 1 0 0 0 0]]


In [13]:
tf = np.array(tf, dtype="float64")

In [14]:
display_features(tf, feature_names)

Unnamed: 0,and,beautiful,blue,is,love,shoes,sky,so,the,today
0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,1.0,1.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0
2,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [15]:
## build document frequncy matrix
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
df = 1+df ###smoothing

display_features([df], feature_names) #document frequencies

Unnamed: 0,and,beautiful,blue,is,love,shoes,sky,so,the,today
0,2,3,5,4,2,2,4,2,3,2


In [16]:
total_docs = 1+len(CORPUS)
idf = 1.0+np.log(float(total_docs)/df)

In [17]:
display_features([np.round(idf,2)], feature_names) #Invese document frequencies

Unnamed: 0,and,beautiful,blue,is,love,shoes,sky,so,the,today
0,1.92,1.51,1.0,1.22,1.92,1.92,1.22,1.92,1.51,1.92


In [18]:
### compute idf diagonal matrix

total_features = bow_features.shape[1]
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
idf = idf_diag.todense()

np.round(idf,2)

array([[1.92, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 1.51, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 1.22, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.92, 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 1.92, 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.22, 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.92, 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.51, 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.92]])

In [19]:
## recall

from pprint import pprint
pprint(tf)
print()
pprint(idf)



array([[0., 0., 1., 1., 0., 0., 1., 0., 1., 0.],
       [1., 1., 1., 2., 0., 0., 2., 0., 0., 0.],
       [0., 1., 1., 1., 0., 0., 1., 1., 1., 1.],
       [0., 0., 1., 0., 1., 1., 0., 0., 0., 0.]])

matrix([[1.91629073, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 1.51082562, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 1.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 1.22314355, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 1.91629073,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         1.91629073, 0.        , 0.   

In [20]:
### compute tf*idf

tfidf = tf*idf

display_features(np.round(tfidf,2), feature_names)

Unnamed: 0,and,beautiful,blue,is,love,shoes,sky,so,the,today
0,0.0,0.0,1.0,1.22,0.0,0.0,1.22,0.0,1.51,0.0
1,1.92,1.51,1.0,2.45,0.0,0.0,2.45,0.0,0.0,0.0
2,0.0,1.51,1.0,1.22,0.0,0.0,1.22,1.92,1.51,1.92
3,0.0,0.0,1.0,0.0,1.92,1.92,0.0,0.0,0.0,0.0


In [21]:
###compute l2norms

norms = norm(tfidf, axis=1)
np.round(norms, 2) #norm for each document

array([2.5 , 4.35, 3.99, 2.89])

In [22]:
## normalized td-idf
norm_tfidf = tfidf / norms[:,None]
norm_tfidf

matrix([[0.        , 0.        , 0.39921021, 0.48829139, 0.        ,
         0.        , 0.48829139, 0.        , 0.60313701, 0.        ],
        [0.44051607, 0.34730793, 0.22987956, 0.5623514 , 0.        ,
         0.        , 0.5623514 , 0.        , 0.        , 0.        ],
        [0.        , 0.37887218, 0.25077161, 0.30672968, 0.        ,
         0.        , 0.30672968, 0.48055132, 0.37887218, 0.48055132],
        [0.        , 0.        , 0.34618161, 0.        , 0.66338461,
         0.66338461, 0.        , 0.        , 0.        , 0.        ]])

In [23]:
display_features(np.round(norm_tfidf,2), feature_names)

Unnamed: 0,and,beautiful,blue,is,love,shoes,sky,so,the,today
0,0.0,0.0,0.4,0.49,0.0,0.0,0.49,0.0,0.6,0.0
1,0.44,0.35,0.23,0.56,0.0,0.0,0.56,0.0,0.0,0.0
2,0.0,0.38,0.25,0.31,0.0,0.0,0.31,0.48,0.38,0.48
3,0.0,0.0,0.35,0.0,0.66,0.66,0.0,0.0,0.0,0.0


In [24]:
### Same computation but with TfIdfVecoriter

from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_extractor(corpus, ngram_range=(1,1)):
    vectorizer = TfidfVectorizer(min_df=1, norm="l2", smooth_idf=True, use_idf=True, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [25]:
tfidf_vect, tfidf_features = tfidf_extractor(CORPUS)

In [26]:
display_features(np.round(tfidf_features.todense(), 2), feature_names)

Unnamed: 0,and,beautiful,blue,is,love,shoes,sky,so,the,today
0,0.0,0.0,0.4,0.49,0.0,0.0,0.49,0.0,0.6,0.0
1,0.44,0.35,0.23,0.56,0.0,0.0,0.56,0.0,0.0,0.0
2,0.0,0.38,0.25,0.31,0.0,0.0,0.31,0.48,0.38,0.48
3,0.0,0.0,0.35,0.0,0.66,0.66,0.0,0.0,0.0,0.0


### Fortgeschrittene Methoden

In [27]:
import gensim 
import nltk

In [28]:
new_doc = ['loving this blue sky today']

In [29]:
TOKENIZED_CORPUS = [nltk.word_tokenize(sentence) for sentence in CORPUS]

tokenized_new_doc = [nltk.word_tokenize(sentence) for sentence in new_doc]

print(TOKENIZED_CORPUS)
print(tokenized_new_doc)

[['The', 'sky', 'is', 'blue', '!'], ['Sky', 'is', 'blue', 'and', 'sky', 'is', 'beautiful'], ['The', 'beautiful', 'sky', 'is', 'so', 'blue', 'today'], ['I', 'love', 'blue', 'shoes']]
[['loving', 'this', 'blue', 'sky', 'today']]


In [30]:
### word2vec model with gensim
model = gensim.models.Word2Vec(TOKENIZED_CORPUS, size=10, window=10, min_count=2, sample=1e-3)

In [32]:
model.corpus_total_words

23

In [33]:
model.corpus_count #number of documents

4

In [39]:
model['sky']

  """Entry point for launching an IPython kernel.


array([-0.01578772,  0.02409793,  0.03717836, -0.00491997,  0.00092736,
       -0.01574175, -0.01543428,  0.04089389, -0.00490058,  0.04396923],
      dtype=float32)

In [48]:
words = model.wv.index2word
words

['is', 'blue', 'sky', 'The', 'beautiful']

### Metrics

In [49]:
from sklearn import metrics
import numpy as np
import pandas as pd
from collections import Counter

In [50]:
actual_labels = ['spam', 'ham', 'spam', 'spam', 'spam','ham',  
                 'ham', 'spam', 'ham',  'spam','spam', 'ham', 'ham',  
                 'ham',  'spam','ham',  'ham', 'spam', 'spam', 'ham']

In [51]:
predicted_labels = ['spam', 'spam', 'spam', 'ham',  'spam','spam', 'ham',  
                    'ham',  'spam', 'spam','ham',  'ham',  
                    'spam', 'ham',  'ham','ham',  'spam', 'ham',  
                    'spam', 'spam']

In [52]:
ac = Counter(actual_labels)
pl = Counter(predicted_labels)

In [53]:
ac.most_common()

[('spam', 10), ('ham', 10)]

In [54]:
pl.most_common()

[('spam', 11), ('ham', 9)]

In [56]:
cm = metrics.confusion_matrix(y_true=actual_labels,
                         y_pred=predicted_labels,
                                labels=['spam','ham'])

In [60]:
print(pd.DataFrame(data=cm, 
                   columns=pd.MultiIndex(levels=[['Predicted:'],
                                                 ['spam','ham']], 
                                         codes=[[0,0],[0,1]]), 
                   index=pd.MultiIndex(levels=[['Actual:'],
                                               ['spam','ham']], 
                    codes=[[0,0],[0,1]])))

             Predicted:    
                   spam ham
Actual: spam          5   5
        ham           6   4


In [61]:
positive_class = 'spam'

true_positive = 5.
false_positive = 6.
false_negative = 5.
true_negative = 4.

accuracy = np.round(
                metrics.accuracy_score(y_true=actual_labels,
                                       y_pred=predicted_labels),2)
accuracy_manual = np.round(
                    (true_positive + true_negative) /
                      (true_positive + true_negative +
false_negative + false_positive),2)

In [63]:
print('Accuracy:', accuracy)
print ('Manually computed accuracy:', accuracy_manual )

Accuracy: 0.45
Manually computed accuracy: 0.45


In [64]:
precision = np.round(
                metrics.precision_score(y_true=actual_labels,
                                        y_pred=predicted_labels,
                                        pos_label=positive_class),2)
precision_manual = np.round(
                        (true_positive) /
                        (true_positive + false_positive),2)
print ('Precision:', precision)
print ('Manually computed precision:', precision_manual)

Precision: 0.45
Manually computed precision: 0.45


In [65]:
recall = np.round(
            metrics.recall_score(y_true=actual_labels,
                                 y_pred=predicted_labels,
                                 pos_label=positive_class),2)
recall_manual = np.round(
                    (true_positive) /
                    (true_positive + false_negative),2)
print ('Recall:', recall)
print ('Manually computed recall:', recall_manual)


Recall: 0.5
Manually computed recall: 0.5


In [66]:
f1_score = np.round(
                metrics.f1_score(y_true=actual_labels,
                                 y_pred=predicted_labels,
                                 pos_label=positive_class),2) 
f1_score_manual = np.round(
                    (2 * precision * recall) /
                    (precision + recall),2)
print ('F1 score:', f1_score)
print ('Manually computed F1 score:', f1_score_manual )

F1 score: 0.48
Manually computed F1 score: 0.47
