In [1]:
#import logging
#import codecs
#import glob
#import logging
#import multiprocessing
import os
import pprint
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn

# More  document features


We will denote by

- $W= \{w_1, \dots, w_D\}$ the set of words used to make the representations.
- $X$ our corpus of documents.
- $X_w$ the set of documents that contain word $w$. 

### Bag of words vector  (or `tf` vector)


- The bag of words representation for a document $x$ given a vocabulary $W$, or the term frequency vector **$\text{tf}(x;W)$**, is defined as 

$$
\text{tf}(x;W) = \left( \#\{w_1| w_1 \in x\}, \dots, \#\{w_D| w_D \in x\}\right)
$$



### Term frequency Inverse Document frequency ( `tf * idf`)

The objective of tf-idf representation is to emphasize the most relevant words of the documents. We want to emphasize:

- Words that appear **frequently in the document**: term frequency 
- Words that appear **rarely in the corpus**: inverse document frequency

#### Definition of the feature vectors


- The **$\text{tf}(x;W)$** vector for a document $x$ is defined as 

$$
\text{tf}(x;W) = \left( \#\{w_1| w_1 \in x\}, \dots, \#\{w_D| w_D \in x\}\right)
$$

- The **$\text{idf}(W; X)$** vector is defined as 

**$$\text{idf}(W; X) = \left( \text{idf}(w_1; X), \dots, \text{idf}(w_D; X)\right)$$** 
   
$\,\,\,\,\,\,\,$ A component of the feature for word $w \in W$ in the corpus $X$ is defined as 

$$
\text{idf}(w; X) = \log\left(\frac{|X|}{1+|X_{w}|}\right)
$$

$\,\,\,\,\,\,\,$Which simply means the full vector is 
$$
\text{idf}(W; X) = \left( \log\left(\frac{|X|}{1+|X_{w_1}|}\right), \dots, \log\left(\frac{|X|}{1+|X_{w_D}|}\right) \right)
$$

- The tfidf vector for a document $x$ is the element-wise product $\text{tf}(x; W) * \text{idf}(W; X)$

#### Observations

- If a word appears in only a few documents, the idf vector will increase its weight.

- If a word appears in a lot of documents, the idf vector will decrease its weight.

#### `sklearn.feature_extraction.text.TfidfVectorizer`

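- Notice that the implementation in sklearn already prevents zero divisions by default. This happens if `smooth_idf=True`, in which case sklearn computes $\text{idf}(w; X) = \log\left(\frac{1+|X|}{1+|X_{w}|}\right) + 1$, which differs slightly from the definition above.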

- By default the tfidf will only use words since `ngram_range=(1, 1)`. But this can be changed to allow n-grams in the feature vector components.

#### Example

Let us assume we have a corpus with one million documents

- Consider a word appearing in 100 documents:

$$\log\left(\frac{1\,000\,000}{1 + 100} \right) = 9.200$$

- Consider a word appearing in 100 000 documents:

$$\log\left(\frac{1\,000\,000}{1 + 100\,000} \right) = 2.303$$



In [9]:
import scipy as sp

def build_vocabulary(corpus, splitter):
    """Collect the vocabulary of a corpus and the per-word document counts.

    Parameters
    ----------
    corpus : iterable of str
        Documents to scan; each one is lower-cased before being tokenized.
    splitter : compiled regular expression
        Its ``findall`` method is used to extract the tokens of a document.

    Returns
    -------
    vocabulary : set of str
        Every distinct token found in the corpus.
    X_w : dict
        Maps each token to the number of documents that contain it.
    """
    vocabulary = set()
    X_w = dict()
    for document in corpus:
        # A set, so that a word is counted at most once per document.
        words = set(splitter.findall(document.lower()))
        # In-place update: ``vocabulary.union(words)`` would copy the whole
        # (growing) vocabulary once per document.
        vocabulary.update(words)
        for w in words:
            X_w[w] = X_w.get(w, 0) + 1

    return vocabulary, X_w

def term_frequency(document, word_to_ind, splitter, 
                   normalize=True, word_inds=False):
    """Build the sparse term-frequency row vector of a single document.

    Parameters
    ----------
    document : str
        Text to vectorize; it is lower-cased before being tokenized.
    word_to_ind : dict
        Maps each vocabulary word to its column index.
    splitter : compiled regular expression
        Its ``findall`` method is used to extract the tokens.
    normalize : bool, default True
        If True, the counts are scaled by ``1 / sp.sparse.linalg.norm(tf)``.
        A vector whose norm is zero is left unscaled instead of being
        multiplied by ``1 / 0``.
    word_inds : bool, default False
        Accepted for backward compatibility only; it has no effect on the
        result (the previous implementation returned the same value for
        both settings).

    Returns
    -------
    scipy sparse matrix of shape ``(1, len(word_to_ind))``
        The raw ``lil_matrix`` of counts when ``normalize`` is False,
        otherwise the result of ``tf.multiply(scale)``.
    """
    n_features = len(word_to_ind)
    tf = sp.sparse.lil_matrix((1, n_features), dtype=float)

    for w in splitter.findall(document.lower()):
        ind = word_to_ind.get(w)
        if ind is None:
            # Token outside the vocabulary: it has no column, so skip it
            # instead of failing with a KeyError.
            continue
        tf[0, ind] += 1

    if not normalize:
        return tf

    norm = sp.sparse.linalg.norm(tf)
    # Guard against documents without any known token (norm == 0).
    scale = 1.0 if norm == 0 else 1 / norm
    return tf.multiply(scale)

In [3]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

In [4]:
splitter = re.compile('(?u)\\b\\w\\w+\\b')
%time vocabulary, X_w = build_vocabulary(newsgroups_train.data, splitter)

word_to_ind = {v:i for i,v in enumerate(vocabulary)}
ind_to_word = {v:k for k,v in word_to_ind.items()}

CPU times: user 27.2 s, sys: 1.14 s, total: 28.3 s
Wall time: 28.5 s


In [10]:
%time tf = term_frequency(newsgroups_train.data[0],\
                          word_to_ind, splitter, word_inds=False)

CPU times: user 1.42 ms, sys: 292 µs, total: 1.71 ms
Wall time: 1.46 ms


## Verify that the term frequency is OK, compare with sklearn

In [61]:
# NOTE(review): `tfidf_sk` is only created by a cell that appears further down in
# this notebook, so (as far as this notebook shows) this cell cannot run on a fresh
# kernel executed top to bottom.
np.sort(list(tfidf_sk.vocabulary_.values()))

array([     0,      1,      2, ..., 130104, 130105, 130106])

In [11]:
# Vectorizer with idf weighting switched off (use_idf=False); it is used below to
# cross-check the hand-written term_frequency() against sklearn.
tfidf_sk = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=False,
                                                           smooth_idf=False, 
                                                           sublinear_tf=False)

%time tfidf_sk.fit(newsgroups_train.data)

# Reverse lookup built from the fitted vocabulary_: column index -> word.
inverse_vocabulary_ = {v: k for k, v in tfidf_sk.vocabulary_.items()}

CPU times: user 1.98 s, sys: 25.7 ms, total: 2.01 s
Wall time: 2.01 s


In [12]:
%time x_sk = tfidf_sk.transform([newsgroups_train.data[0]])

CPU times: user 366 µs, sys: 17 µs, total: 383 µs
Wall time: 384 µs


In [13]:
np.isclose(tf.sum(), x_sk.sum())

True

In [14]:
# Map the non-zero column indices of each vector back to words and check that both
# implementations found the same set of words in the document.
words_x_own = [ind_to_word[k] for k in tf.nonzero()[1]]
words_x_sk = [inverse_vocabulary_[k] for k in x_sk.nonzero()[1]]
set(words_x_own) == set(words_x_sk)

True

## Exercise: Generate tfidf and compare with sklearn 

Finish the `compute_idf` function

In [15]:
def compute_idf(X_w, word_to_ind, n_documents):
    """Compute the idf weight of every vocabulary word as a sparse row vector.

    The weight of a word ``w`` is ``log((1 + n_documents) / (1 + X_w[w])) + 1``.
    Columns of words that have no entry in ``X_w`` keep the value 0.

    Parameters
    ----------
    X_w : dict
        Maps a word to the number of documents that contain it.
    word_to_ind : dict
        Maps a word to its column index in the output.
    n_documents : int
        Number of documents in the corpus.

    Returns
    -------
    scipy.sparse.csr_matrix of shape ``(1, len(word_to_ind))``
    """
    weights = np.zeros([1, len(word_to_ind)])

    for word, doc_count in X_w.items():
        ratio = (1 + n_documents) / (1 + doc_count)
        weights[0, word_to_ind[word]] = np.log(ratio) + 1

    return sp.sparse.csr_matrix(weights)

In [16]:
%%time 
# lil_matrix is more efficient.
tf = term_frequency(newsgroups_train.data[0], word_to_ind,\
                    splitter, normalize=False,word_inds=False)

idf = compute_idf(X_w,word_to_ind, len(newsgroups_train.data))

CPU times: user 187 ms, sys: 3.42 ms, total: 190 ms
Wall time: 190 ms


In [17]:
idf.max(), idf.min()

(9.640737377507692, 1.0)

In [18]:
# Number of documents in the corpus. `X_w` maps each word to its document count,
# so `len(X_w)` would be the vocabulary size rather than the corpus size.
n_documents = len(newsgroups_train.data)

In [19]:
# Weight the raw term counts of the document by the idf row vector.
tfidf = tf.multiply(idf)
# Rescale by the vector's norm so that the result has unit norm.
tfidf = tfidf/sp.sparse.linalg.norm(tfidf)
# Displayed as a sanity check: the norm after rescaling.
sp.sparse.linalg.norm(tfidf)

1.0

In [20]:
tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
tfidf_vectorizer.fit(newsgroups_train.data);

In [21]:
tfidf_sklearn = tfidf_vectorizer.transform(newsgroups_train.data[0:1])

In [22]:
tfidf.data.dtype, tfidf_sklearn.data.dtype

(dtype('float64'), dtype('float64'))

In [23]:
# NOTE(review): only the *sums* of the two vectors are compared here, not their
# individual components, so this is a necessary but not sufficient equality check.
print(tfidf.sum(), tfidf_sklearn.sum())
print("\nsklearn tfidf and our tfidf are the same:",
      np.isclose(tfidf_sklearn.sum(),tfidf.sum()))

7.697815233022509 7.697815233022508

sklearn tfidf and our tfidf are the same: True


# tfidf in data 


Now we will load a dataframe containing text and compute its tf-idf representation.

In [25]:
people = pd.read_csv('../data/people_wiki_small.csv')

In [26]:
people.head()

Unnamed: 0.1,Unnamed: 0,URI,name,text
0,0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [27]:
people.dtypes

Unnamed: 0     int64
URI           object
name          object
text          object
dtype: object

In [28]:
people["text"][3]

'franz rottensteiner born in waidmannsfeld lower austria austria on 18 january 1942 is an austrian publisher and critic in the fields of science fiction and the fantasticrottensteiner studied journalism english and history at the university of vienna receiving his doctorate in 1969 he served about fifteen years as librarian and editor at the sterreichisches institut fr bauforschung in vienna in addition he produced a number of translations into german of leading sf authors including herbert w franke stanislaw lem philip k dick kobo abe cordwainer smith brian w aldiss and the strugatski brothersin 1973 his new york anthology view from another shore of european science fiction introduced a number of continental authors to the englishreading public some of the authors in the work are stanislaw lem josef nesvadba gerard klein and jeanpierre andrevonthe year 1975 saw the start of his series die phantastischen romane for seven years it republished works of both lesser and betterknown writers

## Exercise: get_text_given_name

Build a function `get_text_given_name` that returns the text of a particular person from the data.

First try to think about how you can select the text for Barack Obama...

In [29]:
obama = people[people['name'] == 'Barack Obama']

In [30]:
people[people['name'] == 'Barack Obama']["text"]

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

In [31]:
def get_text(df, boolean_series):
    """Return the ``text`` value of the first row selected by a boolean mask.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column.
    boolean_series : boolean mask
        Row selector, used as ``df[boolean_series]``.

    Returns
    -------
    The ``text`` entry of the first selected row.

    Raises
    ------
    IndexError
        If the mask selects no row.
    """
    # Positional access to the first match. Looking the row up again by its
    # index label (``df.loc[label]``) would return several rows when index
    # labels are not unique.
    return df[boolean_series].text.iloc[0]

def get_text_given_name(df, name):
    """Return the ``text`` of the first row whose ``name`` column equals ``name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``name`` and ``text`` columns.
    name : str
        Value to look for in the ``name`` column (exact match).

    Returns
    -------
    The ``text`` entry of the first matching row.

    Raises
    ------
    IndexError
        If no row has the requested name.
    """
    matches = df[df["name"] == name]
    # Positional access to the first match. Looking the row up again by its
    # index label (``df.loc[label]``) would return several rows when index
    # labels are not unique.
    return matches.text.iloc[0]


In [32]:
get_text_given_name(people, "Barack Obama")

'barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campaign in 2007 and afte

#### End of the exercise

Now we will use tf-idf to get vector representations of the descriptions of Barack Obama and Emma Watson.

In [33]:
import sklearn
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()

In [34]:
import numpy as np
import scipy as scp
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# Fit the vectorizer on just these two texts and unpack the result into one vector
# per text. NOTE(review): neither name is used again in the cells visible here, and
# the vectorizer is re-fitted on the whole corpus in the next cell.
obama_vec, emma_vec = tfidf.fit_transform([get_text_given_name(people, "Barack Obama"), 
                                           get_text_given_name(people, "Emma Watson")])

In [36]:
%%time
X_tfidf = tfidf.fit_transform(people["text"])

CPU times: user 7.6 s, sys: 132 ms, total: 7.74 s
Wall time: 7.75 s


## Exercise:

Build the function `get_closest_k_names(tfidf_vec, X_tfidf, k=10)` that returns the names of the `k` people whose texts are closest to the query text.

Try to find the closest names to:

```
"Brad Pitt"
"Angelina Jolie"
"Barack Obama"
"Bill Clinton"
"Emma Watson"
```

Do they make any sense? (You might want to check Wikipedia.)

You will need to compute the tfidf for each individual in the list.

In [37]:
# Query vectors: transform each person's text with the vectorizer that was fitted
# on the full `people["text"]` corpus, so they share the columns of X_tfidf.
brad_pitt_tfidf = tfidf.transform([get_text_given_name(people, "Brad Pitt")])
angelina_tfidf  = tfidf.transform([get_text_given_name(people, "Angelina Jolie")])
obama_tfidf     = tfidf.transform([get_text_given_name(people, "Barack Obama")])
bill_tfidf      = tfidf.transform([get_text_given_name(people, "Bill Clinton")])
emma_tfidf      = tfidf.transform([get_text_given_name(people, "Emma Watson")])

In [38]:
def get_closest_k_names(tfidf_vec, X_tfidf, k=10, names=None):
    """Return the names whose tf-idf rows are most similar to a query vector.

    Parameters
    ----------
    tfidf_vec : sparse matrix with a single row
        tf-idf representation of the query text.
    X_tfidf : sparse matrix
        tf-idf matrix of the corpus, one row per person.
    k : int, default 10
        Number of names to return.
    names : pandas.Series, optional
        Names aligned positionally with the rows of ``X_tfidf``. Defaults to
        the notebook-level ``people["name"]``.

    Returns
    -------
    pandas.Series
        ``k`` names ordered by increasing cosine similarity (closest last).
        The single highest-scoring row is excluded — presumably because the
        query text is itself a row of ``X_tfidf``; verify before querying
        with a text that is not part of the corpus.
    """
    if names is None:
        # Backward-compatible default: the dataframe loaded earlier in the notebook.
        names = people["name"]

    # Row positions of X_tfidf sorted by increasing similarity to the query.
    order = np.argsort(cosine_similarity(tfidf_vec, X_tfidf))[0]

    # argsort yields positions, not index labels, so select with .iloc.
    return names.iloc[order[-k-1:-1]]

In [39]:
get_closest_k_names(emma_tfidf, X_tfidf, k=10)

29609         Bettina Devin
8504     Margaret C. Snyder
3633        Priyanka Chopra
26909      Pat Studdy-Clift
11666            Jane Fonda
34756          Maggie Smith
35902     Natashia Williams
8973           John Granger
17821         Emma Thompson
3115           Stuart Craig
Name: name, dtype: object

In [40]:
get_closest_k_names(bill_tfidf, X_tfidf, k=10)

19416         Donnie Fowler
11723           Howard Dean
37166    Richard L. Barclay
9517          Deval Patrick
28453            Jill Alper
35817          Barack Obama
2092     Richard Blumenthal
28447        George W. Bush
4096       Sheffield Nelson
25658           Dick Morris
Name: name, dtype: object

In [41]:
get_closest_k_names(angelina_tfidf, X_tfidf, k=10)

24263      Jessica Lange
11666         Jane Fonda
11156      Anne Hathaway
28076          Amy Adams
33529     Cate Blanchett
24426          Brad Pitt
21644       Jodie Foster
16242       Meryl Streep
34756       Maggie Smith
29009    Barbara Hershey
Name: name, dtype: object