# This talk covers
* Term Frequency (i.e. vectorized text)
* Stop words
* Term Frequency - Inverse Document Frequency (TF-IDF)
* Negation Handling
* Character NGrams

and includes a demo!

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

## Term frequency (i.e. vectorized text)

<img src="NLP/term-frequency.jpg" style="height: 250px;"/>
_From: https://www.linkedin.com/pulse/term-frequency-inverse-document-sanjay-singh-1_

In [2]:
# A one-document corpus: the single review sentence used for the first examples.
review = "I liked the movie, just not the popcorn."
docs = [review]

In [3]:
# Plain term counts: IDF weighting and normalisation are both switched off.
vectorizer = TfidfVectorizer(norm=None, use_idf=False)
term_counts = vectorizer.fit_transform(docs)

# One row per document, one column per vocabulary term.
# Note: "I" is missing because the default tokenizer omits one-letter words.
pd.DataFrame(term_counts.toarray(), columns=vectorizer.get_feature_names())

Unnamed: 0,just,liked,movie,not,popcorn,the
0,1.0,1.0,1.0,1.0,1.0,2.0


## Stop words

<img src="NLP/stop-words.png" style="height: 200px;"/>
_From: https://www.slideshare.net/tomeksobczak/stopwords-in-search_

In [4]:
# Plain term counts again, this time filtering the built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english', norm=None, use_idf=False)
term_counts = vectorizer.fit_transform(docs)

# Note: "the" and "not" are considered stop words, so they get no column.
pd.DataFrame(term_counts.toarray(), columns=vectorizer.get_feature_names())

Unnamed: 0,just,liked,movie,popcorn
0,1.0,1.0,1.0,1.0


## Term Frequency - Inverse Document Frequency (TF-IDF)

<img src="NLP/tf-idf.png" style="height: 300px;"/>
_Based on image from: http://ccdoc-tecnicasrecuperacioninformacion.blogspot.com/2012/11/frecuencias-y-pesos-de-los-terminos-de.html_

<img src="NLP/tfidf-downward.jpg" style="height: 400px;"/>
_From: http://seocopywriting.com/tf-idf-killed-copywriting-spam/_

In [25]:
# Two reviews that share most words and differ only in the nouns,
# so the effect of document frequency is easy to see.
movie_review = "I liked the movie, just not the popcorn."
song_review = "I liked the song, just not the words."
docs = [movie_review, song_review]

### Without TF-IDF

In [6]:
# Baseline for comparison: stop words removed, IDF weighting disabled,
# so every cell is a plain term count.
vectorizer = TfidfVectorizer(stop_words='english', norm=None, use_idf=False)
term_counts = vectorizer.fit_transform(docs)

# Note: "I" is missing because the default tokenizer omits one-letter words.
pd.DataFrame(term_counts.toarray(),
             columns=vectorizer.get_feature_names())

Unnamed: 0,just,liked,movie,popcorn,song,words
0,1.0,1.0,1.0,1.0,0.0,0.0
1,1.0,1.0,0.0,0.0,1.0,1.0


### With TF-IDF

In [7]:
# Same setup as the baseline, but with IDF weighting enabled.
vectorizer = TfidfVectorizer(stop_words='english', norm=None, use_idf=True)
tfidf_weights = vectorizer.fit_transform(docs)

# Terms present in both documents ("just", "liked") keep weight 1.0, while
# terms unique to one document are boosted to ln(3/2) + 1 ~= 1.405
# (scikit-learn's smoothed IDF with 2 documents and document frequency 1).
# Note: "I" is missing because the default tokenizer omits one-letter words.
pd.DataFrame(tfidf_weights.toarray(),
             columns=vectorizer.get_feature_names())

Unnamed: 0,just,liked,movie,popcorn,song,words
0,1.0,1.0,1.405465,1.405465,0.0,0.0
1,1.0,1.0,0.0,0.0,1.405465,1.405465


## Negation Handling

<img src="NLP/negation-handling.png" style="height: 100px;"/>
_all my own artwork :P_

In [18]:
# Two reviews built from exactly the same words but with opposite meanings:
# only the position of "movie" and "popcorn" relative to "not" differs.
liked_movie = "I liked the movie, just not the popcorn."
liked_popcorn = "I liked the popcorn, just not the movie."
docs = [liked_movie, liked_popcorn]

### Without negation handling

In [19]:
# Plain term counts (note: no TF-IDF) with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english',
                             norm=None, use_idf=False)
term_counts = vectorizer.fit_transform(docs)

# Word order is discarded, so both documents end up with identical rows.
pd.DataFrame(term_counts.toarray(),
             columns=vectorizer.get_feature_names())

Unnamed: 0,just,liked,movie,popcorn
0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0


### With negation handling

In [20]:
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation

# Tokenize each document, let mark_negation append "_NEG" to the tokens
# that follow the negation, then join the tokens back into one string.
negation_marked_docs = []
for doc in docs:
    marked_tokens = mark_negation(word_tokenize(doc))
    negation_marked_docs.append(' '.join(marked_tokens))
negation_marked_docs

# Note: "the_NEG" shows up because this hack doesn't account for stop words.

['I liked the movie , just not the_NEG popcorn_NEG .',
 'I liked the popcorn , just not the_NEG movie_NEG .']

In [11]:
# Plain term counts over the negation-marked text, English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english',
                             norm=None, use_idf=False)
marked_counts = vectorizer.fit_transform(negation_marked_docs)

# Negated and non-negated occurrences now land in separate columns
# (e.g. "movie" vs "movie_neg"), so the two documents get different rows.
pd.DataFrame(marked_counts.toarray(),
             columns=vectorizer.get_feature_names())

Unnamed: 0,just,liked,movie,movie_neg,popcorn,popcorn_neg,the_neg
0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
1,1.0,1.0,0.0,1.0,1.0,0.0,1.0


## Character NGrams

_(Polina's talk covered Word NGrams)_

Example of NGrams where N=5:
<img src="NLP/character-ngrams.jpg" style="height: 300px;"/>
_From: http://plaza.ufl.edu/jgu/public_html/C-uppsats/cup.html_

In [12]:
# Back to a one-document corpus for the character-ngram example.
review = "I liked the movie, just not the popcorn."
docs = [review]

In [13]:
# Character bigrams: analyzer='char' with ngram_range=(2, 2) makes every
# 2-character sequence a feature. fit() returns the fitted vectorizer itself.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 2),
                             norm=None, use_idf=False).fit(docs)

# The learned vocabulary, i.e. every distinct character bigram in the corpus.
vectorizer.get_feature_names()

[u' j',
 u' l',
 u' m',
 u' n',
 u' p',
 u' t',
 u', ',
 u'co',
 u'd ',
 u'e ',
 u'e,',
 u'ed',
 u'he',
 u'i ',
 u'ie',
 u'ik',
 u'ju',
 u'ke',
 u'li',
 u'mo',
 u'n.',
 u'no',
 u'op',
 u'or',
 u'ot',
 u'ov',
 u'pc',
 u'po',
 u'rn',
 u'st',
 u't ',
 u'th',
 u'us',
 u'vi']

In [14]:
# Count each character bigram per document and label the columns with the bigrams.
bigram_names = vectorizer.get_feature_names()
bigram_counts = vectorizer.transform(docs).toarray()
pd.DataFrame(bigram_counts, columns=bigram_names)

Unnamed: 0,j,l,m,n,p,t,",",co,d,e,...,ot,ov,pc,po,rn,st,t.1,th,us,vi
0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0


## A quick character ngrams demo

Based on: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

### Detecting language from character ngrams
<img src="NLP/languages.png" style="height: 300px;"/>
_From: http://4chanint.wikia.com/wiki/The_Official_/int/_How_to_Learn_A_Foreign_Language_Guide_Wiki_

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [34]:
# Load the paragraph corpus from disk. The result exposes .data (the texts),
# .target (integer labels) and .target_names (the label names), all used below.
# NOTE(review): presumably one sub-folder per language code under NLP/paragraphs -- confirm.
dataset = load_files('NLP/paragraphs')

In [45]:
# ~1000 paragraphs across 11 languages.
# print() calls (not Python 2 print statements) so this cell also runs on
# Python 3 and matches the print(...) style used in the rest of the notebook.
print(len(dataset.data))
print(dataset.target_names)

969
['ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru']


In [46]:
# Split the dataset into training and test sets (half of the paragraphs each).
# random_state pins the shuffle so the split, and every score computed from
# it, is reproducible across kernel restarts.
# NOTE(review): the outputs recorded in this notebook came from an unseeded
# split; re-run the cells below to refresh them.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.5, random_state=0)


In [47]:
# Character unigrams, bigrams and trigrams together; IDF weighting is off.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    use_idf=False,
)

In [55]:
# Two-stage pipeline: character-ngram features feed a linear classifier.
pipeline_steps = [
    ('vec', vectorizer),
    ('clf', Perceptron()),  # Perceptron performs well on this problem
]
clf = Pipeline(pipeline_steps)

In [56]:
# Fit the pipeline on the training set, then predict the test set into
# y_predicted. fit() returns the fitted pipeline, so the calls can be chained.
y_predicted = clf.fit(docs_train, y_train).predict(docs_test)

In [57]:
# Print the confusion matrix as text: rows are the true languages, columns the
# predicted ones, both in dataset.target_names order.
confusion = metrics.confusion_matrix(y_test, y_predicted)
print(confusion)

[[18  0  0  0  0  0  0  0  0  0  0]
 [ 0 78  0  0  0  0  0  0  0  0  0]
 [ 0  0 71  0  0  0  0  0  0  0  0]
 [ 2  0  1 59  0  0  0  0  0  0  0]
 [ 0  0  0  0 52  0  0  0  0  0  0]
 [ 1  0  0  0  0 34  0  0  0  0  0]
 [ 0  0  0  0  0  0 40  0  0  0  0]
 [ 1  0  0  0  0  0  0 22  0  0  0]
 [ 0  0  0  0  0  0  0  0 16  0  1]
 [ 3  0  0  0  0  0  0  0  0 46  4]
 [ 0  0  0  0  0  0  0  0  0  0 36]]


In [67]:
# Display the F1 score, averaged over the languages and weighted by how many
# test paragraphs each language has.
weighted_f1 = metrics.f1_score(y_test, y_predicted, average='weighted')
print(weighted_f1)

0.974056552112


In [58]:
# Try the classifier on a few short, previously unseen sentences.
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)

# Each prediction is an index into dataset.target_names; look up the
# language code before printing.
for sentence, label_index in zip(sentences, predicted):
    language = dataset.target_names[label_index]
    print(u'The language of "%s" is "%s"' % (sentence, language))

The language of "This is a language detection test." is "fr"
The language of "Ceci est un test de détection de la langue." is "fr"
The language of "Dies ist ein Test, um die Sprache zu erkennen." is "de"


### Exploring the ngram hyperparameters

In [None]:
import pandas as pd

# Grid over the character-ngram range: every (min_n, max_n) pair with
# 1 <= min_n <= max_n <= MAX_NGRAM gets its own pipeline and weighted F1 score.
MAX_NGRAM = 10
ngram_sizes = list(range(1, MAX_NGRAM + 1))

# Pre-allocate the whole grid as floats (rows: min_n, columns: max_n) instead
# of growing an empty, object-typed frame one cell at a time.
# Cells with min_n > max_n are never evaluated and stay NaN.
scores = pd.DataFrame(index=ngram_sizes, columns=ngram_sizes, dtype=float)

for min_n in ngram_sizes:
    for max_n in range(min_n, MAX_NGRAM + 1):
        # Loop-local names, so the `vectorizer`, `clf` and `y_predicted`
        # built earlier in the notebook are not silently overwritten by the
        # last model of the grid.
        grid_vectorizer = TfidfVectorizer(ngram_range=(min_n, max_n),
                                          analyzer='char', use_idf=False)
        grid_clf = Pipeline([
            ('vec', grid_vectorizer),
            ('clf', Perceptron()),
        ])
        grid_clf.fit(docs_train, y_train)
        grid_predicted = grid_clf.predict(docs_test)
        scores.loc[min_n, max_n] = metrics.f1_score(
            y_test, grid_predicted, average='weighted')

In [None]:
# Show the F1 grid: the row label is the minimum ngram size (min_n) and the
# column label is the maximum ngram size (max_n).
scores

<img src='NLP/results.png'>