# This talk covers
* Term Frequency (i.e. vectorized text)
* Stop words
* Term Frequency - Inverse Document Frequency (TF-IDF)
* Negation Marking
* NGrams

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

## Term frequency (i.e. vectorized text)

In [2]:
# Single-sentence corpus used to show raw term counts.
docs = ["I liked the movie, just not the popcorn."]

In [3]:
# With IDF weighting and normalization both switched off, the vectorizer
# simply reports how many times each token occurs in each document.
vectorizer = TfidfVectorizer(use_idf=False, norm=None)
vectorizer.fit(docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases older than 1.0.
get_names = (getattr(vectorizer, 'get_feature_names_out', None)
             or vectorizer.get_feature_names)
pd.DataFrame(vectorizer.transform(docs).toarray(), columns=get_names())

# Note: "I" is missing because the default token pattern only keeps tokens
# of two or more characters

Unnamed: 0,just,liked,movie,not,popcorn,the
0,1.0,1.0,1.0,1.0,1.0,2.0


## Stop words

In [4]:
# Same raw counts as before, but tokens on scikit-learn's built-in English
# stop word list are discarded before counting.
vectorizer = TfidfVectorizer(use_idf=False, norm=None, stop_words='english')
vectorizer.fit(docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases older than 1.0.
get_names = (getattr(vectorizer, 'get_feature_names_out', None)
             or vectorizer.get_feature_names)
pd.DataFrame(vectorizer.transform(docs).toarray(), columns=get_names())

# Note: "the" and "not" are gone because both are considered stop words

Unnamed: 0,just,liked,movie,popcorn
0,1.0,1.0,1.0,1.0


## Term Frequency - Inverse Document Frequency (TF-IDF)

In [5]:
# Two reviews sharing "liked" and "just" but differing in their nouns.
movie_review = "I liked the movie, just not the popcorn."
song_review = "I liked the song, just not the words."
docs = [movie_review, song_review]

### Without TF-IDF

In [6]:
# Baseline for the comparison: plain term counts (no IDF weighting, no
# normalization) with English stop words removed.
vectorizer = TfidfVectorizer(use_idf=False, norm=None, stop_words='english')
vectorizer.fit(docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases older than 1.0.
get_names = (getattr(vectorizer, 'get_feature_names_out', None)
             or vectorizer.get_feature_names)
pd.DataFrame(vectorizer.transform(docs).toarray(),
             columns=get_names())

# Note: "I" is missing because the default token pattern only keeps tokens
# of two or more characters

Unnamed: 0,just,liked,movie,popcorn,song,words
0,1.0,1.0,1.0,1.0,0.0,0.0
1,1.0,1.0,0.0,0.0,1.0,1.0


### With TF-IDF

In [7]:
# Turn IDF weighting on (still no normalization). With the default smoothing,
# idf = ln((1 + n_docs) / (1 + doc_freq)) + 1, so a word found in both
# documents keeps weight ln(3/3) + 1 = 1.0 while a word found in only one
# is boosted to ln(3/2) + 1 ~= 1.405.
vectorizer = TfidfVectorizer(use_idf=True, norm=None, stop_words='english')
vectorizer.fit(docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases older than 1.0.
get_names = (getattr(vectorizer, 'get_feature_names_out', None)
             or vectorizer.get_feature_names)
pd.DataFrame(vectorizer.transform(docs).toarray(),
             columns=get_names())

# Note: "I" is missing because the default token pattern only keeps tokens
# of two or more characters

Unnamed: 0,just,liked,movie,popcorn,song,words
0,1.0,1.0,1.405465,1.405465,0.0,0.0
1,1.0,1.0,0.0,0.0,1.405465,1.405465


## Negation Marking

In [18]:
# Same words, opposite opinions: only the positions of "movie" and "popcorn"
# are swapped.
liked_movie = "I liked the movie, just not the popcorn."
liked_popcorn = "I liked the popcorn, just not the movie."
docs = [liked_movie, liked_popcorn]

### Without negation marking

In [19]:
# Plain counts again: word order is ignored, so the two opposite reviews
# end up with identical rows.
vectorizer = TfidfVectorizer(use_idf=False, norm=None,
                             stop_words='english') # Note: no TF-IDF
vectorizer.fit(docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases older than 1.0.
get_names = (getattr(vectorizer, 'get_feature_names_out', None)
             or vectorizer.get_feature_names)
pd.DataFrame(vectorizer.transform(docs).toarray(),
             columns=get_names())

Unnamed: 0,just,liked,movie,popcorn
0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0


### With negation marking

In [20]:
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation


def _mark_negation_in_text(text):
    """Tokenize *text*, let NLTK tag the negated tokens, and re-join with spaces."""
    tokens = word_tokenize(text)
    return ' '.join(mark_negation(tokens))


negation_marked_docs = [_mark_negation_in_text(doc) for doc in docs]
negation_marked_docs

# Note: "the_NEG" shows up because this hack marks every token after "not",
# without accounting for stop words

['I liked the movie , just not the_NEG popcorn_NEG .',
 'I liked the popcorn , just not the_NEG movie_NEG .']

In [11]:
# Count terms in the negation-marked text: "movie" and "movie_neg" are now
# separate features, so the two reviews no longer look identical.
vectorizer = TfidfVectorizer(use_idf=False, norm=None,
                             stop_words='english')
vectorizer.fit(negation_marked_docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases older than 1.0.
get_names = (getattr(vectorizer, 'get_feature_names_out', None)
             or vectorizer.get_feature_names)
pd.DataFrame(vectorizer.transform(negation_marked_docs).toarray(),
             columns=get_names())

Unnamed: 0,just,liked,movie,movie_neg,popcorn,popcorn_neg,the_neg
0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
1,1.0,1.0,0.0,1.0,1.0,0.0,1.0


## NGrams

In [12]:
# Back to a single sentence for the n-gram examples.
docs = ["I liked the movie, just not the popcorn."]

### Character NGrams

In [13]:
# analyzer='char' with ngram_range=(2, 2) counts every overlapping pair of
# characters, spaces and punctuation included.
vectorizer = TfidfVectorizer(use_idf=False, norm=None,
                             ngram_range=(2, 2), analyzer='char')
vectorizer.fit(docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases older than 1.0.
# list() keeps the displayed value a plain list on every version.
get_names = (getattr(vectorizer, 'get_feature_names_out', None)
             or vectorizer.get_feature_names)
list(get_names())

[u' j',
 u' l',
 u' m',
 u' n',
 u' p',
 u' t',
 u', ',
 u'co',
 u'd ',
 u'e ',
 u'e,',
 u'ed',
 u'he',
 u'i ',
 u'ie',
 u'ik',
 u'ju',
 u'ke',
 u'li',
 u'mo',
 u'n.',
 u'no',
 u'op',
 u'or',
 u'ot',
 u'ov',
 u'pc',
 u'po',
 u'rn',
 u'st',
 u't ',
 u'th',
 u'us',
 u'vi']

In [14]:
# Show the character-bigram counts as a table, one column per bigram.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases older than 1.0.
get_names = (getattr(vectorizer, 'get_feature_names_out', None)
             or vectorizer.get_feature_names)
pd.DataFrame(vectorizer.transform(docs).toarray(),
             columns=get_names())

Unnamed: 0,j,l,m,n,p,t,",",co,d,e,...,ot,ov,pc,po,rn,st,t.1,th,us,vi
0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0


### Word NGrams

#### Word bigrams (N=2)

In [15]:
# Word bigrams: pairs of adjacent tokens. Stop words are removed first, so
# "just popcorn" becomes a bigram even though "not the" sat between them.
vectorizer = TfidfVectorizer(use_idf=False, norm=None,
                             stop_words='english', ngram_range=(2, 2))
vectorizer.fit(docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases older than 1.0.
get_names = (getattr(vectorizer, 'get_feature_names_out', None)
             or vectorizer.get_feature_names)
pd.DataFrame(vectorizer.transform(docs).toarray(), columns=get_names())

Unnamed: 0,just popcorn,liked movie,movie just
0,1.0,1.0,1.0


#### Word trigrams (N=3)

In [16]:
# Word trigrams: runs of three adjacent tokens, formed after stop words
# have been removed.
vectorizer = TfidfVectorizer(use_idf=False, norm=None,
                             stop_words='english', ngram_range=(3, 3))
vectorizer.fit(docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases older than 1.0.
get_names = (getattr(vectorizer, 'get_feature_names_out', None)
             or vectorizer.get_feature_names)
pd.DataFrame(vectorizer.transform(docs).toarray(), columns=get_names())

Unnamed: 0,liked movie just,movie just popcorn
0,1.0,1.0
