In [2]:
import nltk

In [3]:
sentence = "This is Andrew's text, isn't it?"

In [4]:
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(sentence)

['This', 'is', "Andrew's", 'text,', "isn't", 'it?']

In [5]:
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(sentence)

['This', 'is', 'Andrew', "'", 's', 'text', ',', 'isn', "'", 't', 'it', '?']

In [6]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['This', 'is', 'Andrew', "'s", 'text', ',', 'is', "n't", 'it', '?']

In [7]:
sentence = "feet wolves cats talked"
tokenizer = nltk.tokenize.TreebankWordTokenizer()
word_list = tokenizer.tokenize(sentence)
print(word_list)

['feet', 'wolves', 'cats', 'talked']


In [8]:
stemmer = nltk.stem.PorterStemmer()
" ".join( stemmer.stem(word) for word in word_list)

u'feet wolv cat talk'

In [9]:
def stemmer(word):
#     return word
    stemmer_obj = nltk.stem.PorterStemmer() # SnowballStemmer("english")
    #new_list = [stemmer_obj.stem(w) for w in word_list]
    return stemmer_obj.stem(word)

def stem_word(word):
    """returns the stem word, whether the input is bigram/trigram/ngram"""
    tmp = word.split()
    tmp2 = []
    for i in range(len(tmp)):
        if len(tmp[i]) == 0:
            continue
        tmp2.append(stemmer(tmp[i]))
    word = " ".join(tmp2)
    return word

def stem_list(word_list):
    new_list = word_list # list(iem.asklist)
    # Handles the stemming of bigrams and trigrams also
    for i in range(len(new_list)):
        tmp = new_list[i].split()
        tmp2 = []
        for j in range(len(tmp)):
            if len(tmp[j]) == 0:
                continue
            tmp2.append(stemmer(tmp[j]))

        new_list[i] = " ".join(tmp2)

    #print(new_list)
    return new_list

In [10]:
mylist = ["withdraw","withdrawal","withdrawn", "withdrew", "amount", "debited"]
stem_list( mylist )

['withdraw', u'withdraw', 'withdrawn', 'withdrew', 'amount', u'debit']

In [11]:
lemmatizer = nltk.stem.WordNetLemmatizer()
" ".join( lemmatizer.lemmatize(word) for word in mylist)

u'withdraw withdraw withdrawn withdrew amount debit'

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [13]:
texts = ["good movie", "not a good movie", "did not like", "i like it", "good one"]
# document freq: Frequency in all the documents.
# tokens, document freq: (good, 3), (movie, 2), (not, 2), (a, 1), (did, 1), (like, 2), (i, 1), (it, 1), (one, 1)

In [14]:
tfidf = TfidfVectorizer(min_df= 2, max_df= 0.5, ngram_range=(1,2))
# bigrams, df: (good movie, 2), (not a, 1), (a good, 1), (did not, 1), (not like, 1), (i like, 1), (like it,1),(good one,1)
# mind_df = 2, so all the tokens with df<2 are ignored to avoid "overfit" and "typo error"
# max_df = 0.5, so all the tokens with df > 50% are ignored to avoid "stop words" and meaning less words

In [15]:
features = tfidf.fit_transform(texts)
tfidf.get_feature_names()

[u'good movie', u'like', u'movie', u'not']

In [16]:
features[0,0]

0.7071067811865476

In [17]:
features

<5x4 sparse matrix of type '<type 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [18]:
features.todense()

matrix([[0.70710678, 0.        , 0.70710678, 0.        ],
        [0.57735027, 0.        , 0.57735027, 0.57735027],
        [0.        , 0.70710678, 0.        , 0.70710678],
        [0.        , 1.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        ]])

In [19]:
pd.DataFrame( features.todense(), columns= tfidf.get_feature_names())

Unnamed: 0,good movie,like,movie,not
0,0.707107,0.0,0.707107,0.0
1,0.57735,0.0,0.57735,0.57735
2,0.0,0.707107,0.0,0.707107
3,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0


In [20]:
text2 = ["bad movie", "not a good one", "did not like", "stupid movie"]
features2 = tfidf.transform(text2)

In [21]:
pd.DataFrame( features2.todense(), columns= tfidf.get_feature_names())

Unnamed: 0,good movie,like,movie,not
0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0
2,0.0,0.707107,0.0,0.707107
3,0.0,0.0,1.0,0.0


array(<5x4 sparse matrix of type '<type 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>, dtype=object)