In [1]:
from sklearn.feature_extraction import DictVectorizer

In [3]:
# One-hot encoding
one_hot_encoder = DictVectorizer()

In [4]:
# One single-feature record per sample, keyed by the categorical feature name 'city'.
instances = [
    {'city': name}
    for name in ('Bangalore', 'Tezpur', 'Chennai', 'Mumbai', 'Allahabad', 'Mangalore')
]

In [5]:
encoder = one_hot_encoder.fit_transform(instances)


In [7]:
# It creates one column for each possible value of the feature.
# Columns in the output below, left to right: Allahabad, Bangalore, Chennai, Mangalore, Mumbai, Tezpur
print (encoder.toarray())

[[ 0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.]
 [ 1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.]]


### Bag-of-words

This representation uses a multiset, or bag, that encodes the words that appear in a text; the bag-of-words does not encode any of the text's syntax, ignores the order of words, and disregards all grammar.
The key assumption behind this representation is that texts containing similar words tend to have similar meanings.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
corpus = [
'UNC played Duke in basketball',
'Duke lost the basketball game',
'I ate a sandwich'
]

In [43]:
import pandas as pd

# Put the documents in a frame next to a numeric id column, then display it.
df = pd.DataFrame(dict(text=corpus, id=[11, 22, 33]))
df

Unnamed: 0,id,text
0,11,UNC played Duke in basketball
1,22,Duke lost the basketball game
2,33,I ate a sandwich


In [30]:
vectorizer = CountVectorizer()

In [35]:
df.to_dict(orient='records')

[{'text': 'UNC played Duke in basketball'},
 {'text': 'Duke lost the basketball game'},
 {'text': 'I ate a sandwich'}]

In [44]:
tf_text = vectorizer.fit_transform(df.text.values).toarray()

In [62]:
# vocabulary_ maps each term to its column index in the count matrix.
# Order the terms by that index (not alphabetically by term) so the labels are
# guaranteed to line up with the matrix columns.
l = vectorizer.vocabulary_.items()
col = [term for term, _index in sorted(l, key=lambda item: item[1])]

In [66]:
# Label the count matrix with its terms, attach it to the original frame by index,
# and leave out the raw text column.
df2 = pd.DataFrame(tf_text, index=df.index, columns=col)
d = df.join(df2).drop('text', axis=1)

In [67]:
d

Unnamed: 0,id,ate,basketball,duke,game,in,lost,played,sandwich,the,unc
0,11,0,1,1,0,1,0,1,0,0,1
1,22,0,1,1,1,0,1,0,0,1,0
2,33,1,0,0,0,0,0,0,1,0,0


In [48]:
vectorizer.vocabulary_['game']

3

In [46]:
print (vectorizer.vocabulary_)

{'duke': 2, 'game': 3, 'basketball': 1, 'in': 4, 'the': 8, 'sandwich': 7, 'played': 6, 'unc': 9, 'lost': 5, 'ate': 0}


In [19]:
from sklearn.metrics.pairwise import euclidean_distances
# A smaller Euclidean distance means the two documents are more similar

In [20]:
# `counts` is not defined in any earlier cell; use the dense count matrix `tf_text`.
# Indexing with [[i]] keeps each row 2-D, which euclidean_distances requires.
euclidean_distances(tf_text[[0]], tf_text[[1]])

array([[ 2.44948974]])

In [21]:
# `counts` is not defined in any earlier cell; use the dense count matrix `tf_text`.
# Indexing with [[i]] keeps each row 2-D, which euclidean_distances requires.
euclidean_distances(tf_text[[1]], tf_text[[2]])

array([[ 2.64575131]])

In [22]:
# `counts` is not defined in any earlier cell; use the dense count matrix `tf_text`.
# Indexing with [[i]] keeps each row 2-D, which euclidean_distances requires.
euclidean_distances(tf_text[[0]], tf_text[[2]])

array([[ 2.64575131]])

# Stop Words

In [23]:
vectorizer = CountVectorizer(stop_words='english')

In [24]:
# Dense term-count matrix with English stop words removed (one row per document).
print(vectorizer.fit_transform(corpus).todense())

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]


In [25]:
# Mapping of each remaining term to its column index.
print(vectorizer.vocabulary_)

{u'duke': 2, u'basketball': 1, u'lost': 4, u'played': 5, u'game': 3, u'sandwich': 6, u'unc': 7, u'ate': 0}


# Problems with this approach: inflected forms are treated as different words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Two sentences that use different inflections of the same words.
corpus = [
'He ate the sandwiches',
'Every sandwich was eaten by him'
]
# binary=True: every non-zero count is reported as 1.
vectorizer = CountVectorizer(binary=True, stop_words='english')
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)

[[1 0 0 1]
 [0 1 1 0]]
{u'sandwich': 2, u'ate': 0, u'sandwiches': 3, u'eaten': 1}


# Stemming and lemmatization 

In [68]:
import nltk

In [69]:
from nltk.stem.wordnet import WordNetLemmatizer

In [71]:
lemmatizer = WordNetLemmatizer()
print (lemmatizer.lemmatize('gathering', 'v'))
print (lemmatizer.lemmatize('gathering', 'n'))

gather
gathering


In [72]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print (stemmer.stem('gathering'))

gather


# Lemmatize our test case

In [82]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

wordnet_tags = ['n', 'v']
corpus = [
'He ate the sandwiches',
'Everydays sandwich was eaten by him goode nice nicee'
]

In [83]:
# Tokenize each document and run every token through the Porter stemmer.
stemmer = PorterStemmer()
print('Stemmed:', [list(map(stemmer.stem, word_tokenize(document))) for document in corpus])

Stemmed: [['He', 'ate', 'the', 'sandwich'], ['everyday', 'sandwich', 'wa', 'eaten', 'by', 'him', 'good', 'nice', 'nice']]


In [3]:
def lemmatize(token, tag):
    """Lemmatize ``token`` when its part-of-speech tag marks a noun or a verb.

    Only the first character of ``tag`` is inspected (case-insensitively), so
    tags such as 'NN', 'NNS' or 'VBZ' select the 'n' / 'v' lemmatizer modes.

    Args:
        token: The word to lemmatize.
        tag: Part-of-speech tag string for ``token``; may be empty.

    Returns:
        The result of the module-level ``lemmatizer.lemmatize(token, pos)`` for
        noun/verb tags; otherwise ``token`` unchanged (including for an empty tag).
    """
    # Slicing instead of tag[0] avoids an IndexError on an empty tag.
    pos = tag[:1].lower()
    if pos in ('n', 'v'):
        return lemmatizer.lemmatize(token, pos)
    return token

lemmatizer = WordNetLemmatizer()

In [16]:
# `pos_tag` is never imported on its own in this notebook; reach it through the nltk module.
help(nltk.pos_tag)

Help on function pos_tag in module nltk.tag:

pos_tag(tokens, tagset=None)
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.
    
        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]
    
    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.
    
    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be used, e.g. un

In [5]:
# Sentence tokenizer: split a block of text into a list of sentences.
from nltk.tokenize import sent_tokenize
text = "this's a sent tokenize test. this is sent two. is this sent three? sent 4 is cool! Now it's your turn."
sent_tokenize_list = sent_tokenize(text)
print(len(sent_tokenize_list))
print(sent_tokenize_list)

5
["this's a sent tokenize test.", 'this is sent two.', 'is this sent three?', 'sent 4 is cool!', "Now it's your turn."]


In [6]:
# Word tokenizer: split a sentence into word and punctuation tokens.
from nltk.tokenize import word_tokenize
print(word_tokenize('Hello World.'))
print(word_tokenize("this's a test"))

['Hello', 'World', '.']
['this', "'s", 'a', 'test']


In [84]:
words = word_tokenize("Dive into NLTK: Part-of-speech tagging and POS Tagger")
print (nltk.pos_tag(words))

[('Dive', 'NNP'), ('into', 'IN'), ('NLTK', 'NNP'), (':', ':'), ('Part-of-speech', 'JJ'), ('tagging', 'NN'), ('and', 'CC'), ('POS', 'NNP'), ('Tagger', 'NNP')]
