In [24]:
text = "Are you curious about tokenization?Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."


In [25]:
from nltk.tokenize import sent_tokenize

In [26]:
import nltk
nltk.download('punkt')
sent_tokenize_list = sent_tokenize(text)

[nltk_data] Downloading package punkt to /home/lynn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
print ("\n Sentence tokenize:")
print (sent_tokenize_list)


 Sentence tokenize:
["Are you curious about tokenization?Let's see how it works!", 'We need to analyze a couple of sentences with punctuations to see it in action.']


In [28]:
from nltk.tokenize import word_tokenize

In [29]:
print ("\nWord tokenizer:")
print (word_tokenize(text))


Word tokenizer:
['Are', 'you', 'curious', 'about', 'tokenization', '?', 'Let', "'s", 'see', 'how', 'it', 'works', '!', 'We', 'need', 'to', 'analyze', 'a', 'couple', 'of', 'sentences', 'with', 'punctuations', 'to', 'see', 'it', 'in', 'action', '.']


In [30]:
from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
print ("\n Word punct tokenizer:")
print (word_punct_tokenizer.tokenize(text))


 Word punct tokenizer:
['Are', 'you', 'curious', 'about', 'tokenization', '?', 'Let', "'", 's', 'see', 'how', 'it', 'works', '!', 'We', 'need', 'to', 'analyze', 'a', 'couple', 'of', 'sentences', 'with', 'punctuations', 'to', 'see', 'it', 'in', 'action', '.']


In [31]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
words = ['table','probably','wolves','playing','is','dog','the','beaches','grounded','dreamt','envision']

In [32]:
stemmers = ['PORTER','LANCASTER','SNOWBALL']
stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')

In [33]:
formatted_row = '{:>16}' * (len(stemmers) + 1)
print ('\n',formatted_row.format('WORD',*stemmers),'\n')


             WORD          PORTER       LANCASTER        SNOWBALL 



In [34]:
for word in words:
    stemmed_words = [stemmer_porter.stem(word),stemmer_lancaster.stem(word),stemmer_snowball.stem(word)]
    print (formatted_row.format(word,*stemmed_words))

           table            tabl            tabl            tabl
        probably         probabl            prob         probabl
          wolves            wolv            wolv            wolv
         playing            play            play            play
              is              is              is              is
             dog             dog             dog             dog
             the             the             the             the
         beaches           beach           beach           beach
        grounded          ground          ground          ground
          dreamt          dreamt          dreamt          dreamt
        envision           envis           envid           envis


In [35]:

from nltk.stem import WordNetLemmatizer
words = ['table','probably','wolves','playing','is','dog','the','beaches','grounded','dreamt','envision']
lemmatizer = ['NOUN LEMMATIZER','VERB LEMMATIZER']
lemmatizer_wordnet = WordNetLemmatizer()
formatted_row = '{:>24}' * (len(lemmatizer) + 1)
print ('\n',formatted_row.format('WORD',*lemmatizer),'\n')
for word in words:
    lemmatizer_words = [lemmatizer_wordnet.lemmatize(word,pos='n'),lemmatizer_wordnet.lemmatize(word,pos='v')]
    print (formatted_row.format(word,*lemmatizer_words))


                     WORD         NOUN LEMMATIZER         VERB LEMMATIZER 

                   table                   table                   table
                probably                probably                probably
                  wolves                    wolf                  wolves
                 playing                 playing                    play
                      is                      is                      be
                     dog                     dog                     dog
                     the                     the                     the
                 beaches                   beach                   beach
                grounded                grounded                  ground
                  dreamt                  dreamt                   dream
                envision                envision                envision


In [36]:
import numpy as np
from nltk.corpus import brown
def splitter(data,num_words):
    words = data.split(' ')
    output = []
    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0
    output.append(' '.join(cur_words))
    return output

In [37]:
nltk.download('brown')
data = ' '.join(brown.words()[:10000])

[nltk_data] Downloading package brown to /home/lynn/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [39]:
num_words = 1700
chunks = []
counter = 0
text_chunks = splitter(data,num_words)
print ("Number of text chunks = ",len(text_chunks))
print (text_chunks)

Number of text chunks =  6


In [43]:
num_words = 2000
chunks = []
counter = 0
text_chunks = splitter(data,num_words)
for text in text_chunks:
    chunk = {'index':counter,'text':text}
    chunks.append(chunk)
    counter += 1
print (counter)


6


In [42]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=5,max_df=.95)
doc_term_matrix = vectorizer.fit_transform([chunk['text'] for chunk in chunks])

In [44]:
vocab = np.array(vectorizer.get_feature_names())
print ("\nVocabulary:")
print (vocab)


Vocabulary:
['about' 'after' 'against' 'aid' 'all' 'also' 'an' 'and' 'are' 'as' 'at'
 'be' 'been' 'before' 'but' 'by' 'committee' 'congress' 'did' 'each'
 'education' 'first' 'for' 'from' 'general' 'had' 'has' 'have' 'he'
 'health' 'his' 'house' 'in' 'increase' 'is' 'it' 'last' 'made' 'make'
 'may' 'more' 'no' 'not' 'of' 'on' 'one' 'only' 'or' 'other' 'out' 'over'
 'pay' 'program' 'proposed' 'said' 'similar' 'state' 'such' 'take' 'than'
 'that' 'the' 'them' 'there' 'they' 'this' 'time' 'to' 'two' 'under' 'up'
 'was' 'were' 'what' 'which' 'who' 'will' 'with' 'would' 'year' 'years']


In [47]:
print ("\nDocument term matrix:")
chunk_names = ['Chunk-0','Chunk-1','Chunk-2','Chunk-3','Chunk-4']
formatted_row = '{:>12}' * (len(chunk_names) + 1)
print ('\n',formatted_row.format('Word',*chunk_names),'\n')
for word,item in zip(vocab,doc_term_matrix.T):
    output = [str(x) for x in item.data]
    print (formatted_row.format(word,*output))


Document term matrix:

         Word     Chunk-0     Chunk-1     Chunk-2     Chunk-3     Chunk-4 

       about           1           1           1           1           3
       after           2           3           2           1           3
     against           1           2           2           1           1
         aid           1           1           1           3           5
         all           2           2           5           2           1
        also           3           3           3           4           3
          an           5           7           5           7          10
         and          34          27          36          36          41
         are           5           3           6           3           2
          as          13           4          14          18           4
          at           5           7           9           3           6
          be          20          14           7          10          18
        been           7

In [48]:
from sklearn.datasets import fetch_20newsgroups
category_map = {'misc.forsale':'Sales','rec.motorcycles':'Motorcycles','rec.sport.baseball':'Baseball','sci.crypt':'Cryptography','sci.space':'Space'}
training_data = fetch_20newsgroups(subset='train',categories=category_map.keys(),shuffle=True,random_state=7)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train_termcounts = vectorizer.fit_transform(training_data.data)
print ('\nDimensions of training data:',X_train_termcounts.shapepe)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)



Dimensions of training data: (2968, 40605)


In [50]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
input_data = [
    "The curveballs of right handed pitchers tend to curve to the left",
    "Caesar cipher is an ancient form of encryption",
    "This two-wheeler is really good on slippery roads"
]
tf_idf_transformer = TfidfTransformer()
X_train_tfidf = tf_idf_transformer.fit_transform(X_train_termcounts)
classifier = MultinomialNB().fit(X_train_tfidf,training_data.target)
X_input_termcounts = vectorizer.transform(input_data)
X_input_tfidf = tf_idf_transformer.transform(X_input_termcounts)
predicted_categories = classifier.predict(X_input_tfidf)
for sentence,category in zip(input_data,predicted_categories):
    print ('\nInput:',sentence,'\nPredicted category:',category_map[training_data.target_names[category]])


Input: The curveballs of right handed pitchers tend to curve to the left 
Predicted category: Baseball

Input: Caesar cipher is an ancient form of encryption 
Predicted category: Cryptography

Input: This two-wheeler is really good on slippery roads 
Predicted category: Motorcycles


In [54]:
nltk.download('names')
import random
from nltk.corpus import names
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy

def gender_features(word,num_letters=2):
    return {'feature':word[-num_letters:].lower()}
labeled_names = ([(name,'male') for name in names.words('male.txt')] + [(name,'female') for name  in names.words('female.txt')])
random.seed(7)
random.shuffle(labeled_names)
input_names = ['Leonardo','Amy','Sam']
for i in range(1,5):
    print ('\nNumber of letters:',i)
    featuresets = [(gender_features(n,i),gender) for (n,gender) in labeled_names]
    train_set,test_set = featuresets[500:],featuresets[:500]
    classifier = NaiveBayesClassifier.train(train_set)
    print ('Accuracy==>',str(100*nltk_accuracy(classifier,test_set)) + str('%'))
    for name in input_names:
        print (name,'==>',classifier.classify(gender_features(name,i)))


Number of letters: 1
Accuracy==> 76.2%
Leonardo ==> male
Amy ==> female
Sam ==> male

Number of letters: 2
Accuracy==> 78.60000000000001%
Leonardo ==> male
Amy ==> female
Sam ==> male

Number of letters: 3
Accuracy==> 76.6%
Leonardo ==> male
Amy ==> female
Sam ==> female

Number of letters: 4
Accuracy==> 70.8%
Leonardo ==> male
Amy ==> female
Sam ==> female


[nltk_data] Downloading package names to /home/lynn/nltk_data...
[nltk_data]   Package names is already up-to-date!


In [58]:
nltk.download('movie_reviews')
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
def extract_features(word_list):
    return dict([(word,True) for word in word_list])
positive_fileids = movie_reviews.fileids('pos')
negative_fileids = movie_reviews.fileids('neg')
features_positive = [(extract_features(movie_reviews.words(fileids=[f])),'Positive') for f in positive_fileids]
features_negative = [(extract_features(movie_reviews.words(fileids=[f])),'Negative') for f in negative_fileids]
threshold_factor = 0.8
threshold_positive = int(threshold_factor * len(features_positive))
threshold_negative = int(threshold_factor * len(features_negative))
feature_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
feature_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]
print ('\nNumber of training datapoints:',len(feature_train))
print ('\nNumber of test datapoints:',len(feature_test))

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/lynn/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!



Number of training datapoints: 1600

Number of test datapoints: 400


In [59]:
classifier = NaiveBayesClassifier.train(feature_train)
print ('\nAccuracy of the classifier:',nltk.classify.util.accuracy(classifier,feature_test))


Accuracy of the classifier: 0.735


In [60]:
print ('\nTop 10 most information:')
for item in classifier.most_informative_features()[:10]:
    print (item[0])


Top 10 most information:
outstanding
insulting
vulnerable
ludicrous
uninvolving
astounding
avoids
fascination
anna
darker


In [61]:
input_reviews = [
    "It is an amazing movie",
    "This is a dull movie.I would never recommend it to anyone.",
    "The cinematography is pretty great in this movie",
    "The direction was terrible and the story was all over the place"
]

In [63]:
print ("\nPredictions:")
for review in input_reviews:
    print ("\nReview:",review)
    probdist = classifier.prob_classify(extract_features(review.split()))
    pred_sentiment = probdist.max()
    print ("Predicted sentiment:",pred_sentiment)
    print ("Probability:",round(probdist.prob(pred_sentiment),2))


Predictions:

Review: It is an amazing movie
Predicted sentiment: Positive
Probability: 0.61

Review: This is a dull movie.I would never recommend it to anyone.
Predicted sentiment: Negative
Probability: 0.77

Review: The cinematography is pretty great in this movie
Predicted sentiment: Positive
Probability: 0.67

Review: The direction was terrible and the story was all over the place
Predicted sentiment: Negative
Probability: 0.63


In [67]:
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from gensim import models,corpora
from nltk.corpus import stopwords

def load_data(input_file):
    data = []
    with open(input_file,'r') as f:
        for line in f.readlines():
            data.append(line[:-1])
    return data
class Preprocessor(object):
    def __init__(self):
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.stop_words_english = stopwords.words('english')
        self.stemmer = SnowballStemmer('english')
    def process(self,input_text):
        tokens = self.tokenizer.tokenize(input_text.lower())
        tokens_stopwords = [x for x in tokens if not x in self.stop_words_english]
        tokens_stemmed = [self.stemmer.stem(x) for x in tokens_stopwords]
        return tokens_stemmed
input_file = 'data_topic_modeling.txt'
data = load_data(input_file)
preprocessor = Preprocessor()
processed_tokens = [preprocessor.process(x) for x in data]
dict_tokens = corpora.Dictionary(processed_tokens)
corpus = [dict_tokens.doc2bow(text) for text in processed_tokens]
num_topics = 2
num_words = 4
ldamodel = models.ldamodel.LdaModel(corpus,num_topics=num_topics,id2word=dict_tokens,passes=25)
print ('\nMost contributing words to the topic:')
for item in ldamodel.print_topics(num_topics=num_topics,num_words=num_words):
    print ('\nTopic',item[0],'==>',item[1])

[nltk_data] Downloading package stopwords to /home/lynn/nltk_data...



Most contributing words to the topic:

Topic 0 ==> 0.065*"need" + 0.046*"order" + 0.028*"understand" + 0.028*"modern"

Topic 1 ==> 0.044*"need" + 0.044*"younger" + 0.044*"club" + 0.044*"polici"


[nltk_data]   Unzipping corpora/stopwords.zip.
