In [1]:
import nltk

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer

In [3]:
input_text = """Do you know how to tokenization works? It's actually quite interesting
! let's analyze a couple of sentences and figure it out"""

In [6]:
# sentence tokinizer
# divide the input text into sentence tokens
print ("\n Sentence tokenizer: ")
print (sent_tokenize(input_text))

In [9]:
# word tokenizer
# divide the input text into word tokens
print ("\nWord tokenizer: ")
print (word_tokenize(input_text))

In [7]:
# WordPunct tokenizer
# divide the input text into word tokens using word punct tokenizer
print ("\nWord punct tokenizer: ")
print (WordPunctTokenizer().tokenize(input_text))


Word punct tokenizer: 
['Do', 'you', 'know', 'how', 'to', 'tokenization', 'works', '?', 'It', "'", 's', 'actually', 'quite', 'interesting', '!', 'let', "'", 's', 'analyze', 'a', 'couple', 'of', 'sentences', 'and', 'figure', 'it', 'out']


In [11]:
# converting words to their base forms using stemming

from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

In [12]:
input_words = ['writting', 'calves', 'be', 'branded', 'horse','randomize',
              'possible','provision','hospital','kept', 'scratchy','code']

# create objects for porter, Lancaster, and Snowball stemmers.
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer("english")

In [15]:
# create a list of stemmer names for display
stemmer_names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{: >16}' * (len(stemmer_names) + 1)
print ('\n', formatted_text.format('INPUT WORD', *stemmer_names), '\n', '='*68)

# stem each word and display the output
for word in input_words:
    output = [word, porter.stem(word), lancaster.stem(word) , snowball.stem(word)]
    print (formatted_text.format(*output))


       INPUT WORD          PORTER       LANCASTER        SNOWBALL 
        writting            writ            writ            writ
          calves            calv            calv            calv
              be              be              be              be
         branded           brand           brand           brand
           horse            hors            hors            hors
       randomize          random          random          random
        possible         possibl            poss         possibl
       provision          provis          provid          provis
        hospital          hospit          hospit          hospit
            kept            kept            kept            kept
        scratchy        scratchi        scratchy        scratchi
            code            code             cod            code


In [16]:
#convert words to their base forms using lemmatization
# lemmatization uses vocabulary and morphological analysis of words. It obtains
# the base forms by removing the inflectional word ending such as ing or ed.
# this base form of any word is known as lemma.

from nltk.stem import WordNetLemmatizer

In [17]:
input_words = ['writing', 'calves','be','branded','horse','randomize','possible',
              'provision','hospital','kept','scratchy','code']

# create lemmatizer objec
lemmatizer = WordNetLemmatizer()

In [27]:
# create a list of lemmatizer names for display
lemmatizer_names = ['NOUN LEMMATIZER', 'VERB LEMMATIZER']
formatted_text = '{:>24}' * (len(lemmatizer_names) + 1)
print ('\n', formatted_text.format('INPUT WORD', *lemmatizer_names), '\n', '='*75)


# lemmatize each word and display the output
for word in input_words:
    #output = [word, lemmatizer.lemmatize(word, pos = 'n'), lemmatizer.lemmatize(word, pos = 'v')]
    #print (formatted_text.format(*output))


               INPUT WORD         NOUN LEMMATIZER         VERB LEMMATIZER 


In [28]:
# dividing text data into chunks

import numpy as np
from nltk.corpus import brown

In [None]:
# define a function to divide the input text into chunks.

# split the input text into chunks, where each chuck contains N words

def chunker(input_data, N):
    input_words = input_data.split(" ")
    output = []
    
    # iteraate through the words and divide them into chunks using the input parameter.
    
    cur_chunk = []
    count = 0
    for word in input_words:
        cur_chunk.append(word)
        count += 1
        if count == N:
            output.append(" ".join(cur_chunk))
            count, cur_chunk = 0, []
    output.append(' '.join(cur_chunk))
    return output

if __name__ == "__main__":
    # read the first 12000 words from the brown corpus
    input_data = ' '.join(brown.words()[:12000])
    
    # define the number of words in each chunk
    chunk_size = 700
    chunks = chunker(input_data, chunk_size)
    print ("\nNumber of text chunks = ", len(chunks), '\n')
    for i , chunk in enumerate(chunks):
        print ("chunk", i+1, '==>', chunk[:50])
        

In [None]:
# building a bag of words model in NLTK

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import brown
from text_chunker import chunker

input_data = " ".join(brown.words()[:5400])

chunk_size = 800

text_chunks = chunker(input_data, chunk_size)

# convert the chunks into dictionary items
chunks = []
for count, chunk in enumerate(text_chunks):
    d = {'index': count, 'text': chunk}
    chunks.append(d)
    
# Extract the document term matrix
count_vectorizer = CountVectorizer(min_df = 7, max_df = 20)
document_term_matrix = count_vectorizer.fit_transform([chunk['text'] for chunk in chunks])

# Extract the vocabulary and display it
vocabulary = np.array(count_vectorizer.get_feature_names())
print ("\nVocabulary: \n", vocabulary)


# Generate the names for display
chunk_names = []
for i in range(len(text_chunks)):
    chunk_names.append("Chunk-" + str(i+1))
    
# print the document term matrix
print ("\nDocument term matrix:")
formatted_text = '{:>12}' * (len(chunk_names), '\n')
for word, item in zip(vocabulary, document_term_matrix.T):
    # 'item' is a 'csr_matrix' data structure
    output = [word] + [str(freq) for freq in item.data]
    print (formatted_text.format(*output))
    


In [None]:
# building a category predictor
# a category predictor is used to predict the category to which a given piece
# of text belongs. 

# inorder to build this predicator we will use a statistic called TermFrequency-inverse
# Document Frequency(tf-idf). it helps us to know understand the importance of a given
# word to a document in a set of documents.

# The term Frequency(tf) is basically a measure of how frequently each word appears
# in a given document.

# The inverse Document Frequency (idf), is a measure of how unique a word is to 
# this document in the given set of documents.

# we then combine term frequency and inverse document frequency to formulate a feature
# vector to categorize documents.

from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


# Define the category map that will be used for training.
category_map = {'talk.politics.misc': 'politics', 'rec.autos': 'Autos', 
               'rec.spot.hockey': 'Hockey', 'sci.electronics': 'Electronics',
               'sci.med': 'Medicine'}

# Get the training dataset
training_data = fetch_20newsgroups(subset = 'train',categories = category_map.keys(),
                                  shuffle = True, random_state = 5)

# Build a count vectorizer and extract term counts
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data.data)
print ("\nDimensions of training data: ", train_tc.shape)

# create the tf-idf transformer
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)



In [None]:
# define test data
input_data = ["you need to be careful with cars when you are driving on slippery roads",
              "A lot of devices can be operated wirelessly", 
             'players need to be careful when they are closed to goal posts',
             'political debates help us understand the perspectives of both sides']

# train a multinomial Naive Bayes classifier
classifier = MultinomialNB().fit(train_tfidf, training_data.target)

# Transform input data using count vectorizer
input_tc = count_vectorizer.transfrom(input_data)

# Transform vectorized data using tfidf transformer
input_tfidf = tfidf.transform(input_tc)

# predict the output categories
predictions = classifier.predict(input_tfidf)

# print the outputs
for sent, category in zip(input_data, predictions):
    print ('\nInput: ', sent, '\nPredicted category:', \
          category_map[training_data.target_names[category]])

In [None]:
# contructing a gender identifier
# in this case, we will use the heuristic to construct a feature vector and 
# use it to train a classifier.
# The heuristic that will be used here is the last N letters of a given name.
#For example if the name ends with 'ia', it's most likely a female name, such
# as Amelia or Genelia. On the other hand, if the name ends with 'rk', it's likely
# a male name such as Mark or Clark.

import random
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names


# Extract last N letters from the input word
## and that will act as our 'feature'
def extract_features(word, N = 2):
    last_n_letters  word[-N:]
    return {'feature': last_n_letters.lower()}


if __name__ == "__main__":
    # create training data using labeled names available in NLTK
    male_list = [(name, 'male') for name in names.word('male.txt')]
    female_list = [(name, 'female') for name in names.words('female.txt')]
    data = (male_list + female_list)
    
# seed the random number generator
random.seed(5)

# shuffle the data
random.shuffle(data)

# create sample names for testing: test data
input_names = ['Alexander', 'Danielle', 'David', 'Cheryl']

# Define the number of samples used for train and text
num_train = int(0.8 * len(data))

# Iterate through different lengths to compare the accuracy
for i in range(1,6):
    print ("\nNumber of end letters:", i)
    features = [(extract_features(n, i), gender) for (n, gender) in data]
    
# separate the data into training ans testing
train_data, test_data = features[:num_train], features[num_train:]

# build the NaiveBayes Classifier using the training data
classifier = NaiveBayesClassifier.train(train_data)


# compute the accuracy of the classifier
accuracy = round(100 * nltk_accuracy(classifier, test_data), 2)
print ("Accuracy = " + str(accuracy) + '%')



In [None]:
# Building a Sentiment analyzer

# Sentiment analysis is the process of determininng the sentiment of a given piece
# of text. For example, it can be used to determnine whether a movie review is positive
# or negative.

# we will use Naive Bayes classifier to build this classifier. We first need to extract
# all the unique words from the text.

from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

# Extract features from the input list words
def extract_features(words):
    return dict([(word, True) for word in words])

if __name__ == "__main__":
    # load the reviews from the corpus
    fileids_pos = movie_reviews.fileids("pos")
    fileids_neg = movie_reviews.fileids("neg")
    
# Extract the features from the reviews
feature_pos = [(extract_features(movie_reviews.words(fileids = [f])), 'Positive') for f in fileids_pos]
feature_neg= [(extract_features(movie_reviews.words(fileids = [f])), 'Negative') for f in fileids_neg]

# Define the train and test split (80% and 20%) 
threshold = 0.8
num_pos = int(threshold * len(features_pos))
num_neg = int(threshold * len(features_neg))


# create training and training datasets
features_train = features_pos[:num_pos] + features_neg[:num_neg]
features_test = features_pos[num_pos:] + features_neg[num_neg:]

# Print the number of datapoints used
print ("\nNumber of training datapoints:", len(features_train))
print ("Number of test datapoints:", len(features_test))

#Train a Naive Bayes classifier
classifier = NaiveBayesClassifier.train(features_train)
print ("\nAccuracy of the classifier:", nltk_accuracy(classfier, features_test))

# print the top N most infomative words
N = 15
print ("\nTop " + str(N) + " most informative words:")
for i, item in enumerate(classifier.most_informative_features()):
    print (str(i+1) + '. ' + item[0])
    if i == N - 1:
        break
        
# Test input movie reviews
input_reviews = ['The costumes in this movie were great',
                 'I think the story was terrible and the characters were very weak',
                'People say that the director of the movie is amazing',
                'This is such an idiotic movie. I will not recommend it to anyone.']

print ("\nMovie review predictions:")
for review in input_reviews:
    print ("\nReview:", review)
    # compute the probablilties
    probabilities = classifier.prob_classify(extract_features(review.split()))

    # Pick the maximum value
    predicted_sentiment = probabilities.max()
    
    # print outputs
    print ("Predicted sentiment: ", predicted_sentiment)
    print ("Probability:", round(probabilities.prob(predicted_sentiment), 2))


In [None]:
# Topic modeling using Latent Dirichlet Allocation

# it is an unsupervised learning algorithm. It helps to organize our documents in an 
# optimal way, which can then be used for analysis.

# Latent Dirichlet Allocation is a topic modelling technique.

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import models, corpora

# load input data
def load_data(input_file):
    data = []
    with open(input_file, 'r') as f:
        for line in f.readlines():
            data.append(line[:-1])
            
    return data

# Define a function to process the input text.
# processor function for tokenizing, removing stop
# word, and stemming
def process(input_text):
    # create a regular expression tokenizer
    tokenizer = RegexTokenizer(r'\w+')
    
    # create a snowball stemmer to stem the tokenized text
    stemmer = SnowballStemmer('english')
    
    # Get the list of stop words to remove the stop words from the input test
    # because they don't add information.
    stop_words = stopwords.words("english")
    
    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())
    
    # remove the stop-words
    tokens = [x for x in tokens if not x in stop_words]
    
    
    # perform stemming on the tokenized words
    tokens_stemmed = [stemmer.stem(x) for x in tokens]
    return tokens_stemmed


if __name__ == "__main__":
    # load in put data
    data = load_data('data.txt')
    
    # create a list for sentence tokens
    tokens = [process(x) for x in data]
    
    # create a dictionary based on the sentence tokens
    doc_term_mat = [dict_tokens.doc2bow(token) for token in tokens]
    
    # define the number of topics for the LDA model
    num_topics = 2
    
    # Generate the LDA model
    ldamodel = models.ldamodel.LdaModel(doc_term_mat, 
                                       num_topics = num_topics, id2word = dict_tokens, passes = 25)
    
    # print top 5 contributing words for each topic
    num_words = 5
    print ("\nTop " + str(num_words) + " contributing words to each topic: ")
    for item in ldamodel.print_topics(num_topics = num_topics, num_words= num_words):
        print ("\nTopic", item[0])
        
        # print the contributing words along with their relative contributions
        list_of_strings = item[1].split(' + ')
        for text in list_of_strings:
            weight = text.split('*')[0]
            word = text.split('*')[1]
            print (word, '==>', str(round(float(weight) * 100, 2)) + '%')