### Constructing a gender identifier

In [1]:
import random

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names

In [2]:
def extract_features(name, N = 2):
    #Extract last N letters from the input name and that will act as our feature
    last_n_letters = name[-N:]
    
    return {'feature': last_n_letters.lower()}

In [3]:
#Create training data using labeled names available in NLTK
male_list = [(name, 'male') for name in names.words('male.txt')]
female_list = [(name, 'female') for name in names.words('female.txt')]

data = male_list + female_list

In [4]:
random.seed(5)
random.shuffle(data)

In [5]:
#Create test data
input_names = ['Alexander', 'Danielle', 'David', 'Cheryl']

In [6]:
num_train = int(0.8 * len(data))

#Iterate through different lengths to compare the accuracy
for i in range(1, 6):
    print('\nNumber of letters: ', i)
    
    features = [(extract_features(name, i), gender) for (name, gender) in data]
    train_data, test_data = features[:num_train], features[num_train:]
    
    classifier = NaiveBayesClassifier.train(train_data)
    
    #Compute the accuracy of the classifier
    accuracy = round(100 * nltk_accuracy(classifier, test_data), 2)
    print('Accuracy = ', accuracy, '%')
    
    #Predict outputs for input names using the current-trained classifier model
    for name in input_names:
        print(name, ' ==> ', classifier.classify(extract_features(name, i)))


Number of letters:  1
Accuracy =  74.7 %
Alexander  ==>  male
Danielle  ==>  female
David  ==>  male
Cheryl  ==>  male

Number of letters:  2
Accuracy =  78.79 %
Alexander  ==>  male
Danielle  ==>  female
David  ==>  male
Cheryl  ==>  female

Number of letters:  3
Accuracy =  77.22 %
Alexander  ==>  male
Danielle  ==>  female
David  ==>  male
Cheryl  ==>  female

Number of letters:  4
Accuracy =  69.98 %
Alexander  ==>  male
Danielle  ==>  female
David  ==>  male
Cheryl  ==>  female

Number of letters:  5
Accuracy =  64.63 %
Alexander  ==>  male
Danielle  ==>  female
David  ==>  male
Cheryl  ==>  female


### Building a sentiment analyzer

In [7]:
from nltk.corpus import movie_reviews
from nltk.classify.util import accuracy as nltk_accuracy

In [8]:
def extract_features(words):
    return dict([(word, True) for word in words])

In [9]:
#Load the reviews from the corpus
fileids_pos = movie_reviews.fileids('pos')
fileids_neg = movie_reviews.fileids('neg')

In [10]:
#Extract the features from the reviews
features_pos = [(extract_features(movie_reviews.words(fileids = [f])), 'Positive') for f in fileids_pos]
features_neg = [(extract_features(movie_reviews.words(fileids = [f])), 'Negative') for f in fileids_neg]

In [12]:
#Define the train and test split(80% and 20%)
threshold = 0.8

num_pos = int(threshold * len(features_pos))
num_neg = int(threshold * len(features_neg))

features_train = features_pos[:num_pos] + features_neg[:num_neg]
features_test = features_pos[num_pos:] + features_neg[num_neg:]

In [14]:
print('Number of training datapoints: ', len(features_train))
print('Number of test datapoints: ', len(features_test))

Number of training datapoints:  1600
Number of test datapoints:  400


In [15]:
classifier = NaiveBayesClassifier.train(features_train)
print('Accuracy of the classifier: ', nltk_accuracy(classifier, features_test))

Accuracy of the classifier:  0.735


In [17]:
#Print the top N most informative word
N = 15
print(f'Top {N} most informative words:\n')

for i, item in enumerate(classifier.most_informative_features(), start = 1):
    print(f'{i}. {item[0]}')
    if i == N:
        break

Top 15 most informative words:

1. outstanding
2. insulting
3. vulnerable
4. ludicrous
5. uninvolving
6. astounding
7. avoids
8. fascination
9. affecting
10. animators
11. anna
12. darker
13. seagal
14. symbol
15. idiotic


In [18]:
#Test input movie reviews
input_reviews = ['The costumes in this movie were great', 'I think the story was terrible and the characters were very weak',
                 'People say that the director of the movie is amazing', 
                 'This is such an idiotic movie. I will not recommend it to anyone.']

In [19]:
print('Movie review predictions: \n')
for review in input_reviews:
    print('\nReview: ', review)
    
    #Compute the probabilities
    probabilities = classifier.prob_classify(extract_features(review.split()))
    
    #Pick the maximum value
    predicted_sentiment = probabilities.max()
    
    print('Predicted Sentiment: ', predicted_sentiment)
    print('Probability: ', round(probabilities.prob(predicted_sentiment), 2))

Movie review predictions: 


Review:  The costumes in this movie were great
Predicted Sentiment:  Positive
Probability:  0.59

Review:  I think the story was terrible and the characters were very weak
Predicted Sentiment:  Negative
Probability:  0.8

Review:  People say that the director of the movie is amazing
Predicted Sentiment:  Positive
Probability:  0.6

Review:  This is such an idiotic movie. I will not recommend it to anyone.
Predicted Sentiment:  Negative
Probability:  0.87
