# Sentiment analysis using NLP

## 1. Python modules

In [None]:
#---import modules
import nltk
import string
import random
import seaborn as sns
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews

## 2. Download movie reviews and stopword data for NLTK

In [None]:
#---download the movie review data (read below through nltk.corpus.movie_reviews)
nltk.download('movie_reviews')

#---download stop word data (read below through nltk.corpus.stopwords;
#   we will need this later to improve the analysis)
nltk.download('stopwords')

## 3. Organize and prepare data for sentiment analysis

### Organize documents (movie reviews) into a list

In [None]:
#---collect every review as a (list of words, category) pair,
#   walking the corpus one category at a time
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        review_words = list(movie_reviews.words(fileid))
        documents.append((review_words, category))

print('Number of movie reviews',len(documents))
print('Number of words',len(movie_reviews.words()))

### Count positive and negative reviews

In [None]:
#---tally reviews by label: 'pos' counts as positive,
#   every other label counts as negative
p = sum(1 for (_, label) in documents if label == 'pos')
n = len(documents) - p
print('Number of positive reviews',p)
print('Number of negative reviews',n)

### Shuffle data (the list was built one category at a time, so mix the reviews before splitting into training and test sets)

In [None]:
#---shuffle the movie reviews in place
# NOTE(review): no random seed is set in the visible code, so the resulting
# order (and anything derived from it) will differ from run to run.
random.shuffle(documents)

### Apply the bag-of-words model to movie reviews data using NLTK

In [None]:
#---frequency distribution over every lower-cased token in the corpus
all_words = nltk.FreqDist(map(str.lower, movie_reviews.words()))

### Visualize the most common words in the movie reviews data

In [None]:
#---bar chart of the 20 most frequent tokens in all_words
most_common_words = all_words.most_common(20)
labels = [word for (word, _) in most_common_words]
counts = [count for (_, count) in most_common_words]
fig, ax = plt.subplots(figsize=(20,10))
all_plot = sns.barplot(x=labels, y=counts, palette="viridis")
# tilt the tick labels; the trailing semicolon presumably keeps a notebook
# from echoing the return value of plt.xticks
plt.xticks(rotation=30);

### Remove uninteresting words and re-apply the bag-of-words model to the resulting filtered list of words

In [None]:
#---clean words to remove uninteresting words (stop words and punctuation)
stop_words = set(stopwords.words('english'))
# A set of single characters: testing `token in string.punctuation` would be a
# substring test against one long string, which lets multi-character
# punctuation tokens such as '--' or '...' slip through.
punctuation_chars = set(string.punctuation)
filtered_words = []
for w in movie_reviews.words():
    # Lower-case first so the stop-word lookup sees the same form that is
    # stored in the frequency distribution.
    token = w.lower()
    if token in stop_words:
        continue
    # Skip tokens made up solely of punctuation characters (an empty token
    # is skipped as well, as it was by the original substring test).
    if all(ch in punctuation_chars for ch in token):
        continue
    filtered_words.append(token)
#---re-apply the bag-of-words model to the filtered tokens
clean_words = nltk.FreqDist(filtered_words)
print(clean_words)

### Visualize the most common words in the movie reviews data after filtering

In [None]:
#---bar chart of the 20 most frequent tokens that survived the filtering
most_common_clean_words = clean_words.most_common(20)
clean_labels = [word for (word, _) in most_common_clean_words]
clean_counts = [count for (_, count) in most_common_clean_words]
fig, ax = plt.subplots(figsize=(20,10))
all_plot = sns.barplot(x=clean_labels, y=clean_counts, palette="viridis")
# tilt the tick labels; the trailing semicolon presumably keeps a notebook
# from echoing the return value of plt.xticks
plt.xticks(rotation=30);

## 4. Define features, set up training and test data and run Naive Bayes classifier

### Define feature extractor function

In [None]:
def document_features(document):
    """Build the feature dictionary for a single document.

    For every word in the module-level ``word_features`` sequence the result
    holds a ``'contains(<word>)'`` key whose value is ``True`` when that word
    occurs in ``document`` and ``False`` otherwise.

    Args:
        document: iterable of hashable tokens (the words of one review).

    Returns:
        dict mapping ``'contains(<word>)'`` to a bool.
    """
    # A set gives constant-time membership tests for each candidate word.
    present = set(document)
    return {
        'contains({})'.format(candidate): candidate in present
        for candidate in word_features
    }

### Trim word list (if too long) to most frequent words and extract features for each document

In [None]:
#---keep only the w most frequent cleaned words as classifier features
w = 1000
# Iterating a FreqDist yields its keys in the order they were first counted,
# not by frequency, so list(clean_words)[:w] would pick the first w distinct
# words seen rather than the most frequent ones. most_common(w) returns the
# top-w (word, count) pairs sorted by count (or all of them if there are
# fewer than w).
word_features = [word for (word, _) in clean_words.most_common(w)]
#---one (feature dict, category) pair per review
featuresets = [(document_features(d), c) for (d,c) in documents]

### Define training and test sets and train Naive Bayes classifier

In [None]:
#---hold out the first x feature sets for testing; train on the remainder
x = 200
test_set = featuresets[:x]
train_set = featuresets[x:]
print('Number of documents in training data',len(train_set))
print('Number of documents in test data',len(test_set))

#---fit a Naive Bayes classifier on the training portion
classifier = nltk.NaiveBayesClassifier.train(train_set)

### Test the Naive Bayes classifier 

In [None]:
#---fraction of held-out documents the classifier labels correctly
score = nltk.classify.accuracy(classifier, test_set)
percent = score*100
print('The accuracy of the classifier is %.1f%%' % percent)

### Show the most important features as interpreted by Naive Bayes

In [None]:
#---ask the trained classifier to display its 10 most informative features
classifier.show_most_informative_features(10)