# Real-Time Sentiment Analysis on Twitter Data

The goal of this mini project is to collect data from Twitter and perform sentiment analysis on tweets in real time using the Spark Streaming API. A predictive model classifies each downloaded tweet as expressing either a positive (1) or a negative (0) feeling.

For this task, a corpus (data set) with over 1.5 million prelabeled tweets was collected from a [Kaggle competition](https://inclass.kaggle.com/c/si650winter11) hosted by the University of Michigan. The Naive Bayes algorithm is then used to train the model on such data.

## 1 Imports and basic configuration

In [None]:
# PySpark classes.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [None]:
# NLTK classes and functions.
from nltk.sentiment import SentimentAnalyzer
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.corpus import stopwords
from nltk.sentiment.util import *

In [None]:
# Modules for HTTP requests.
import requests_oauthlib
import requests

In [None]:
# Miscellaneous modules.
import operator
import string
import time
import json
import ast
import re

In [None]:
# Stream update interval: passed to StreamingContext as the batch duration
# (expressed in seconds, per the PySpark API).
MINIBATCH_INTERVAL = 10

In [None]:
# Create the streaming context on top of the existing SparkContext.
# NOTE(review): `sc` is not created in the visible part of this notebook;
# presumably it is injected by the PySpark shell/kernel -- confirm.
ssc = StreamingContext(sc, MINIBATCH_INTERVAL)

## 2 Model training

### 2.1 Data load

In [None]:
# Load the training CSV file into an RDD of raw text lines.
rdd = sc.textFile('labeled_tweets.csv')

In [None]:
# Show the first 10 rows (the header line has not been removed yet).
rdd.take(10)

In [None]:
# Remove the header: take the first line of the file and drop every row
# that is equal to it.
header = rdd.first()
rdd = rdd.filter(lambda row: row != header)

In [None]:
# Show the first 10 rows again, now that the header has been filtered out.
rdd.take(10)

### 2.2 Preprocessing

In [None]:
# Build the sorted list of English stopwords, each in its plain form and in
# its negation-marked form (with the '_NEG' suffix appended).
all_stopwords = sorted(
    word + suffix
    for suffix in ('', '_NEG')
    for word in stopwords.words('english'))

In [None]:
def preprocess_row(row):
    """Extract the cleaned tweet tokens and the sentiment label from a row.

    The row is split on commas; field 1 is parsed as the integer sentiment
    label and field 3 is taken as the tweet text. The text is lowercased,
    tokenized on spaces, negation-marked, stripped of punctuation and
    filtered against ``all_stopwords``.

    Args:
        row: One line of the training CSV file, as a string.

    Returns:
        A ``(tokens, sentiment)`` tuple. ``tokens`` is a list of cleaned
        words, where words inside a negation scope carry a ``_NEG`` suffix;
        ``sentiment`` is an int.

    Raises:
        IndexError: If the row has fewer than four comma-separated fields.
        ValueError: If the sentiment field cannot be parsed as an integer.
    """
    neg_suffix = '_NEG'

    # NOTE(review): a plain comma split cuts the tweet text at its first
    # comma. If the text is the last column of the file, `row.split(',', 3)`
    # would keep it whole -- confirm the column layout before changing this.
    row = row.split(',')
    translator = str.maketrans({key: None for key in string.punctuation})

    # Collapse repeated spaces, lowercase, and tokenize.
    tweet = re.sub(' +', ' ', row[3]).lower()
    tokens = [token for token in tweet.split(' ') if token != '']

    # mark_negation() expects a list of tokens. Given a plain string it walks
    # single characters, so no negation word is ever detected. It must also
    # run before punctuation is removed so that forms like "didn't" are seen.
    tokens = mark_negation(tokens)

    tweet = []
    for token in tokens:
        negated = token.endswith(neg_suffix)
        if negated:
            token = token[:-len(neg_suffix)]

        # '_' is part of string.punctuation, so strip punctuation from the
        # bare word only and re-attach the negation suffix afterwards.
        word = token.translate(translator)
        if word == '':
            continue
        if negated:
            word += neg_suffix

        if word not in all_stopwords:
            tweet.append(word)

    sentiment = int(row[1])

    return tweet, sentiment

In [None]:
# Apply preprocess_row to every row, turning each line into a
# (tweet tokens, sentiment label) pair.
rdd = rdd.map(preprocess_row)

In [None]:
# Show the first 10 preprocessed rows.
rdd.take(10)

In [None]:
# Randomly split the data into training and test sets with 0.7 / 0.3
# weights; the fixed seed keeps the split reproducible.
train_rdd, test_rdd = rdd.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Create the NLTK sentiment analyzer used below for feature extraction,
# training and evaluation.
sentiment_analyzer = SentimentAnalyzer()

In [None]:
# Bring the training set back to the driver as a local list, then get all
# words that appear in it.
train_data = train_rdd.collect()
train_words = sentiment_analyzer.all_words(train_data)

In [None]:
# Get the top 500 word features.
unigram_feats = sentiment_analyzer.unigram_word_feats(train_words, top_n=500)

In [None]:
# Register the unigram feature extractor, restricted to the unigrams
# selected above.
sentiment_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [None]:
# Extract word features from the training data using the registered
# feature extractor(s).
train_feats = sentiment_analyzer.apply_features(train_data)

In [None]:
# Train the Naive Bayes model and save it to disk.
# NLTK treats `save_classifier` as the path of the pickle file to write, so
# it must be a file name rather than a boolean flag.
nb_trainer = NaiveBayesClassifier.train
nb_model = sentiment_analyzer.train(nb_trainer, train_feats, save_classifier='nb_model.pickle')

In [None]:
# Prepare the test data: collect it to the driver as a local list and
# extract its word features.
test_data = test_rdd.collect()
test_feats = sentiment_analyzer.apply_features(test_data)

In [None]:
# Make predictions and evaluate the results on the test set.
# evaluate() classifies (features, label) pairs, so it must receive the
# extracted feature set rather than the raw (tokens, label) documents.
test_results = sentiment_analyzer.evaluate(test_feats, verbose=True)

To be continued...