# Real-Time Sentiment Analysis on Twitter Data

The goal of this mini project is to collect data from Twitter and perform sentiment analysis on tweets in real time using the Spark Streaming API. A predictive model classifies each downloaded tweet as expressing either a positive (1) or a negative (0) feeling.

For this task, a corpus (data set) of over 1.5 million prelabeled tweets was collected from a [Kaggle competition](https://inclass.kaggle.com/c/si650winter11) hosted by the University of Michigan. The Naive Bayes algorithm is then used to train the model on this data.

In [None]:
# PySpark classes.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [None]:
# NLTK classes and functions.
from nltk.sentiment import SentimentAnalyzer
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.corpus import stopwords
from nltk.sentiment.util import *

In [None]:
# Modules for HTTP requests.
import requests_oauthlib
import requests

In [None]:
# Miscellaneous modules.
import operator
import string
import time
import json
import ast
import re

## Model training

### Data load

In [None]:
# Reuse the SparkContext supplied by the PySpark shell/kernel when there is
# one, and create it otherwise, so that `sc` is always defined before use.
sc = SparkContext.getOrCreate()

# Load the training CSV file to an RDD of text lines.
rdd = sc.textFile('labeled_tweets.csv')

In [None]:
# Show the first 10 raw lines of the file (nothing has been filtered or
# parsed yet at this point).
rdd.take(10)

In [None]:
# The first line of the file is the CSV header, not a tweet: keep only the
# lines that differ from it.
header = rdd.first()
rdd = rdd.filter(lambda line: line != header)

In [None]:
# Show the first 10 lines again to check the result of the previous step.
rdd.take(10)

### Data preprocessing

In [None]:
# Get the English stopwords with and without negation marks ('_NEG' suffix).
# They are stored in a frozenset because the collection is only consulted
# with `in`, once per word of every tweet: a hash lookup is O(1) whereas
# scanning a list is linear in the number of stopwords.
english_stopwords = stopwords.words('english')
all_stopwords = frozenset(
    english_stopwords + [word + '_NEG' for word in english_stopwords])

In [None]:
def preprocess_row(row):
    """Extract the tweet contents and the sentiment label from a row.

    Args:
        row: One line of the training CSV, as a string. The sentiment label
            is read from the second field and the tweet text from the
            fourth one.

    Returns:
        A ``(tweet, sentiment)`` tuple. ``tweet`` is the list of lowercase
        words left after removing punctuation, empty tokens and stopwords;
        ``sentiment`` is the label exactly as written in the file (a string).

    Raises:
        IndexError: If the row has fewer than four comma-separated fields.
    """
    # Split on the first three commas only: any later comma is part of the
    # tweet text and must not cut the tweet short.
    # NOTE(review): this assumes the tweet text is the last column of the
    # file -- confirm against the CSV header.
    fields = row.split(',', 3)
    translator = str.maketrans('', '', string.punctuation)

    # Collapse runs of spaces and convert to lowercase.
    tweet = re.sub(' +', ' ', fields[3]).lower()
    # Strip punctuation and split into words.
    # NOTE(review): mark_negation receives the whole tweet as one string
    # here, while NLTK documents it as taking a list of words -- confirm
    # that negation marking actually takes effect.
    tweet = mark_negation(tweet).translate(translator).split(' ')
    # Drop empty tokens and stopwords.
    tweet = [word for word in tweet if word != '' and word not in all_stopwords]

    sentiment = fields[1]

    return tweet, sentiment

In [None]:
# Apply preprocess_row to every line, turning the RDD of CSV lines into an
# RDD of (words, sentiment) pairs.
rdd = rdd.map(preprocess_row)

In [None]:
# Show the first 10 preprocessed (words, sentiment) pairs.
rdd.take(10)

In [None]:
# Randomly split the rows into a training set (weight 0.7) and a test set
# (weight 0.3), with a fixed seed.
split_weights = [0.7, 0.3]
train_rdd, test_rdd = rdd.randomSplit(split_weights, seed=42)

### Feature extraction

In [None]:
# Create the NLTK SentimentAnalyzer; the same object is used afterwards for
# feature extraction, training and evaluation.
sentiment_analyzer = SentimentAnalyzer()

In [None]:
# Bring the whole training set into a local list with collect(), then get
# all the words it contains.
train_data = train_rdd.collect()
train_words = sentiment_analyzer.all_words(train_data)

In [None]:
# Get the top 1000 word (unigram) features among the training words.
unigram_feats = sentiment_analyzer.unigram_word_feats(train_words, top_n=1000)

In [None]:
# Register extract_unigram_feats as the feature extractor; the selected
# unigrams are handed to it through the `unigrams` keyword argument.
sentiment_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [None]:
# Apply the registered feature extractor to the training data.
train_feats = sentiment_analyzer.apply_features(train_data)

### Training and evaluation

In [None]:
# Train the Naive Bayes model.
# NLTK documents `save_classifier` as the name of the file in which the
# trained classifier is stored, not as an on/off flag, so a real file name is
# passed instead of True.
nb_trainer = NaiveBayesClassifier.train
nb_model = sentiment_analyzer.train(
    nb_trainer, train_feats, save_classifier='nb_model.pickle')

In [None]:
# Prepare the test data: bring the test set into a local list with collect()
# and extract its features with the same analyzer as the training data.
test_data = test_rdd.collect()
test_feats = sentiment_analyzer.apply_features(test_data)

In [None]:
# Make predictions on the test features and evaluate them against the test
# labels; the evaluation is requested in verbose mode.
test_results = sentiment_analyzer.evaluate(test_feats, verbose=True)

## Data streaming

In [None]:
# Stream update interval (in seconds); passed to StreamingContext as the
# batch duration.
MINIBATCH_INTERVAL = 10

In [None]:
# Maximum number of tweets downloaded by one call to get_tweets().
TWEETS_PER_BATCH = 100

In [None]:
# We're interested in tweets containing this term; it becomes the value of
# the `track` parameter of the filter URL.
search_term = 'Trump'

In [None]:
# Create the streaming context on top of the Spark context, with
# MINIBATCH_INTERVAL as its batch duration.
ssc = StreamingContext(sc, MINIBATCH_INTERVAL)

In [None]:
# Configure the stream.
# The queue of RDDs is empty and `default` is an RDD holding one placeholder
# element.
# NOTE(review): presumably Spark then emits `default` on every batch, so the
# stream only serves as a periodic trigger -- confirm against the queueStream
# documentation.
empty_rdd = sc.parallelize([0])
stream = ssc.queueStream([], default=empty_rdd)

### Twitter authentication

In [None]:
# Consumer keys and access tokens for the Twitter API.
# These are empty placeholders: fill in your own credentials before running
# the streaming cells, and avoid committing real values.
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

In [None]:
# Twitter URLs.
# The search term is percent-encoded so that terms containing spaces, '#',
# '&' or non-ASCII characters still form a valid `track` query parameter.
# For a plain word such as 'Trump' the resulting URL is unchanged.
from urllib.parse import quote

sample_url = 'https://stream.twitter.com/1.1/statuses/sample.json'
filter_url = ('https://stream.twitter.com/1.1/statuses/filter.json?track='
              + quote(search_term, safe=''))

In [None]:
# Get the OAuth 1 authentication object built from the consumer key/secret
# and the access token/secret; it is passed as `auth` to requests.get().
auth = requests_oauthlib.OAuth1(consumer_key, consumer_secret,
                                access_token, access_token_secret)

### Tweet classification

In [None]:
def get_tweets():
    """Connect to Twitter and download a certain number of tweets.

    Opens a streaming request to ``filter_url`` and yields the ``text`` field
    of each JSON object received, until ``TWEETS_PER_BATCH`` texts have been
    produced or the stream ends. Lines that are empty, are not valid JSON or
    carry no ``text`` field are skipped and do not count toward the limit.

    Yields:
        str: The text of one tweet.
    """
    response = requests.get(filter_url, auth=auth, stream=True)
    print(filter_url, response)

    count = 0
    try:
        if TWEETS_PER_BATCH <= 0:
            return
        for line in response.iter_lines():
            # Blank lines carry no payload; skip them before parsing.
            if not line:
                continue
            # Catch only parsing/lookup failures. A bare `except:` would also
            # swallow the GeneratorExit raised at `yield` when the consumer
            # closes this generator early.
            try:
                text = json.loads(line.decode('utf-8'))['text']
            except (ValueError, KeyError, TypeError):
                continue
            count += 1
            yield text
            # Check the limit right after yielding so that no extra line is
            # awaited from the stream once the batch is complete.
            if count >= TWEETS_PER_BATCH:
                break
    finally:
        # Always release the streaming connection, whether the batch
        # completed, the stream ended or the generator was closed early.
        response.close()

In [None]:
# Specify the data transform functions.
# Both lambdas ignore their first argument: every element of the incoming RDD
# is replaced by the tweets produced by one call to get_tweets().
stream = stream.transform(lambda _, rdd: rdd.flatMap(lambda __: get_tweets()))

In [None]:
def preprocess_and_classify(row):
    """Clean the text of one tweet and predict its sentiment.

    Args:
        row: The text of the tweet, as a string.

    Returns:
        A ``(tweet, sentiment)`` tuple: the list of words kept after cleaning
        and the label returned by the Naive Bayes model for them. The pair is
        also printed.
    """
    # Collapse runs of spaces, then convert to lowercase.
    text = re.sub(' +', ' ', row)
    text = text.lower()

    # Strip punctuation and split into words.
    # NOTE(review): mark_negation receives the whole tweet as one string
    # here, while NLTK documents it as taking a list of words -- confirm
    # that negation marking actually takes effect.
    punctuation_table = str.maketrans('', '', string.punctuation)
    words = mark_negation(text).translate(punctuation_table).split(' ')

    # Discard empty tokens and stopwords.
    tweet = [word for word in words if word and word not in all_stopwords]

    # Wrap the words as a single document with an empty label and extract
    # its features.
    featuresets = sentiment_analyzer.apply_features([(tweet, '')])

    # Classify the features of that single document with the trained model.
    sentiment = nb_model.classify(featuresets[0][0])
    print(tweet, sentiment)

    return tweet, sentiment

In [None]:
# All results will be stored in this list: update_results() appends one
# [time, counts] entry to it each time it runs.
results = []

In [None]:
def update_results(rdd):
    """Classify the tweets of an RDD and record how many fall in each class.

    Appends ``[time, counts]`` to the module-level ``results`` list, where
    ``time`` is the current time formatted as ``%I:%M:%S`` and ``counts`` is a
    list of ``(sentiment, number_of_tweets)`` pairs, then prints that entry.

    Args:
        rdd: RDD whose elements are tweet texts.
    """
    # Pair every predicted sentiment with 1 and add the ones up per sentiment.
    counts_rdd = (
        rdd.map(lambda text: (preprocess_and_classify(text)[1], 1))
        .reduceByKey(operator.add)
    )

    # The timestamp is taken before the counts are collected.
    timestamp = time.strftime("%I:%M:%S")
    entry = [timestamp, counts_rdd.collect()]

    # `results` is only mutated, never rebound, so no `global` is required.
    results.append(entry)

    print(entry)

In [None]:
# Specify the function that runs for each minibatch: update_results is
# called with the RDD; the first lambda argument is ignored.
stream.foreachRDD(lambda _, rdd: update_results(rdd))

### Streaming control

In [None]:
# Start the streaming.
# awaitTermination() is left disabled below.
# NOTE(review): presumably so that this cell returns and the following cells
# can watch `results` and stop the stream -- confirm.
ssc.start()
# ssc.awaitTermination()

In [None]:
# Wait just until we get a few minibatches (more than 10 entries in
# `results`). Sleep between checks instead of spinning, so that waiting does
# not keep a CPU core busy while the streaming job is doing the actual work.
while len(results) <= 10:
    time.sleep(1)

## Results

In [None]:
# Get the results gathered so far in an RDD and display them.
results_rdd = sc.parallelize(results)
results_rdd.collect()

In [None]:
# Save the results in a text file whose name is 'r' followed by the current
# time (hour, minute, second).
# NOTE(review): %I is the 12-hour clock, so two runs exactly 12 hours apart
# produce the same name -- confirm this is acceptable.
filename = 'r' + time.strftime("%I%M%S")
results_rdd.saveAsTextFile(filename)

In [None]:
# Stop the streaming context.
ssc.stop()