# Real-Time Sentiment Analysis on Twitter Data

The goal of this mini project is to collect data from Twitter and perform sentiment analysis on tweets in real-time using the Spark Streaming API. A predictive model classifies the downloaded tweets as being indicators of either positive (1) or negative (0) feelings.

For this task, a corpus (data set) with over 1.5 million prelabeled tweets was collected from a [Kaggle competition](https://inclass.kaggle.com/c/si650winter11) hosted by the University of Michigan. The Naive Bayes algorithm is then used to train the model on such data.

In [1]:
# PySpark classes.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [2]:
# NLTK classes and functions.
from nltk.sentiment import SentimentAnalyzer
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.corpus import stopwords
from nltk.sentiment.util import *

In [3]:
# Modules for HTTP requests.
import requests_oauthlib
import requests

In [4]:
# Miscellaneous modules.
import operator
import string
import time
import json
import ast
import re

## Model training

### Data load

In [5]:
# Load the training CSV file to an RDD.
rdd = sc.textFile('labeled_tweets.csv')

In [6]:
# Show the first rows.
rdd.take(10)

['ItemID,Sentiment,SentimentSource,SentimentText',
 '1,0,Sentiment140,                     is so sad for my APL friend.............',
 '2,0,Sentiment140,                   I missed the New Moon trailer...',
 '3,1,Sentiment140,              omg its already 7:30 :O',
 "4,0,Sentiment140,          .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...",
 '5,0,Sentiment140,         i think mi bf is cheating on me!!!       T_T',
 '6,0,Sentiment140,         or i just worry too much?        ',
 '7,1,Sentiment140,       Juuuuuuuuuuuuuuuuussssst Chillin!!',
 '8,0,Sentiment140,       Sunny Again        Work Tomorrow  :-|       TV Tonight',
 '9,1,Sentiment140,      handed in my uniform today . i miss you already']

In [7]:
# Remove the header.
header = rdd.first()
rdd = rdd.filter(lambda row: row != header)

In [8]:
rdd.take(10)

['1,0,Sentiment140,                     is so sad for my APL friend.............',
 '2,0,Sentiment140,                   I missed the New Moon trailer...',
 '3,1,Sentiment140,              omg its already 7:30 :O',
 "4,0,Sentiment140,          .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...",
 '5,0,Sentiment140,         i think mi bf is cheating on me!!!       T_T',
 '6,0,Sentiment140,         or i just worry too much?        ',
 '7,1,Sentiment140,       Juuuuuuuuuuuuuuuuussssst Chillin!!',
 '8,0,Sentiment140,       Sunny Again        Work Tomorrow  :-|       TV Tonight',
 '9,1,Sentiment140,      handed in my uniform today . i miss you already',
 '10,1,Sentiment140,      hmmmm.... i wonder how she my number @-)']

### Data preprocessing

In [9]:
# Get a list of English stopwords with and without negation marks.
all_stopwords = sorted(
    stopwords.words('english') + [word + '_NEG' for word in stopwords.words('english')])

In [10]:
def preprocess_row(row):
    """Extract the tweet contents and the sentiment label from a row.
    """
    row = row.split(',')
    translator = str.maketrans({key: None for key in string.punctuation})

    # Remove whitespaces, stopwords, punctuation, and convert to lowercase.
    tweet = re.sub(' +', ' ', row[3]).lower()
    tweet = mark_negation(tweet).translate(translator).split(' ')
    tweet = [word for word in tweet if word != '' and word not in all_stopwords]

    sentiment = row[1]

    return tweet, sentiment

In [11]:
# Apply the clean function for all rows.
rdd = rdd.map(preprocess_row)

In [12]:
# Show the results.
rdd.take(10)

[(['sad', 'apl', 'friend'], '0'),
 (['missed', 'new', 'moon', 'trailer'], '0'),
 (['omg', 'already', '730'], '1'),
 (['omgaga',
   'im',
   'sooo',
   'im',
   'gunna',
   'cry',
   'ive',
   'dentist',
   'since',
   '11',
   'suposed',
   '2',
   'get',
   'crown',
   'put',
   '30mins'],
  '0'),
 (['think', 'mi', 'bf', 'cheating', 'tt'], '0'),
 (['worry', 'much'], '0'),
 (['juuuuuuuuuuuuuuuuussssst', 'chillin'], '1'),
 (['sunny', 'work', 'tomorrow', 'tv', 'tonight'], '0'),
 (['handed', 'uniform', 'today', 'miss', 'already'], '1'),
 (['hmmmm', 'wonder', 'number'], '1')]

In [13]:
# Get the training and test sets.
train_rdd, test_rdd = rdd.randomSplit([0.7, 0.3], seed=42)

### Feature extraction

In [14]:
# Create the analyzer.
sentiment_analyzer = SentimentAnalyzer()

In [15]:
# Get all words in the training data.
train_data = train_rdd.take(200000)
train_words = sentiment_analyzer.all_words(train_data)

In [16]:
# Get the top 1000 word features.
unigram_feats = sentiment_analyzer.unigram_word_feats(train_words, top_n=1000)

In [17]:
# Define the feature extractor.
sentiment_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [18]:
# Extract word features in the training data.
train_feats = sentiment_analyzer.apply_features(train_data)

### Training and evaluation

In [19]:
# Train the Naive Bayes model.
nb_trainer = NaiveBayesClassifier.train
nb_model = sentiment_analyzer.train(nb_trainer, train_feats, save_classifier=True)

Training classifier
Saving True


In [20]:
# Prepare the test data.
test_data = test_rdd.take(50000)
test_feats = sentiment_analyzer.apply_features(test_data)

In [21]:
# Make predictions and evaluate in the test set.
test_results = sentiment_analyzer.evaluate(test_feats, verbose=True)

Evaluating NaiveBayesClassifier results...
Accuracy: 0.71364
F-measure [0]: 0.6273295158771474
F-measure [1]: 0.7674894446248782
Precision [0]: 0.7012102874432677
Precision [1]: 0.7201499360029255
Recall [0]: 0.5675332014693416
Recall [1]: 0.8214906486824723


## Data streaming

In [22]:
# Stream update interval (in seconds).
MINIBATCH_INTERVAL = 10

In [23]:
# Maximum number of tweets downloaded at once.
TWEETS_PER_BATCH = 100

In [24]:
# We're interested in tweets containing this term.
search_term = 'Trump'

In [25]:
# Create the streaming context.
ssc = StreamingContext(sc, MINIBATCH_INTERVAL)

In [26]:
# Configure the stream.
empty_rdd = sc.parallelize([0])
stream = ssc.queueStream([], default=empty_rdd)

### Twitter authentication

In [27]:
# Consumer keys and access tokens for the Twitter API.
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

In [28]:
# Twitter URLs.
sample_url = 'https://stream.twitter.com/1.1/statuses/sample.json'
filter_url = 'https://stream.twitter.com/1.1/statuses/filter.json?track=' + search_term

In [29]:
# Get the authentication object.
auth = requests_oauthlib.OAuth1(consumer_key, consumer_secret,
                                access_token, access_token_secret)

### Tweet classification

In [30]:
def get_tweets():
    """Connect to Twitter and download a certain number of tweets.
    """
    response = requests.get(filter_url, auth=auth, stream=True)
    print(filter_url, response)
    
    count = 0
    for line in response.iter_lines():
        if count >= TWEETS_PER_BATCH:
            break
        try:
            post = json.loads(line.decode('utf-8'))
            count += 1
            yield post['text']
        except:
            result = False

In [31]:
# Specify the data transform functions.
stream = stream.transform(lambda _, rdd: rdd.flatMap(lambda __: get_tweets()))

In [32]:
def preprocess_and_classify(row):
    """Get tweets, put them in the right format and classify them.
    """
    translator = str.maketrans({key: None for key in string.punctuation})

    # Remove whitespaces, stopwords, punctuation, and convert to lowercase.
    tweet = re.sub(' +', ' ', row).lower()
    tweet = mark_negation(tweet).translate(translator).split(' ')
    tweet = [word for word in tweet if word != '' and word not in all_stopwords]

    # Get the tweet features.
    data = [(tweet, '')]
    data = sentiment_analyzer.apply_features(data)

    # Classify it using the Naive Bayes model.
    sentiment = nb_model.classify(data[0][0])
    print(tweet, sentiment)

    return tweet, sentiment

In [33]:
# All results will be stored in this list.
results = []

In [34]:
def update_results(rdd):
    """Count predictions of each class and add them to the results list.
    """
    global results

    # Count tweets classified as 0 and 1.
    sentiments_rdd = rdd.map(lambda row: (preprocess_and_classify(row)[1], 1))
    counts_rdd = sentiments_rdd.reduceByKey(operator.add)

    # Add these counts to the global results.
    result = [time.strftime("%I:%M:%S"), counts_rdd.collect()]
    results.append(result)

    print(result)

In [35]:
# Specify the function that runs for each minibatch.
stream.foreachRDD(lambda _, rdd: update_results(rdd))

### Streaming control

In [36]:
# Start the streaming.
ssc.start()
# ssc.awaitTermination()

In [37]:
# Wait just until we get a few minibatches.
while True:
    if len(results) > 10:
        break

['04:57:11', []]
['04:57:21', [('0', 30), ('1', 66)]]
['04:57:31', [('1', 79), ('0', 15)]]
['04:57:41', [('1', 74), ('0', 22)]]
['04:57:50', [('1', 79), ('0', 17)]]
['04:58:01', [('1', 74), ('0', 22)]]
['04:58:10', [('1', 75), ('0', 21)]]
['04:58:20', []]
['04:58:30', [('1', 78), ('0', 18)]]
['04:58:41', [('0', 28), ('1', 68)]]
['04:58:51', [('1', 76), ('0', 21)]]


## Results

In [38]:
# Get the results in an RDD.
results_rdd = sc.parallelize(results)
results_rdd.collect()

[['04:57:11', []],
 ['04:57:21', [('0', 30), ('1', 66)]],
 ['04:57:31', [('1', 79), ('0', 15)]],
 ['04:57:41', [('1', 74), ('0', 22)]],
 ['04:57:50', [('1', 79), ('0', 17)]],
 ['04:58:01', [('1', 74), ('0', 22)]],
 ['04:58:10', [('1', 75), ('0', 21)]],
 ['04:58:20', []],
 ['04:58:30', [('1', 78), ('0', 18)]],
 ['04:58:41', [('0', 28), ('1', 68)]],
 ['04:58:51', [('1', 76), ('0', 21)]]]

In [39]:
# Save the results in a text file.
filename = 'r' + time.strftime("%I%M%S")
results_rdd.saveAsTextFile(filename)

In [40]:
# Stop the streaming.
ssc.stop()