# Twitter Sentiment Analysis

In [0]:
!pip install gensim --upgrade
!pip install keras --upgrade
!pip install pandas --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [0]:
# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Utility
import re

In [0]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Settings

In [0]:
# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC 
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

### Read Dataset

### Dataset details
* **target**: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
* **ids**: The id of the tweet ( 2087)
* **date**: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
* **flag**: The query (lyx). If there is no query, then this value is NO_QUERY.
* **user**: the user that tweeted (robotickilldozr)
* **text**: the text of the tweet (Lyx is cool)

In [0]:
df = spark.sql("select * from default.training_1600000_processed_noemoticon_2_csv TABLESAMPLE")

In [0]:
print("Dataset size:", len(df.columns))

Dataset size: 6


In [0]:
df.show(n=5)

+------+----------+--------------------+--------+---------------+--------------------+
|target|        id|                date|    flag|           user|                text|
+------+----------+--------------------+--------+---------------+--------------------+
|     0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|     0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|     0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|     0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+------+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



### Pre-Process dataset

In [0]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [0]:
def preprocessDef(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

spark.udf.register("preprocess", preprocessDef)
preprocess = udf(preprocessDef)

In [0]:
df = df.select("target", preprocess("text").alias("text"))

### Split train and test

In [0]:
df = df.select("target", "text")
df_train, df_test = df.randomSplit([TRAIN_SIZE, 1-TRAIN_SIZE])
print("TRAIN size:", df_train.count())
print("TEST size:", df_test.count())

TRAIN size: 1280748
TEST size: 319252


### Word2Vec

In [0]:
documents = df_train.select("target", split("text", " ").alias("text"))
w2v_model = Word2Vec(vectorSize=W2V_SIZE, windowSize=W2V_WINDOW, minCount=W2V_MIN_COUNT, inputCol="text", outputCol="features")

In [0]:
w2v_model = w2v_model.fit(documents)

In [0]:
w2v_model.findSynonyms("love",5).show()

+--------+------------------+
|    word|        similarity|
+--------+------------------+
|   loves|0.5852648019790649|
|   adore|0.5670005083084106|
|amaziing|0.5297737717628479|
|  looove|0.5258116722106934|
|     luv|0.5237181782722473|
+--------+------------------+



In [0]:
documents_test = df_test.select("target", split("text", " ").alias("text"))
train_word2vec_df = w2v_model.transform(documents)
test_word2vec_df = w2v_model.transform(documents_test)

### Model

In [0]:
train_word2vec_df.cache()

DataFrame[target: int, text: array<string>, features: vector]

In [0]:
# Train a logistic regression model
MAX_ITERATIONS = 10
lr = LogisticRegression(featuresCol="features", labelCol="target", maxIter=MAX_ITERATIONS)
lr_model = lr.fit(train_word2vec_df)

In [0]:
# Make predictions on the testing set
predictions = lr_model.transform(test_word2vec_df)

In [0]:
predictions.show(n=5)

+------+----+--------------------+--------------------+--------------------+----------+
|target|text|            features|       rawPrediction|         probability|prediction|
+------+----+--------------------+--------------------+--------------------+----------+
|     0|  []|[-8.0287439050152...|[8.17675542045878...|[0.57680723859971...|       0.0|
|     0|  []|[-8.0287439050152...|[8.17675542045878...|[0.57680723859971...|       0.0|
|     0|  []|[-8.0287439050152...|[8.17675542045878...|[0.57680723859971...|       0.0|
|     0|  []|[-8.0287439050152...|[8.17675542045878...|[0.57680723859971...|       0.0|
|     0|  []|[-8.0287439050152...|[8.17675542045878...|[0.57680723859971...|       0.0|
+------+----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



### Evaluation

In [0]:
trainingSummary = lr_model.summary

# Obtain the objective per iteration

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.7239683372529179
FPR: 0.2760294485282593
TPR: 0.7239683372529178
F-measure: 0.723967275072013
Precision: 0.7239734990500657
Recall: 0.7239683372529178


In [0]:
import numpy as np
tp = np.zeros(2,)
fp = np.zeros(2,)

print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))
    fp[i] = rate

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))
    tp[i] = rate

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))


False positive rate by label:
label 0: 0.27812808250820964
label 1: 0.2739330287671318
True positive rate by label:
label 0: 0.7260669712328681
label 1: 0.7218719174917904
Precision by label:
label 0: 0.7228223589963743
label 1: 0.7251234245632519
Recall by label:
label 0: 0.7260669712328681
label 1: 0.7218719174917904
F-measure by label:
label 0: 0.7244410321564836
label 1: 0.7234940178373799


In [0]:
display(predictions.select("prediction").groupby("prediction").count())

prediction,count
0.0,160542
4.0,158710


Databricks visualization. Run in Databricks to view.