In [None]:
# Run this cell once, then restart the kernel before running the other cells.
# It installs nltk with pip into the user site-packages (--user) of the Python
# interpreter that backs this kernel (sys.executable).
# NOTE(review): a later cell also imports bs4, which is not installed here —
# confirm it is already available in the environment.
import sys
!{sys.executable} -m pip install nltk --user

In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below; getOrCreate() is called on
# the default builder, with no extra configuration set here.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Day of raw tweets to load, written as the YYYY/MM/DD sub-path of the tweet lake.
# Change the date to today before running the notebook.
TWEETS_DATE = "2018/08/14"
TWEETS_PATH = "hdfs:///user/nobody/tweet-lake/raw/{}/*".format(TWEETS_DATE)

# Parse the JSON files of that day, register them as the "tweets" SQL view
# and display the number of tweets
df_tweets = spark.read.json(TWEETS_PATH)
df_tweets.createOrReplaceTempView("tweets")
df_tweets.count()

In [None]:
# Find tweets containing emojis and use the emojis to label them:
#   positive = at least one positive emoji and no negative emoji
#   negative = at least one negative emoji and no positive emoji
# Tweets that mix both groups, or contain none of these emojis, are left out.
# Each emoji group is listed once so the two queries cannot drift apart.
POSITIVE_EMOJIS = ["\U0001F60D", "\U0001F60A", "\U0001F604", "\U0001F603", "\U0001F600", "\U0001F606"]
NEGATIVE_EMOJIS = ["\U0001F62D", "\U0001F612", "\U0001F629", "\U0001F61E", "\U0001F62A"]

# Seed for the shuffle of the labelled tweets, so the ordering is repeatable
# for the same input data.
SHUFFLE_SEED = 42


def _emoji_query(wanted, unwanted):
    """Build the SQL selecting the text of tweets with any `wanted` emoji and no `unwanted` emoji.

    Args:
        wanted: emojis of which at least one must appear in the tweet text.
        unwanted: emojis that must all be absent from the tweet text.

    Returns:
        The SELECT statement (str) to run against the "tweets" view.
    """
    any_wanted = " OR ".join("text LIKE '%{}%'".format(emoji) for emoji in wanted)
    no_unwanted = " AND ".join("text NOT LIKE '%{}%'".format(emoji) for emoji in unwanted)
    return "SELECT text FROM tweets WHERE ({}) AND {}".format(any_wanted, no_unwanted)


df_positive_tweets = spark.sql(_emoji_query(POSITIVE_EMOJIS, NEGATIVE_EMOJIS))
df_positive_tweets.createOrReplaceTempView("positive_tweets")
df_negative_tweets = spark.sql(_emoji_query(NEGATIVE_EMOJIS, POSITIVE_EMOJIS))
df_negative_tweets.createOrReplaceTempView("negative_tweets")

# Label positive tweets 1.0 and negative tweets 0.0, then shuffle the union.
df_sentiments = spark.sql(
    "(SELECT text, CAST(1 AS DOUBLE) AS sentiment FROM positive_tweets) "
    "UNION ALL "
    "(SELECT text, CAST(0 AS DOUBLE) AS sentiment FROM negative_tweets) "
    "ORDER BY RAND({})".format(SHUFFLE_SEED)
)
df_sentiments.count()

In [None]:
# Define a function to clean (a little bit) the tweets
# Taken from https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-2-333514854913
import re
# NOTE(review): BeautifulSoup is not referenced in the visible cells — confirm it is still needed.
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
# NOTE(review): `tok` is not used by the cleaning function below — confirm before removing.
tok = WordPunctTokenizer()

# Patterns for the noise stripped from every tweet.
pat1 = r'@[A-Za-z0-9_]+'   # @mentions
pat2 = r'https?://[^ ]+'   # http:// and https:// links
combined_pat = r'|'.join((pat1, pat2))
# The dot is escaped so that only a literal "www." starts a match. Unescaped, it
# matches any character, which silently deletes text such as the "www so" in
# "awww so cute".
www_pat = r'www\.[^ ]+'

# Contracted negations and their expanded forms (all lower case, because the
# text is lower-cased before the substitution).
negations_dic = {
    "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
    "haven't": "have not", "hasn't": "has not", "hadn't": "had not", "won't": "will not",
    "wouldn't": "would not", "don't": "do not", "doesn't": "does not", "didn't": "did not",
    "can't": "can not", "couldn't": "could not", "shouldn't": "should not", "mightn't": "might not",
    "mustn't": "must not",
}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')


def tweet_cleaner_updated(row):
    """Clean the text of one labelled tweet.

    Steps, in order: remove @mentions and http(s) links, remove "www." links,
    lower-case, expand the contractions listed in `negations_dic`, replace every
    non-letter character with a space, collapse runs of spaces, and strip the
    spaces left at both ends.

    Args:
        row: object exposing a `text` string attribute and a `sentiment` attribute.

    Returns:
        A `(cleaned_text, sentiment)` tuple; `sentiment` is passed through unchanged.
    """
    without_links = re.sub(combined_pat, '', row.text)
    without_links = re.sub(www_pat, '', without_links)
    lowered = without_links.lower()
    expanded = neg_pattern.sub(lambda match: negations_dic[match.group()], lowered)
    letters_only = re.sub("[^a-zA-Z]", " ", expanded)
    # Strip as well as collapse: a removed mention or link at either end would
    # otherwise leave padding that counts toward the later length filter.
    cleaned = re.sub(' +', ' ', letters_only).strip()
    return cleaned, row.sentiment

In [None]:
# Run the cleaner over every labelled tweet, expose the result as the "clean"
# SQL view, and keep only the rows whose cleaned text is longer than 50 characters.
rdd_clean = df_sentiments.rdd.map(tweet_cleaner_updated)
df_clean = rdd_clean.toDF(["text", "sentiment"])
df_clean.createOrReplaceTempView("clean")
df_final = spark.sql(
    "SELECT * FROM clean WHERE LENGTH(text) > 50"
)
df_final.show()

In [None]:
# Split the dataset for training (90%), validation (5%) and testing (5%) purposes.
# The explicit seed makes the split repeatable for the same input data.
# NOTE(review): test_set is not referenced in the visible cells.
SPLIT_SEED = 42
(train_set, val_set, test_set) = df_final.randomSplit([0.90, 0.05, 0.05], seed=SPLIT_SEED)

In [None]:
# Prepare the data: turn the cleaned text into TF-IDF feature vectors and the
# sentiment column into an indexed "label" column.
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
# minDocFreq: remove sparse terms
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)
# NOTE(review): StringIndexer orders labels by frequency by default, so label 0.0
# may not correspond to sentiment 0.0 — confirm the mapping before interpreting
# predictions.
label_stringIdx = StringIndexer(inputCol="sentiment", outputCol="label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

# Fit the stages on the training set only, then apply the fitted pipeline to the
# training and validation sets.
pipelineFit = pipeline.fit(train_set)
train_df, val_df = (pipelineFit.transform(subset) for subset in (train_set, val_set))
train_df.show(5)

In [None]:
# Create and evaluate the model
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Fit a logistic regression (maxIter=20) on the prepared training set.
lr = LogisticRegression(maxIter=20)
lrModel = lr.fit(train_df)

# Score the validation set and display the evaluator's metric computed from the
# "rawPrediction" column.
predictions = lrModel.transform(val_df)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

In [None]:
# Save the model in HDFS.
# write().overwrite() replaces a model already stored at this path; a plain
# save() fails when the path exists, which would break every re-run of the notebook.
lrModel.write().overwrite().save("hdfs:///user/client/tweets_model")