In [1]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc =SparkContext()
sqlContext = SQLContext(sc)
data = sqlContext.read.format('csv').options(header='true', inferschema='true').load('text_emotion.csv')


In [4]:
print(data.head(5))
drop_list=['tweet_id','author']
data=data.select([column for column in data.columns if column not in drop_list])
data.show(5)

[Row(tweet_id=1956967341, sentiment='empty', author='xoshayzers', content='@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =['), Row(tweet_id=1956967666, sentiment='sadness', author='wannamama', content='Layin n bed with a headache  ughhhh...waitin on your call...'), Row(tweet_id=1956967696, sentiment='sadness', author='coolfunky', content='Funeral ceremony...gloomy friday...'), Row(tweet_id=1956967789, sentiment='enthusiasm', author='czareaquino', content='wants to hang out with friends SOON!'), Row(tweet_id=1956968416, sentiment='neutral', author='xkilljoyx', content='@dannycastillo We want to trade with someone who has Houston tickets, but no one will.')]
+----------+--------------------+
| sentiment|             content|
+----------+--------------------+
|     empty|@tiffanylue i kno...|
|   sadness|Layin n bed with ...|
|   sadness|Funeral ceremony....|
|enthusiasm|wants to hang out...|
|   neutral|@dannycastillo We...|
+----------+-------

In [6]:
data.printSchema()

root
 |-- sentiment: string (nullable = true)
 |-- content: string (nullable = true)



In [7]:
from pyspark.sql.functions import col
data.groupBy("sentiment") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+----------+-----+
| sentiment|count|
+----------+-----+
|   neutral| 8638|
|     worry| 8459|
| happiness| 5209|
|   sadness| 5165|
|      love| 3842|
|  surprise| 2187|
|       fun| 1776|
|    relief| 1526|
|      hate| 1323|
|     empty|  827|
|enthusiasm|  759|
|   boredom|  179|
|     anger|  110|
+----------+-----+



In [8]:
data.groupBy("content") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|             content|count|
+--------------------+-----+
|I just received a...|   14|
|FREE UNLIMITED RI...|   13|
| Happy Mother's Day!|   10|
|   Happy Mothers Day|   10|
|  happy mother's day|    8|
|http://snipurl.co...|    7|
|        Good Morning|    6|
|   happy mothers day|    5|
|        Good morning|    5|
| happy mother's day!|    4|
|  Happy mothers day!|    4|
|Happy Mother's Da...|    4|
|   i have a headache|    4|
|            Headache|    4|
|                   0|    4|
|@DougieMcfly Haha...|    3|
|  Happy Mothers Day!|    3|
|  HAPPY MOTHERS DAY!|    3|
|Good morning ever...|    3|
|             nothing|    3|
+--------------------+-----+
only showing top 20 rows



In [9]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")
# stop words
add_stopwords = ["http","https","amp","rt","t","c","the","@","#","is","this","the","that"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+----------+--------------------+--------------------+--------------------+------------+-----+
| sentiment|             content|               words|            filtered|    features|label|
+----------+--------------------+--------------------+--------------------+------------+-----+
|     empty|@tiffanylue i kno...|[ i know  i was l...|[ i know  i was l...|(1018,[],[])|  9.0|
|   sadness|Layin n bed with ...|[layin n bed with...|[layin n bed with...|(1018,[],[])|  3.0|
|   sadness|Funeral ceremony....|[funeral ceremony...|[funeral ceremony...|(1018,[],[])|  3.0|
|enthusiasm|wants to hang out...|[wants to hang ou...|[wants to hang ou...|(1018,[],[])| 10.0|
|   neutral|@dannycastillo We...|[ we want to trad...|[ we want to trad...|(1018,[],[])|  0.0|
+----------+--------------------+--------------------+--------------------+------------+-----+
only showing top 5 rows



In [11]:
# set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 28008
Test Dataset Count: 11992


In [12]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("content","sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+---------+------------------------------+-----+----------+
|                       content|sentiment|                   probability|label|prediction|
+------------------------------+---------+------------------------------+-----+----------+
|@AngelaIsshay oh that's wha...| surprise|[0.6180130460190141,0.07249...|  5.0|       0.0|
|#frenchieb-day #frenchieb-d...|  neutral|[0.6045925760682105,0.10797...|  0.0|       0.0|
|#frenchieb-day #frenchieb-d...|  neutral|[0.6045925760682105,0.10797...|  0.0|       0.0|
|@Dani___ okay, i'll finally...|    worry|[0.5916788072573377,0.08004...|  1.0|       0.0|
|@filos @elliottucker thanks...|happiness|[0.5570483594039325,0.10966...|  2.0|       0.0|
|@r_u_b_y_l totes it's gonna...|happiness|[0.5457999874351035,0.04101...|  2.0|       0.0|
|Sorry for the apparent spam...|    worry|[0.5453218908102234,0.11541...|  1.0|       0.0|
|@heycassadee SUPER EXCITED....|happiness|[0.5230171482299447,0.11847...|  2.0|       0.0|

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.16604458421594048

In [14]:
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("content","sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+---------+------------------------------+-----+----------+
|                       content|sentiment|                   probability|label|prediction|
+------------------------------+---------+------------------------------+-----+----------+
|@wishinghearts oh wow! I ha...|     love|[0.8370347841026007,0.03386...|  4.0|       0.0|
|Is wondering why my message...|    worry|[0.8189054337246018,0.01792...|  1.0|       0.0|
|i jus love doin night shift...|      fun|[0.7800398985268713,0.03904...|  6.0|       0.0|
|: experiencing the unique #...|     love|[0.7748558683956319,0.07558...|  4.0|       0.0|
|Going out to eat with my en...|      fun|[0.771389017508997,0.048978...|  6.0|       0.0|
|@maroon5princess I used to ...|      fun|[0.7660878676121049,0.03296...|  6.0|       0.0|
|@aMj89 Until @twitter bring...|  neutral|[0.7508590423002167,0.04367...|  0.0|       0.0|
|@jojototh @abduzeedo firefo...|     hate|[0.742498051950582,0.045592...|  8.0|       0.0|

In [15]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)


0.16917440122122518

In [16]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.17466151364040416

In [18]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("content","sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+---------+------------------------------+-----+----------+
|                       content|sentiment|                   probability|label|prediction|
+------------------------------+---------+------------------------------+-----+----------+
|#frenchieb-day #frenchieb-d...|  neutral|[0.9999985676382841,8.01437...|  0.0|       0.0|
|#frenchieb-day #frenchieb-d...|  neutral|[0.9999985676382841,8.01437...|  0.0|       0.0|
|@pro01, @if__fi: ??????? ??...|  neutral|[0.9943831803147599,9.54821...|  0.0|       0.0|
|????, ?? #sctest ????? ?? ?...| surprise|[0.9769181613803867,0.00581...|  5.0|       0.0|
|??????? ????????? ?? Google...|  neutral|[0.9261944635458372,0.02311...|  0.0|       0.0|
|@xoangelbabiixo @Babygirl94...|    worry|[0.897646918378875,0.033665...|  1.0|       0.0|
|@erinhosborn Thanks Legolas...|happiness|[0.8867027204178377,0.02042...|  2.0|       0.0|
|@Atomik re: the job ... sti...|  neutral|[0.8599918839285794,0.06840...|  0.0|       0.0|

In [19]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.17517654934338553

In [21]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("content","sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+---------+------------------------------+-----+----------+
|                       content|sentiment|                   probability|label|prediction|
+------------------------------+---------+------------------------------+-----+----------+
|@simbaaa @mahimaaa SORRY! I...|  sadness|[0.22419877030254007,0.2190...|  3.0|       0.0|
|you always seem to know exa...|     love|[0.22348869414567857,0.2026...|  4.0|       0.0|
|@Jean_Pierce Well the Enter...|     love|[0.22304441539826048,0.2085...|  4.0|       0.0|
|FOR @john_b_waters and  @mr...|  neutral|[0.22304441539826048,0.2085...|  0.0|       0.0|
|@r_u_b_y_l totes it's gonna...|happiness|[0.22304441539826048,0.2085...|  2.0|       0.0|
|4 novos followers http://mi...|    worry|[0.22295078099920929,0.2201...|  1.0|       0.0|
|@kenyaimagine ... much more...|  sadness|[0.22295078099920929,0.2201...|  3.0|       0.0|
|@nomaditation  oo? u?op ?p?...|  neutral|[0.22277043538910077,0.2055...|  0.0|       0.0|

In [22]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.10322277595507494