## Sentiment Analysis with TextBlob

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
import string

# Build (or reuse) the session first and take the context from it.
# A bare SparkContext() raises ValueError when a context is already running
# (e.g. when this cell is re-run in the same kernel), and creating the context
# before the builder means the appName below is not applied to it.
spark = SparkSession.builder.appName("SentimentAnalysis").getOrCreate()
sc = spark.sparkContext

In [2]:
# Read the raw capture file as an RDD of text lines.
lines = sc.textFile("data.txt")

# Cut every line at the b' marker, then keep only the non-empty fragments
# that contain no backslash.
fragments = lines.flatMap(lambda raw: raw.split('b\''))
usable = fragments.filter(lambda frag: frag != '' and '\\' not in frag)

# One-column DataFrame: each surviving fragment becomes a 'tweet' row.
df = usable.map(lambda frag: (frag, )).toDF(['tweet'])

# Register the frame as a temp view and read it back through Spark SQL.
df.createOrReplaceTempView('tweets')
tweets = spark.sql('select * from tweets')
tweets.show(5)

+--------------------+
|               tweet|
+--------------------+
|Listening on port...|
|Received request ...|
|If Ashley Purdy l...|
|@jennyhalasz I pr...|
|@Starecrows the o...|
+--------------------+
only showing top 5 rows



In [3]:
# Tokenise the 'tweet' column into a 'words' array column, using the
# non-word-character regex as the tokenizer pattern.
regexTokenizer = RegexTokenizer(
    pattern='\\W',
    inputCol='tweet',
    outputCol='words',
)
regexTokenized = regexTokenizer.transform(tweets)
regexTokenized.show(5)

+--------------------+--------------------+
|               tweet|               words|
+--------------------+--------------------+
|Listening on port...|[listening, on, p...|
|Received request ...|[received, reques...|
|If Ashley Purdy l...|[if, ashley, purd...|
|@jennyhalasz I pr...|[jennyhalasz, i, ...|
|@Starecrows the o...|[starecrows, the,...|
+--------------------+--------------------+
only showing top 5 rows



In [4]:
# Only the token column is carried forward; the raw 'tweet' text is dropped here.
words_only = regexTokenized.select(['words'])

# Strip stop words from 'words' into a new 'filtered' column.
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
stop_words_removed = remover.transform(words_only)
stop_words_removed.show(5)

+--------------------+--------------------+
|               words|            filtered|
+--------------------+--------------------+
|[listening, on, p...|[listening, port,...|
|[received, reques...|[received, reques...|
|[if, ashley, purd...|[ashley, purdy, l...|
|[jennyhalasz, i, ...|[jennyhalasz, pro...|
|[starecrows, the,...|[starecrows, open...|
+--------------------+--------------------+
only showing top 5 rows



In [5]:
from pyspark.sql import functions as f

# SQL higher-order filter over the 'filtered' array: an element is kept
# unless its length is below 4.
long_tokens_only = f.expr('filter(filtered, x -> not(length(x) < 4))')

cleaned_df = stop_words_removed.withColumn('cleaned', long_tokens_only)
cleaned_df.show(5)

+--------------------+--------------------+--------------------+
|               words|            filtered|             cleaned|
+--------------------+--------------------+--------------------+
|[listening, on, p...|[listening, port,...|[listening, port,...|
|[received, reques...|[received, reques...|[received, reques...|
|[if, ashley, purd...|[ashley, purdy, l...|[ashley, purdy, l...|
|[jennyhalasz, i, ...|[jennyhalasz, pro...|[jennyhalasz, pro...|
|[starecrows, the,...|[starecrows, open...|[starecrows, open...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [6]:
# Convert the Spark DataFrame to a pandas DataFrame.
# NOTE(review): this rebinds `df`, which earlier held the Spark tweet frame.
df = cleaned_df.toPandas()
df.head(5)

Unnamed: 0,words,filtered,cleaned
0,"[listening, on, port, 9999]","[listening, port, 9999]","[listening, port, 9999]"
1,"[received, request, from, 127, 0, 0, 1, 46120]","[received, request, 127, 0, 0, 1, 46120]","[received, request, 46120]"
2,"[if, ashley, purdy, left, bvb, how, am, i, gon...","[ashley, purdy, left, bvb, gonna, look, naked,...","[ashley, purdy, left, gonna, look, naked, wome..."
3,"[jennyhalasz, i, probably, would, have, gone, ...","[jennyhalasz, probably, gone, guitar, stuff, h...","[jennyhalasz, probably, gone, guitar, stuff, h..."
4,"[starecrows, the, opening, with, the, guitar, ...","[starecrows, opening, guitar, part, loops, end]","[starecrows, opening, guitar, part, loops]"


In [7]:
import pandas as pd
from textblob import TextBlob


def _label_for(polarity):
    """Return 'pos' for a polarity above zero, otherwise 'neg'."""
    if polarity > 0:
        return 'pos'
    return 'neg'


# Score each cleaned token list with TextBlob. Rows whose polarity or
# subjectivity is exactly 0 are skipped.
ta = []
for tokens in df.cleaned:
    text = ' '.join(tokens)
    polarity, subjectivity = TextBlob(text).sentiment
    if polarity == 0 or subjectivity == 0:
        continue
    ta.append((polarity, subjectivity, text))

sentiment_df = pd.DataFrame(ta, columns=('polarity', 'subjectivity', 'tweet'))
sentiment_df['label'] = sentiment_df['polarity'].apply(_label_for)

# Class balance first, then a preview of the labelled frame.
print(sentiment_df['label'].value_counts())
sentiment_df.head()

pos    36
neg    22
Name: label, dtype: int64


Unnamed: 0,polarity,subjectivity,tweet,label
0,-0.003409,0.35,ashley purdy left gonna look naked women bass ...,neg
1,-0.15,0.5,jamesbut_ bass guitar world smallest violin,neg
2,-0.15,0.4,merle travis dark dungeon solo guitar 1951 cou...,neg
3,-0.3,0.4,sportsourcea slmandel least rutgers rock guitar,neg
4,-0.15,0.5,jtsom guitar drums bass piano violin voice man...,neg


In [8]:
# split data
import math
import random

# Tunable split parameters. A fixed seed makes the split (and therefore the
# accuracy reported further down) reproducible across kernel restarts.
TRAIN_FRACTION = .7
SPLIT_SEED = 42

# (tweet, label) tuples: every column except the two numeric scores.
classifier_tuples = list(
    sentiment_df.drop(['polarity', 'subjectivity'], axis=1)
    .itertuples(index=False, name=None)
)

# Use a private RNG so seeding here does not disturb the global `random` state.
split_rng = random.Random(SPLIT_SEED)
train = split_rng.sample(
    classifier_tuples,
    math.ceil(len(classifier_tuples) * TRAIN_FRACTION),
)

# Everything not drawn for training becomes the test set. Membership is by
# value, so a tuple that also appears in `train` is excluded from `test` even
# if it occurs more than once in the data. A set gives O(1) lookups instead of
# scanning the training list for every row.
train_lookup = set(train)
test = [t for t in classifier_tuples if t not in train_lookup]

In [9]:
from textblob.classifiers import NaiveBayesClassifier

# Fit on the training tuples, then score against the held-out tuples.
cl = NaiveBayesClassifier(train)
accuracy = cl.accuracy(test)
print("Classifier Accuracy: {:.2f}".format(accuracy))

Classifier Accuracy: 0.69


In [10]:
# Print the 10 features the trained classifier ranks as most informative.
cl.show_informative_features(10)

Most Informative Features
      contains(electric) = True              neg : pos    =      1.5 : 1.0
          contains(solo) = True              neg : pos    =      1.5 : 1.0
          contains(want) = True              neg : pos    =      1.5 : 1.0
         contains(music) = True              neg : pos    =      1.5 : 1.0
          contains(like) = True              neg : pos    =      1.5 : 1.0
    contains(relaxation) = True              neg : pos    =      1.5 : 1.0
  contains(zakkwyldebls) = False             pos : neg    =      1.3 : 1.0
          contains(song) = False             pos : neg    =      1.3 : 1.0
       contains(reveals) = False             pos : neg    =      1.3 : 1.0
   contains(guitarworld) = False             pos : neg    =      1.3 : 1.0
