## Sentiment Analysis Alternative

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
import string

# Build (or reuse) the session first and take the context from it.
# Calling SparkContext() directly fails when this cell is re-run, because a
# context already exists, and it also prevents the appName set on the builder
# from reaching the context, since getOrCreate() would just reuse the one
# created beforehand.
spark = SparkSession.builder.appName("SentimentAlternative").getOrCreate()
sc = spark.sparkContext

In [2]:
# Load the raw dump, one RDD element per line of the file.
lines = sc.textFile("data.txt")

# Cut every line on the two-character marker b' (presumably the prefix of
# Python bytes reprs in the dump -- confirm against data.txt), then keep only
# fragments that are non-empty and contain no backslash.
df = (lines.flatMap(lambda line: line.split('b\''))
      .filter(lambda fragment: fragment != '' and '\\' not in fragment)
      .map(lambda fragment: (fragment, ))
      .toDF(['tweet']))

# Register the tweets as a SQL view and read them back through Spark SQL.
df.createOrReplaceTempView('tweets')
tweets = spark.sql('select * from tweets')

# Default stop-word list plus URL scheme fragments left over from links.
stopwords = StopWordsRemover().getStopWords()
add_stopwords = ['http', 'https']

# Raw string: '\@' and '\W' are not valid Python escapes, so a plain literal
# triggers an invalid-escape warning. The pattern text itself is unchanged.
# Tokens shorter than three characters are discarded.
regexTokenizer = RegexTokenizer(
    inputCol='tweet',
    outputCol='words',
    pattern=r'([0-9\@\W])',
).setMinTokenLength(3)
regexTokenized = regexTokenizer.transform(tweets)

# Strip stop words from the token lists; only the 'words' column is carried
# forward, so the result has the columns 'words' and 'cleaned'.
remover = StopWordsRemover(inputCol='words', outputCol='cleaned').setStopWords(stopwords + add_stopwords)
filtered_df = remover.transform(regexTokenized.select(['words']))
filtered_df.select('cleaned').show(truncate=False)

+---------------------------------------------------------------------------------------------------------+
|cleaned                                                                                                  |
+---------------------------------------------------------------------------------------------------------+
|[listening, port]                                                                                        |
|[received, request]                                                                                      |
|[ashley, purdy, left, bvb, gonna, look, naked, women, bass, guitar, see, live]                           |
|[jennyhalasz, probably, gone, guitar, stuff, iuao, dro]                                                  |
|[starecrows, opening, guitar, part, loops, end]                                                          |
|[jamesbut_, bass, guitar, world, smallest, violin]                                                       |
|[merle, travis, dark, dunge

In [3]:
# Pull the tokenised result out of Spark into a pandas DataFrame.
# Note: this rebinds `df`, which previously referred to the Spark DataFrame.
df = filtered_df.toPandas()
# Preview the first five rows ('words' next to 'cleaned').
df.head(n=5)

Unnamed: 0,words,cleaned
0,"[listening, port]","[listening, port]"
1,"[received, request, from]","[received, request]"
2,"[ashley, purdy, left, bvb, how, gonna, look, n...","[ashley, purdy, left, bvb, gonna, look, naked,..."
3,"[jennyhalasz, probably, would, have, gone, wit...","[jennyhalasz, probably, gone, guitar, stuff, i..."
4,"[starecrows, the, opening, with, the, guitar, ...","[starecrows, opening, guitar, part, loops, end]"


In [4]:
import pandas as pd
from textblob import TextBlob

# Score each cleaned tweet with TextBlob. A tweet is kept only when both its
# polarity and its subjectivity are non-zero.
ta = []
for tokens in df.cleaned:
    text = ' '.join(tokens)
    sentiment = TextBlob(text).sentiment
    if sentiment.polarity == 0 or sentiment.subjectivity == 0:
        continue
    ta.append((sentiment.polarity, sentiment.subjectivity, text))

# One row per retained tweet; label is 'pos' for polarity above zero, else 'neg'.
sentiment_df = pd.DataFrame(ta, columns=('polarity', 'subjectivity', 'tweet'))
sentiment_df['label'] = sentiment_df['polarity'].apply(
    lambda polarity: 'pos' if polarity > 0 else 'neg'
)
print(sentiment_df['label'].value_counts())
sentiment_df.head()

pos    44
neg    22
Name: label, dtype: int64


Unnamed: 0,polarity,subjectivity,tweet,label
0,-0.003409,0.35,ashley purdy left bvb gonna look naked women b...,neg
1,-0.15,0.5,jamesbut_ bass guitar world smallest violin,neg
2,-0.15,0.4,merle travis dark dungeon solo guitar via coun...,neg
3,0.25,0.55,sportsourcea slmandel least rutgers rock guita...,pos
4,-0.5,1.0,need electric guitar stop making sad ass music,neg


In [5]:
# split data
import math
import random

# Split the (tweet, label) pairs roughly 70/30 into train and test sets.
SPLIT_SEED = 42       # fixed seed so a fresh kernel reproduces the same split
TRAIN_FRACTION = .7

classifier_tuples = list(sentiment_df[['tweet', 'label']].itertuples(index=False, name=None))

# Shuffle a copy and cut it by position. Selecting the test set with
# `t not in train` compares by value, which is quadratic and silently drops
# every copy of a duplicated tweet from the test set once one copy lands in
# train. A private Random instance keeps the global RNG state untouched.
shuffled = classifier_tuples[:]
random.Random(SPLIT_SEED).shuffle(shuffled)
n_train = math.ceil(len(shuffled) * TRAIN_FRACTION)
train = shuffled[:n_train]
test = shuffled[n_train:]

# Every record must end up in exactly one of the two sets.
assert len(train) + len(test) == len(classifier_tuples)

In [6]:
from textblob.classifiers import NaiveBayesClassifier
# Fit a Naive Bayes classifier on the training pairs, then report its
# accuracy on the held-out test pairs, rounded to two decimals.
cl = NaiveBayesClassifier(train)
test_accuracy = cl.accuracy(test)
print("Classifier Accuracy: {:.2f}".format(test_accuracy))

Classifier Accuracy: 0.71


In [7]:
# Print the ten features the trained classifier ranks as most informative.
cl.show_informative_features(n=10)

Most Informative Features
      contains(electric) = True              neg : pos    =      3.4 : 1.0
           contains(one) = True              neg : pos    =      2.9 : 1.0
          contains(play) = True              pos : neg    =      2.4 : 1.0
          contains(want) = True              neg : pos    =      2.1 : 1.0
          contains(bass) = False             pos : neg    =      1.4 : 1.0
         contains(never) = False             pos : neg    =      1.3 : 1.0
          contains(nail) = False             pos : neg    =      1.3 : 1.0
   contains(guitarworld) = False             pos : neg    =      1.3 : 1.0
          contains(sutm) = False             pos : neg    =      1.3 : 1.0
         contains(zvtfa) = False             pos : neg    =      1.3 : 1.0
