## NLP Tools
shoutout to https://www.pieriandata.com/

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF, CountVectorizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
import string

sc = SparkContext()
spark = SparkSession.builder.appName("SQLBasics").getOrCreate()

In [2]:
lines=sc.textFile("data.txt")

In [3]:
df = (lines.flatMap(lambda line: line.split('b\''))
      .filter(lambda line: '\\' not in line)
      .filter(lambda line: line != '')
      .map(lambda line: (line, )).toDF(['tweet']))

df.createOrReplaceTempView('tweets')
spark.sql( 'show tables from default' ).show()
tweets = spark.sql('select * from tweets')
tweets.show(5)

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   tweets|       true|
+--------+---------+-----------+

+--------------------+
|               tweet|
+--------------------+
|Listening on port...|
|Received request ...|
|If Ashley Purdy l...|
|@jennyhalasz I pr...|
|@Starecrows the o...|
+--------------------+
only showing top 5 rows



In [4]:
regexTokenizer = RegexTokenizer(inputCol="tweet", outputCol="words", pattern="\\W")

countTokens = udf(lambda words: len(words), IntegerType())

regexTokenized = regexTokenizer.transform(tweets)
regexTokenized.select("tweet", "words").withColumn("tokens", countTokens(col("words"))).show(n=5, truncate=True)

+--------------------+--------------------+------+
|               tweet|               words|tokens|
+--------------------+--------------------+------+
|Listening on port...|[listening, on, p...|     4|
|Received request ...|[received, reques...|     8|
|If Ashley Purdy l...|[if, ashley, purd...|    22|
|@jennyhalasz I pr...|[jennyhalasz, i, ...|    16|
|@Starecrows the o...|[starecrows, the,...|    14|
+--------------------+--------------------+------+
only showing top 5 rows



In [5]:
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
stop_words_removed = remover.transform(regexTokenized.select(['words']))
stop_words_removed.show(5)

+--------------------+--------------------+
|               words|            filtered|
+--------------------+--------------------+
|[listening, on, p...|[listening, port,...|
|[received, reques...|[received, reques...|
|[if, ashley, purd...|[ashley, purdy, l...|
|[jennyhalasz, i, ...|[jennyhalasz, pro...|
|[starecrows, the,...|[starecrows, open...|
+--------------------+--------------------+
only showing top 5 rows



In [6]:
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(stop_words_removed)

idf = IDF(inputCol="rawFeatures", outputCol="idf")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("filtered", "idf").show(5)

+--------------------+--------------------+
|            filtered|                 idf|
+--------------------+--------------------+
|[listening, port,...|(20,[4,14,16],[1....|
|[received, reques...|(20,[1,4,12,13,15...|
|[ashley, purdy, l...|(20,[2,3,6,10,12,...|
|[jennyhalasz, pro...|(20,[1,3,5,11,16,...|
|[starecrows, open...|(20,[0,3,4,6,10],...|
+--------------------+--------------------+
only showing top 5 rows



In [7]:
cv = CountVectorizer(inputCol="filtered", outputCol="countvectorizer", vocabSize=3, minDF=2.0)

model = cv.fit(rescaledData)

result = model.transform(rescaledData)
result.select("filtered", "idf", "countvectorizer").show(5)

+--------------------+--------------------+--------------------+
|            filtered|                 idf|     countvectorizer|
+--------------------+--------------------+--------------------+
|[listening, port,...|(20,[4,14,16],[1....|           (3,[],[])|
|[received, reques...|(20,[1,4,12,13,15...|           (3,[],[])|
|[ashley, purdy, l...|(20,[2,3,6,10,12,...|       (3,[0],[1.0])|
|[jennyhalasz, pro...|(20,[1,3,5,11,16,...|(3,[0,1,2],[1.0,1...|
|[starecrows, open...|(20,[0,3,4,6,10],...|       (3,[0],[1.0])|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [8]:
pandas_df = result.toPandas()
pandas_df.head()

Unnamed: 0,words,filtered,rawFeatures,idf,countvectorizer
0,"[listening, on, port, 9999]","[listening, port, 9999]","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 1.2472533126843983, 0.0, ...","(0.0, 0.0, 0.0)"
1,"[received, request, from, 127, 0, 0, 1, 46120]","[received, request, 127, 0, 0, 1, 46120]","(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.2095129847015513, 0.0, 0.0, 1.24725331...","(0.0, 0.0, 0.0)"
2,"[if, ashley, purdy, left, bvb, how, am, i, gon...","[ashley, purdy, left, bvb, gonna, look, naked,...","(0.0, 0.0, 2.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, ...","(0.0, 0.0, 3.019235154303779, 0.17301848581856...","(1.0, 0.0, 0.0)"
3,"[jennyhalasz, i, probably, would, have, gone, ...","[jennyhalasz, probably, gone, guitar, stuff, h...","(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.2095129847015513, 0.0, 0.0865092429092...","(1.0, 1.0, 1.0)"
4,"[starecrows, the, opening, with, the, guitar, ...","[starecrows, opening, guitar, part, loops, end]","(2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","(2.65459204071587, 0.0, 0.0, 0.086509242909282...","(1.0, 0.0, 0.0)"


In [9]:
import pandas as pd
from textblob import TextBlob

ta = []
for line in pandas_df.filtered.items():
    blob = TextBlob(' '.join(line[1]))
    if blob.sentiment.polarity != 0 and blob.sentiment.subjectivity !=0:
#         print(blob.sentiment.polarity, blob.sentiment.subjectivity, ' '.join(line[1]))
        ta.append((blob.sentiment.polarity, blob.sentiment.subjectivity, ' '.join(line[1])))
 
sentiment_df = pd.DataFrame(ta, columns=('polarity', 'subjectivity', 'tweet'))
sentiment_df['label'] = sentiment_df.polarity.apply(lambda x: 'pos' if x > 0 else 'neg')
print(sentiment_df['label'].value_counts())
print(sentiment_df.head())

pos    44
neg    22
Name: label, dtype: int64
   polarity  subjectivity                                              tweet  \
0 -0.003409          0.35  ashley purdy left bvb gonna look naked women b...   
1 -0.150000          0.50      b jamesbut_ bass guitar world smallest violin   
2 -0.150000          0.40  merle travis dark dungeon solo guitar 1951 via...   
3  0.250000          0.55  sportsourcea slmandel least rutgers rock guita...   
4 -0.500000          1.00     need electric guitar stop making sad ass music   

  label  
0   neg  
1   neg  
2   neg  
3   pos  
4   neg  


In [10]:
# split data
import math
import random

classifier_tuples = list(sentiment_df.drop(['polarity', 'subjectivity'], axis=1).itertuples(index=False, name=None))
train=random.sample(classifier_tuples, math.ceil(len(classifier_tuples) * .8))
test=[t for t in classifier_tuples if t not in train]

In [11]:
from textblob.classifiers import NaiveBayesClassifier
cl = NaiveBayesClassifier(train)
print("Classifier Accuracy: {:.2f}".format(cl.accuracy(test)))

Classifier Accuracy: 0.50


In [12]:
cl.show_informative_features(10)

Most Informative Features
          contains(want) = True              neg : pos    =      2.4 : 1.0
           contains(one) = True              neg : pos    =      2.4 : 1.0
          contains(like) = True              neg : pos    =      2.4 : 1.0
         contains(music) = True              neg : pos    =      2.4 : 1.0
        contains(guitar) = False             neg : pos    =      2.4 : 1.0
    contains(relaxation) = True              neg : pos    =      2.4 : 1.0
           contains(via) = True              pos : neg    =      1.5 : 1.0
          contains(look) = True              neg : pos    =      1.5 : 1.0
         contains(never) = False             pos : neg    =      1.4 : 1.0
    contains(zvtfa5sutm) = False             pos : neg    =      1.4 : 1.0
