## NLP Tools
shoutout to https://www.pieriandata.com/

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF, CountVectorizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
import string

sc = SparkContext()
spark = SparkSession.builder.appName("SQLBasics").getOrCreate()

In [2]:
lines=sc.textFile("data.txt")

In [3]:
df = (lines.flatMap(lambda line: line.split('b\''))
      .filter(lambda line: '\\' not in line)
      .filter(lambda line: line != '')
      .map(lambda line: (line, )).toDF(['tweet']))

df.createOrReplaceTempView('tweets')
spark.sql( 'show tables from default' ).show()
tweets = spark.sql('select * from tweets')
tweets.show(5)

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   tweets|       true|
+--------+---------+-----------+

+--------------------+
|               tweet|
+--------------------+
|Listening on port...|
|Received request ...|
|If Ashley Purdy l...|
|@jennyhalasz I pr...|
|@Starecrows the o...|
+--------------------+
only showing top 5 rows



In [5]:
regexTokenizer = RegexTokenizer(inputCol="tweet", outputCol="words", pattern="\\W")

countTokens = udf(lambda words: len(words), IntegerType())

regexTokenized = regexTokenizer.transform(tweets)
regexTokenized.select("tweet", "words").withColumn("tokens", countTokens(col("words"))).show(n=5, truncate=True)

+--------------------+--------------------+------+
|               tweet|               words|tokens|
+--------------------+--------------------+------+
|Listening on port...|[listening, on, p...|     4|
|Received request ...|[received, reques...|     8|
|If Ashley Purdy l...|[if, ashley, purd...|    22|
|@jennyhalasz I pr...|[jennyhalasz, i, ...|    16|
|@Starecrows the o...|[starecrows, the,...|    14|
+--------------------+--------------------+------+
only showing top 5 rows



In [6]:
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
stop_words_removed = remover.transform(regexTokenized.select(['words']))
stop_words_removed.show(5)

+--------------------+--------------------+
|               words|            filtered|
+--------------------+--------------------+
|[listening, on, p...|[listening, port,...|
|[received, reques...|[received, reques...|
|[if, ashley, purd...|[ashley, purdy, l...|
|[jennyhalasz, i, ...|[jennyhalasz, pro...|
|[starecrows, the,...|[starecrows, open...|
+--------------------+--------------------+
only showing top 5 rows



In [7]:
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(stop_words_removed)

idf = IDF(inputCol="rawFeatures", outputCol="idf")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("filtered", "idf").show(5)

+--------------------+--------------------+
|            filtered|                 idf|
+--------------------+--------------------+
|[listening, port,...|(20,[4,14,16],[1....|
|[received, reques...|(20,[1,4,12,13,15...|
|[ashley, purdy, l...|(20,[2,3,6,10,12,...|
|[jennyhalasz, pro...|(20,[1,3,5,11,16,...|
|[starecrows, open...|(20,[0,3,4,6,10],...|
+--------------------+--------------------+
only showing top 5 rows



In [8]:
cv = CountVectorizer(inputCol="filtered", outputCol="countvectorizer", vocabSize=3, minDF=2.0)

model = cv.fit(rescaledData)

result = model.transform(rescaledData)
result.select("filtered", "idf", "countvectorizer").show(5)

+--------------------+--------------------+--------------------+
|            filtered|                 idf|     countvectorizer|
+--------------------+--------------------+--------------------+
|[listening, port,...|(20,[4,14,16],[1....|           (3,[],[])|
|[received, reques...|(20,[1,4,12,13,15...|           (3,[],[])|
|[ashley, purdy, l...|(20,[2,3,6,10,12,...|       (3,[0],[1.0])|
|[jennyhalasz, pro...|(20,[1,3,5,11,16,...|(3,[0,1,2],[1.0,1...|
|[starecrows, open...|(20,[0,3,4,6,10],...|       (3,[0],[1.0])|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [9]:
pandas_df = result.toPandas()
pandas_df.head()

Unnamed: 0,words,filtered,rawFeatures,idf,countvectorizer
0,"[listening, on, port, 9999]","[listening, port, 9999]","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 1.2472533126843983, 0.0, ...","(0.0, 0.0, 0.0)"
1,"[received, request, from, 127, 0, 0, 1, 46120]","[received, request, 127, 0, 0, 1, 46120]","(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.2095129847015513, 0.0, 0.0, 1.24725331...","(0.0, 0.0, 0.0)"
2,"[if, ashley, purdy, left, bvb, how, am, i, gon...","[ashley, purdy, left, bvb, gonna, look, naked,...","(0.0, 0.0, 2.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, ...","(0.0, 0.0, 3.019235154303779, 0.17301848581856...","(1.0, 0.0, 0.0)"
3,"[jennyhalasz, i, probably, would, have, gone, ...","[jennyhalasz, probably, gone, guitar, stuff, h...","(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.2095129847015513, 0.0, 0.0865092429092...","(1.0, 1.0, 1.0)"
4,"[starecrows, the, opening, with, the, guitar, ...","[starecrows, opening, guitar, part, loops, end]","(2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","(2.65459204071587, 0.0, 0.0, 0.086509242909282...","(1.0, 0.0, 0.0)"


In [16]:
from textblob import TextBlob

for line in pandas_df.filtered.items():
    blob = TextBlob(' '.join(line[1]))
    print(blob.sentiment.polarity, blob.sentiment.subjectivity, ' '.join(line[1]))

0.0 0.0 listening port 9999
0.0 0.0 received request 127 0 0 1 46120
-0.003409090909090917 0.35 ashley purdy left bvb gonna look naked women bass guitar see live
0.0 0.0 jennyhalasz probably gone guitar stuff https co iuao7j0dro
0.0 0.0 starecrows opening guitar part loops end
-0.15000000000000002 0.5 b jamesbut_ bass guitar world smallest violin
-0.15 0.4 merle travis dark dungeon solo guitar 1951 via r countrymusic https co kqoqnumt5g
0.0 0.0 zager guitar giveaway https co oa3pp8mjie via zager_guitars
0.0 0.0 wed 11 13 grossmont guitar ensemble https co audy2ffwnv
0.0 0.0 b rt gtraddict impress people play guitar guitarist guitarplayer https co tvpbqrajnm
0.0 0.0 metallica songs basically bunch guitar noise dude growling microphone change mind
0.0 0.0 motleycrue corymarksmusic ffdp mick shhhhiiitttt underrated guitar player master slide method
0.0 0.0 spooky guitar riff limbo stuck head since ago remember song
0.25 0.55 sportsourcea slmandel least rutgers rock guitar lol
0.0 0.0 spoo

0.3499999999999999 0.8833333333333333 rt trustinjonas wow look cool guitar nick holding nice https co xjom1f5utd
0.0 0.5 c d open chords never knew existed https co h0fqfmmjte guitar guitarist guitarplayer guitars guitarlessons
0.7 0.6000000000000001 baller85son guitar good jamiepruitt
-0.6 1.0 rt zosowz jimmy awkward always holding guitar bb https co lxkr2x6mtr
0.0 0.0 yesdragonxxx asweeneymusic sucioperro fender fendergbi guitar https co d1wtupoupl
0.0 0.0 strymon bigsky reverb guitar effects pedal demo guitar https co jvdu8hd8xm
0.13636363636363635 0.45454545454545453 b rt espguitarsusa let go jump phase esp 2020 new guitar preview https co xqvpxuzc2o https co iqaww1mlhx
0.3499999999999999 0.8833333333333333 rt trustinjonas wow look cool guitar nick holding nice https co xjom1f5utd
0.0 0.0 thegoldminer81 guitar hero
-0.25 0.625 rt guitarworld zakkwyldebls reveals one song never able nail guitar https co zvtfa5sutm
0.0 0.0 rt sawceofficial frank ocean room guitar cover https co 6vapc

In [None]:
# split data

In [None]:
# from textblob.classifiers import NaiveBayesClassifier
# cl = NaiveBayesClassifier(train)
# cl.accuracy(test) 