## CountVectorize

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.ml.feature import Tokenizer, RegexTokenizer, CountVectorizer, StopWordsRemover
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
import string

sc = SparkContext()
spark = SparkSession.builder.appName('CountVectorizer').getOrCreate()

In [2]:
lines=sc.textFile('data.txt')
df = (lines.flatMap(lambda line: line.split('b\''))
      .filter(lambda line: '\\' not in line)
      .filter(lambda line: line != '')
      .map(lambda line: (line, )).toDF(['tweet']))

df.createOrReplaceTempView('tweets')
tweets = spark.sql('select * from tweets')
tweets.show(n=5, truncate=False)

+----------------------------------------------------------------------------------------------------+
|tweet                                                                                               |
+----------------------------------------------------------------------------------------------------+
|Listening on port: 9999                                                                             |
|Received request from: ('127.0.0.1', 46120)                                                         |
|If Ashley Purdy left BVB, how am I gonna look at naked women on a bass guitar when I see them live?'|
|@jennyhalasz I probably would have gone with this. Or more guitar stuff. https://t.co/Iuao7j0Dro'   |
|@Starecrows the opening with the guitar part, and it just loops with no end'                        |
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows



In [3]:
regexTokenizer = RegexTokenizer(inputCol='tweet', outputCol='words', pattern='\\W')
regexTokenized = regexTokenizer.transform(tweets)
regexTokenized.show(5)

+--------------------+--------------------+
|               tweet|               words|
+--------------------+--------------------+
|Listening on port...|[listening, on, p...|
|Received request ...|[received, reques...|
|If Ashley Purdy l...|[if, ashley, purd...|
|@jennyhalasz I pr...|[jennyhalasz, i, ...|
|@Starecrows the o...|[starecrows, the,...|
+--------------------+--------------------+
only showing top 5 rows



In [4]:
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
stop_words_removed = remover.transform(regexTokenized.select(['words']))
stop_words_removed.show(5)

+--------------------+--------------------+
|               words|            filtered|
+--------------------+--------------------+
|[listening, on, p...|[listening, port,...|
|[received, reques...|[received, reques...|
|[if, ashley, purd...|[ashley, purdy, l...|
|[jennyhalasz, i, ...|[jennyhalasz, pro...|
|[starecrows, the,...|[starecrows, open...|
+--------------------+--------------------+
only showing top 5 rows



In [5]:
from pyspark.sql import functions as f
cleaned_df = stop_words_removed.withColumn('cleaned', f.expr('filter(filtered, x -> not(length(x) < 3))'))
cleaned_df.show(5)

+--------------------+--------------------+--------------------+
|               words|            filtered|             cleaned|
+--------------------+--------------------+--------------------+
|[listening, on, p...|[listening, port,...|[listening, port,...|
|[received, reques...|[received, reques...|[received, reques...|
|[if, ashley, purd...|[ashley, purdy, l...|[ashley, purdy, l...|
|[jennyhalasz, i, ...|[jennyhalasz, pro...|[jennyhalasz, pro...|
|[starecrows, the,...|[starecrows, open...|[starecrows, open...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [6]:
from pyspark.sql.functions import explode, count

sum_of_words = cleaned_df.withColumn('cleaned', explode(col('cleaned'))).groupBy('cleaned').agg(count('*'))
sum_of_words.show(5)

+------------+--------+
|     cleaned|count(1)|
+------------+--------+
|       still|       5|
|trustinjonas|       2|
|    received|       1|
|        bone|       1|
|    keyboard|       1|
+------------+--------+
only showing top 5 rows



In [7]:
num_features = sum_of_words.groupBy().sum().collect()[0][0]
print(num_features)

1357


In [8]:
cv = CountVectorizer(inputCol='cleaned', outputCol='vectors', vocabSize=num_features, minDF=2.0)

model = cv.fit(cleaned_df)

cnt_vect = model.transform(cleaned_df)
cnt_vect.select('cleaned', 'vectors').show(n=10, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------
 cleaned | [listening, port, 9999]                                                                  
 vectors | (166,[],[])                                                                              
-RECORD 1-------------------------------------------------------------------------------------------
 cleaned | [received, request, 127, 46120]                                                          
 vectors | (166,[],[])                                                                              
-RECORD 2-------------------------------------------------------------------------------------------
 cleaned | [ashley, purdy, left, bvb, gonna, look, naked, women, bass, guitar, see, live]           
 vectors | (166,[0,24,44,53],[1.0,1.0,1.0,1.0])                                                     
-RECORD 3----------------------------------------------------------------------------------

In [9]:
import pandas as pd

terms_df = pd.DataFrame(model.vocabulary, columns=['word'])
terms_df.head()

Unnamed: 0,word
0,guitar
1,https
2,play
3,via
4,one
