## RegEx

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

sc = SparkContext()
spark = SparkSession.builder.appName('RegEx').getOrCreate()

In [2]:
lines=sc.textFile('data.txt')

In [3]:
import re

m = ( lines.flatMap(lambda line: line.split(' '))
     .filter(lambda word: re.match(r'[a-zA-Z][^-@\'\\][^http]+', word))
     .map(lambda word: (word.lower(), 1))
     .reduceByKey(lambda x, y: x + y))
    
df = spark.createDataFrame(m, ['word', 'count'])
df.createOrReplaceTempView('words')
words = spark.sql('select word, count from words order by count desc limit 20')
words.show()

+-----------+-----+
|       word|count|
+-----------+-----+
|     guitar|  421|
|        the|  261|
|        and|  192|
|       this|  104|
|        you|   96|
|        his|   81|
|        for|   78|
|    playing|   68|
|       play|   57|
|       that|   55|
|        how|   55|
|        her|   55|
|    singing|   48|
|        who|   47|
|       like|   40|
|performance|   38|
|   chanyeol|   38|
|     missed|   38|
|   kyungsoo|   38|
|     tuning|   36|
+-----------+-----+



In [4]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover

df = (lines.flatMap(lambda line: line.split('b\''))
      .filter(lambda line: '\\' not in line)
      .filter(lambda line: line != '')
      .map(lambda line: (line, )).toDF(['tweet']))

df.createOrReplaceTempView('tweets')
tweets = spark.sql('select * from tweets')

stopwords = StopWordsRemover().getStopWords()
add_stopwords = ['http', 'https']
# print(stopwords)
regexTokenizer = RegexTokenizer(inputCol='tweet', outputCol='words', pattern='([0-9\@\W])').setMinTokenLength(3)
regexTokenized = regexTokenizer.transform(tweets)

remover = StopWordsRemover(inputCol='words', outputCol='cleaned').setStopWords(stopwords + add_stopwords)
filtered_df = remover.transform(regexTokenized.select(['words']))
filtered_df.select('cleaned').show(truncate=False)

+---------------------------------------------------------------------------------------------------------+
|cleaned                                                                                                  |
+---------------------------------------------------------------------------------------------------------+
|[listening, port]                                                                                        |
|[received, request]                                                                                      |
|[ashley, purdy, left, bvb, gonna, look, naked, women, bass, guitar, see, live]                           |
|[jennyhalasz, probably, gone, guitar, stuff, iuao, dro]                                                  |
|[starecrows, opening, guitar, part, loops, end]                                                          |
|[jamesbut_, bass, guitar, world, smallest, violin]                                                       |
|[merle, travis, dark, dunge

In [5]:
from pyspark.sql.functions import explode, count, col, desc

sum_of_words = filtered_df.withColumn('cleaned', explode(col('cleaned'))).groupBy('cleaned').agg(count('*'))
sum_of_words.sort(desc('count(1)')).show()

+--------------+--------+
|       cleaned|count(1)|
+--------------+--------+
|        guitar|     168|
|          play|      39|
|           one|      14|
|           via|      14|
|           img|      12|
|          send|      12|
|          plus|      12|
|          song|      11|
|     fvckxford|      11|
|         girls|      10|
|       playing|       8|
|       youtube|       8|
|          hero|       7|
|         never|       7|
|          need|       6|
|           lot|       6|
|personallyrich|       6|
|          bass|       5|
|    undefeated|       5|
|          solo|       5|
+--------------+--------+
only showing top 20 rows



In [6]:
num_features = sum_of_words.groupBy().sum().collect()[0][0]
print(num_features)

1309
