## RegEx

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Reuse an already-running context when this cell is executed again: a bare
# SparkContext() raises ValueError if a context is already active in the
# process, which makes the notebook cell non-re-runnable.
sc = SparkContext.getOrCreate()
# The session attaches to the context obtained above.
spark = SparkSession.builder.appName('RegEx').getOrCreate()

In [None]:
# Load the input file as an RDD of text lines.
lines = sc.textFile('data.txt')

In [None]:
import re

# A countable word starts with an ASCII letter, has a second character that
# is not one of - @ ' \, is at least three characters long, and does not
# begin with "http" (URLs).
# The previous pattern ended in the character class [^http]+, which rejects
# the single letters h/t/p in third position instead of the "http" prefix,
# so ordinary words such as "with" or "other" were silently dropped.
word_pattern = re.compile(r"(?!http)[a-zA-Z][^-@'\\].")

# Classic word count: split on single spaces, keep countable words,
# lower-case them and sum the occurrences per word.
m = (
    lines.flatMap(lambda line: line.split(' '))
    .filter(lambda word: word_pattern.match(word) is not None)
    .map(lambda word: (word.lower(), 1))
    .reduceByKey(lambda x, y: x + y)
)

# Expose the counts to Spark SQL and show the 20 most frequent words.
df = spark.createDataFrame(m, ['word', 'count'])
df.createOrReplaceTempView('words')
words = spark.sql('select word, count from words order by count desc limit 20')
words.show()

In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover

# Split every input line on the literal b' marker and treat each piece as one
# tweet; pieces containing a backslash and empty pieces are discarded.
# NOTE(review): presumably data.txt holds printed bytes literals (b'...'),
# which would explain the b' separator -- confirm against the data file.
df = (
    lines.flatMap(lambda line: line.split("b'"))
    .filter(lambda line: '\\' not in line)
    .filter(lambda line: line != '')
    .map(lambda line: (line, ))
    .toDF(['tweet'])
)

# Register the tweets as a temp view and read them back through Spark SQL.
df.createOrReplaceTempView('tweets')
tweets = spark.sql('select * from tweets')

# Default stop-word list, extended with the URL scheme tokens.
stopwords = StopWordsRemover().getStopWords()
add_stopwords = ['http', 'https']

# The pattern is a raw string: '\@' and '\W' are not valid Python escape
# sequences, so the non-raw literal triggers an invalid-escape warning
# (SyntaxWarning on Python 3.12+). The regex text handed to Spark is
# unchanged. Tokens shorter than three characters are dropped.
regexTokenizer = RegexTokenizer(
    inputCol='tweet',
    outputCol='words',
    pattern=r'([0-9\@\W])',
).setMinTokenLength(3)
regexTokenized = regexTokenizer.transform(tweets)

# Remove stop words from the token lists and display the cleaned tokens.
remover = StopWordsRemover(inputCol='words', outputCol='cleaned').setStopWords(
    stopwords + add_stopwords
)
filtered_df = remover.transform(regexTokenized.select(['words']))
filtered_df.select('cleaned').show(truncate=False)

In [None]:
from pyspark.sql.functions import explode, count, col, desc

# Flatten the token arrays to one row per token, then count the rows
# belonging to each distinct token.
sum_of_words = (
    filtered_df
    .withColumn('cleaned', explode(col('cleaned')))
    .groupBy('cleaned')
    .agg(count('*'))
)

# The aggregate column is addressed by its generated name 'count(1)';
# show the most frequent tokens first.
sum_of_words.orderBy(desc('count(1)')).show()