# Big Data Assignment 3: Word Count PySpark

In [2]:
from pyspark import SparkConf, SparkContext
import re

def tokenize_and_clean(line):
    """Split one line of text into lowercase words of at least four letters.

    Args:
        line: A single line of text.

    Returns:
        A list of the cleaned words, in the order they appear in ``line``.
    """
    # Rule 1: case-fold first so "The" and "the" count as the same word.
    lowered = line.lower()

    # Rule 3: delete every character that is not an ASCII letter or
    # whitespace. Punctuation and digits are removed without inserting a
    # separator, so "don't" becomes "dont" and "well-known" "wellknown".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)

    # Rules 2 and 4: words are whitespace-delimited runs of letters; the
    # {4,} quantifier discards anything shorter than four characters.
    return re.findall(r'\b[a-zA-Z]{4,}\b', letters_only)

def count_words(file_path, top_n=5):
    """Return the most frequent cleaned words in a text file.

    Args:
        file_path: Path of the text file, handed to ``SparkContext.textFile``.
        top_n: How many (word, count) pairs to return. Defaults to 5.

    Returns:
        A list of at most ``top_n`` ``(word, count)`` tuples, ordered by
        count from highest to lowest.
    """
    conf = SparkConf().setAppName("WordCount")
    sc = SparkContext(conf=conf)

    # Always stop the context, even when a Spark job raises: only one
    # SparkContext may be active at a time, so a leaked one would make the
    # next call to this function fail.
    try:
        book_rdd = sc.textFile(file_path)

        # Tokenize and clean each line and create a flat list of words
        words_rdd = book_rdd.flatMap(tokenize_and_clean)

        # Count occurrences of each word
        # Rule 7: Must use some Python functions for transformations
        word_count_rdd = words_rdd.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

        # Sort words by frequency in descending order and take the top ones
        # Rule 8: Keep transformations as simple as possible and use Python functions
        top_words = word_count_rdd.sortBy(lambda pair: pair[1], ascending=False).take(top_n)

        # Rule 5: Not using collect() for debugging purposes in the submitted solution, single comment line for each transformation
        # Rule 6: Be able to explain transformations (explained in comments inside the code)
        return top_words
    finally:
        sc.stop()

if __name__ == "__main__":
    # Input path: a location on the author's machine; change it to run elsewhere.
    input_path = "/Users/ericluong/Documents/School/Big Data MSIS 2527/Week 5/205-0.txt"
    result = count_words(input_path)

    print("Output: top-5 words:\n")
    # Each result item is a (word, count) pair; no rank index is needed.
    for word, count in result:
        print(f"({word}, {count})")


                                                                                

Output: top-5 words:

(that, 1334)
(with, 921)
(which, 871)
(they, 713)
(have, 674)
