In [5]:
# Presumably locates the local Spark installation so that `pyspark` can be
# imported by the later cells — TODO confirm against the findspark docs.
import findspark
findspark.init()

In [6]:
import time

In [7]:
import re

# Define a function to clean the words
def clean_word(word):
    """Return ``word`` reduced to its ASCII letters, or ``''`` for links.

    A token that starts with ``http``, ``https`` or ``www`` is treated as a
    link and dropped entirely (an empty string is returned). For any other
    token, every character outside ``a-z`` / ``A-Z`` is removed; the case of
    the remaining letters is left unchanged.
    """
    link_prefixes = ('http', 'https', 'www')
    if not word.startswith(link_prefixes):
        # Strip digits, punctuation and any other non-letter character.
        return re.sub(r'[^a-zA-Z]', '', word)
    return ''

import os

# perf_counter is a monotonic, high-resolution clock, which makes it a better
# fit for measuring elapsed time than the wall-clock time.time().
start_time = time.perf_counter()

# Read the local input file, count the occurrences of each cleaned word and
# write one "<word> <count>" line per distinct word to output/wordCounts.txt.
# NOTE(review): the input path is absolute and machine-specific; adjust it
# when running on another machine.
try:
    word_count = {}
    with open("C:/Users/hanhn/Downloads/619251.txt", "r", encoding="utf-8") as file:
        # Iterate over the file object so lines are streamed instead of the
        # whole file being loaded into a list first.
        for line in file:
            for word in line.split():
                cleaned_word = clean_word(word).lower()  # Clean and convert to lowercase
                if cleaned_word:  # Skip tokens that were links or had no letters
                    word_count[cleaned_word] = word_count.get(cleaned_word, 0) + 1

    # Create the output directory if needed so the write below does not fail
    # on a fresh checkout.
    os.makedirs("output", exist_ok=True)
    with open("output/wordCounts.txt", "w", encoding="utf-8") as output:
        # dicts preserve insertion order, so words are written in the order
        # in which they were first seen.
        for word, count in word_count.items():
            output.write(word + " " + str(count) + "\n")

    print("Word count done!")
except UnicodeDecodeError as e:
    print(f"Error reading file: {e}")
except Exception as e:
    # Best-effort reporting: any other failure is printed and the cell still
    # reports the elapsed time below.
    print(f"An error occurred: {e}")

# End timing
end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")


Word count done!
Elapsed time: 0.19713258743286133 seconds


In [8]:
from pyspark import SparkContext, SparkConf
import time
import re

# Set up Spark context.
# getOrCreate returns the already-running context when one exists instead of
# raising, so this cell can be re-run even if an earlier cell failed before
# reaching its sc.stop().
# NOTE(review): when an existing context is reused, its own configuration is
# presumably kept and `conf` is not applied — confirm if the app name matters.
conf = SparkConf().setAppName("GroupByKeyBenchmark").setMaster("local")
sc = SparkContext.getOrCreate(conf=conf)

# Read the file
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
file_path = "C:/Users/hanhn/Downloads/619251.txt"
text_file = sc.textFile(file_path)

# Define a function to filter out unwanted words
def is_valid_word(word):
    """Return True when ``word`` should be kept for counting.

    ``re.match`` only tests from the start of the string, so a token is
    rejected when it begins with a digit, begins with a character that is
    neither a word character nor whitespace, or begins with ``http`` or
    ``www``. Characters later in the token are not inspected.
    """
    rejected = re.match(r'^[0-9]|[^\w\s]|http|www', word)
    return rejected is None

import shutil

# saveAsTextFile refuses to write into an existing directory, so remove the
# output of a previous run first. This happens before the timer starts so it
# does not affect the measurement.
# NOTE(review): assumes the relative path resolves on the local filesystem
# against the notebook's working directory — verify if a different default
# filesystem is configured.
output_path = "output/groupByKey"
shutil.rmtree(output_path, ignore_errors=True)

try:
    # Start timing (perf_counter is monotonic and high-resolution)
    start_time = time.perf_counter()

    # Count the words using groupByKey with filtering:
    # split lines into tokens, keep the valid ones, pair each with 1,
    # group the 1s per word and sum each group.
    word_pairs = (
        text_file.flatMap(lambda line: line.split())
        .filter(is_valid_word)
        .map(lambda word: (word, 1))
    )
    grouped_words = word_pairs.groupByKey()
    word_counts = grouped_words.map(lambda kv: (kv[0], sum(kv[1])))

    # Collect the result to trigger the computation
    word_counts.collect()

    # Save the result
    word_counts.saveAsTextFile(output_path)

    # End timing
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
finally:
    # Stop the Spark context even when a step above fails; a context left
    # running would make the next SparkContext(...) call raise.
    sc.stop()

print(f"GroupByKey elapsed time: {elapsed_time} seconds")


GroupByKey elapsed time: 3.3725528717041016 seconds


In [9]:
from pyspark import SparkContext, SparkConf
import time
import re

# Set up Spark context.
# getOrCreate returns the already-running context when one exists instead of
# raising, so this cell can be re-run even if an earlier cell failed before
# reaching its sc.stop().
# NOTE(review): when an existing context is reused, its own configuration is
# presumably kept and `conf` is not applied — confirm if the app name matters.
conf = SparkConf().setAppName("ReduceByKeyBenchmark").setMaster("local")
sc = SparkContext.getOrCreate(conf=conf)

# Read the file
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
file_path = "C:/Users/hanhn/Downloads/619251.txt"
text_file = sc.textFile(file_path)

# Define a function to filter out unwanted words
def is_valid_word(word):
    """Return True when ``word`` should be kept for counting.

    ``re.match`` only tests from the start of the string, so a token is
    rejected when it begins with a digit, begins with a character that is
    neither a word character nor whitespace, or begins with ``http`` or
    ``www``. Characters later in the token are not inspected.
    """
    rejected = re.match(r'^[0-9]|[^\w\s]|http|www', word)
    return rejected is None

import shutil

# saveAsTextFile refuses to write into an existing directory, so remove the
# output of a previous run first. This happens before the timer starts so it
# does not affect the measurement.
# NOTE(review): assumes the relative path resolves on the local filesystem
# against the notebook's working directory — verify if a different default
# filesystem is configured.
output_path = "output/reduceByKey"
shutil.rmtree(output_path, ignore_errors=True)

try:
    # Start timing (perf_counter is monotonic and high-resolution)
    start_time = time.perf_counter()

    # Count the words using reduceByKey with filtering:
    # split lines into tokens, keep the valid ones, pair each with 1,
    # then add up the 1s per word.
    word_pairs = (
        text_file.flatMap(lambda line: line.split())
        .filter(is_valid_word)
        .map(lambda word: (word, 1))
    )
    word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

    # Collect the result to trigger the computation
    word_counts.collect()

    # Save the result
    word_counts.saveAsTextFile(output_path)

    # End timing
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
finally:
    # Stop the Spark context even when a step above fails; a context left
    # running would make the next SparkContext(...) call raise.
    sc.stop()

print(f"ReduceByKey elapsed time: {elapsed_time} seconds")


ReduceByKey elapsed time: 2.458261251449585 seconds
