In [1]:
import re

# **Task1-a**
*  In this step, we read the Text-date.txt file line by line. In each line, we changed the uppercase letters to lowercase letters, removed the punctuation marks, and split it into words. Thus, we made it ready for MapReduce.

In [2]:
def text_clean_and_tokenizate(text):

    text = text.lower()

    text = re.sub(r"\b[0-9]+\b\s*", " ", text)

    text = re.sub(r'\s+', ' ', text).strip()

    return text.split(' ')


In [3]:
file_path = 'Text-data.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    paragraphs = file.readlines()

In [28]:
for i, line in enumerate(paragraphs):
    tokens = text_clean_and_tokenizate(line)
    if tokens and i < 10:
        print(f"Line {i+1} tokens: {tokens}")

Line 1 tokens: ['the', 'first', 'secretary', 'of', 'agriculture', 'was', 'norman', 'j.', 'colman,', 'who', 'was', 'appointed', 'by', 'cleveland']
Line 2 tokens: ['the', 'first', 'secretary', 'of', 'energy', 'was', 'james', 'r.', 'schlesinger', 'who', 'was', 'appointed', 'by', 'carter']
Line 3 tokens: ['the', 'first', 'secretary', 'of', 'health', 'and', 'human', 'services', 'was', 'patricia', 'roberts', 'harris', 'who', 'was', 'appointed', 'by']
Line 4 tokens: ['the', 'first', 'secretary', 'of', 'housing', 'and', 'urban', 'development', 'was', 'robert', 'c.', 'weaver', 'who', 'was', 'appointed', 'by']
Line 5 tokens: ['the', 'first', 'secretary', 'of', 'state', 'was', 'thomas', 'jefferson']
Line 6 tokens: ['the', 'first', 'baseman', 'made', 'putouts']
Line 7 tokens: ['the', 'first', 'task', 'is', 'the', 'assignment', 'of', 'an', 'address', 'to', 'each', 'datum']
Line 8 tokens: ['the', 'first', 'intifada', 'ended', 'when', 'israel', 'granted', 'limited', 'autonomy', 'to', 'the', 'palestin

# **Task1-b**
*  In this stage, each paragraph was cleaned and separated into words. For each word, a pair of the form (word, 1) was generated.

In [7]:
def map_phase(paragraphs):
    mapped = []
    for paragraph in paragraphs:
        tokens = text_clean_and_tokenizate(paragraph)
        for token in tokens:
            if token:  # skip empty words
                mapped.append((token, 1))
    return mapped

In [18]:
mapped_data = map_phase(paragraphs)

mapped_data[:100]

[('the', 1),
 ('first', 1),
 ('secretary', 1),
 ('of', 1),
 ('agriculture', 1),
 ('was', 1),
 ('norman', 1),
 ('j.', 1),
 ('colman,', 1),
 ('who', 1),
 ('was', 1),
 ('appointed', 1),
 ('by', 1),
 ('cleveland', 1),
 ('the', 1),
 ('first', 1),
 ('secretary', 1),
 ('of', 1),
 ('energy', 1),
 ('was', 1),
 ('james', 1),
 ('r.', 1),
 ('schlesinger', 1),
 ('who', 1),
 ('was', 1),
 ('appointed', 1),
 ('by', 1),
 ('carter', 1),
 ('the', 1),
 ('first', 1),
 ('secretary', 1),
 ('of', 1),
 ('health', 1),
 ('and', 1),
 ('human', 1),
 ('services', 1),
 ('was', 1),
 ('patricia', 1),
 ('roberts', 1),
 ('harris', 1),
 ('who', 1),
 ('was', 1),
 ('appointed', 1),
 ('by', 1),
 ('the', 1),
 ('first', 1),
 ('secretary', 1),
 ('of', 1),
 ('housing', 1),
 ('and', 1),
 ('urban', 1),
 ('development', 1),
 ('was', 1),
 ('robert', 1),
 ('c.', 1),
 ('weaver', 1),
 ('who', 1),
 ('was', 1),
 ('appointed', 1),
 ('by', 1),
 ('the', 1),
 ('first', 1),
 ('secretary', 1),
 ('of', 1),
 ('state', 1),
 ('was', 1),
 ('thomas

# **Task1-c**
*  In the shuffle phase, the (word, 1) pairs coming from the map phase were grouped based on words. The 1 values ​​belonging to these words were added to a list to show the number of times each word occurred

In [19]:
from collections import defaultdict

def shuffle_phase(mapped_data):
    shuffled = defaultdict(list)
    for word, count in mapped_data:
        shuffled[word].append(count)
    return dict(shuffled)

In [26]:
shuffled_data = shuffle_phase(mapped_data)
print(shuffled_data)



# **Task1-d**

In [24]:
def reduce_phase(shuffled_data):

    reduced = {word: sum(counts) for word, counts in shuffled_data.items()}

    return reduced

In [25]:
reduced_data = reduce_phase(shuffled_data)

top_10 = sorted(reduced_data.items(), key=lambda item: item[1], reverse=True)[:10]

print("the top 10 final word frequencies")
for word, freq in top_10:
    print(f"{word}: {freq}")


the top 10 final word frequencies
is: 144252
the: 77872
a: 76969
of: 70819
to: 43002
or: 35375
in: 31373
and: 26973
an: 14541
that: 13807


# **Task1-e**

 **Fine granularity in MapReduce refers to dividing a large task into many small subtasks (such as per word, per line, per file block).
This is important because:**

*  It allows parallelism, making the process faster on distributed systems.

*  It increases fault tolerance, because failure of one small task does not affect the whole process.

*  It improves load balancing, as work is more evenly distributed among worker nodes.

Source
*  https://dylancastillo.co/posts/nlp-snippets-clean-and-tokenize-text-with-python.html
*  https://alifanibigdata.wordpress.com/map-reduce-using-python/
