In [2]:
# Answer 1:
# Birthdate: 1 Dec 1997
import re
import collections

def mapper(line):
  # It'll split the line into words & emits a key-value pair for each word, where the key is the word and the value is 1
  words = re.split('\W*\s+\W*', line)
  for word in words:
    yield word, 1

def reducer(key, values):
  # It'll count the number of times the key (word) appears in the values and emits a key-value pair with the total count
  count = sum(values)
  yield key, count

def main():
  # Reading the input file nad applying the mapper and reducer functions, printing the results

  # Reading the input file
  with open('file.txt', 'r') as f:
    lines = f.readlines()

  # Dictionary for storing the word counts
  word_counts = collections.defaultdict(int)

  # Mapper function for each line
  for line in lines:
    for key, value in mapper(line):
      word_counts[key] += value

  # Printing results
  for key, value in word_counts.items():
    print(f'{key}: {value}')

if __name__ == '__main__':
  main()


THE: 1
OTHER: 1
MINISTER: 1
It: 10
was: 40
nearing: 1
midnight: 1
and: 59
the: 159
Prime: 51
Minister: 47
sitting: 2
alone: 4
in: 34
his: 58
office: 6
reading: 2
a: 80
long: 7
memo: 2
that: 47
slipping: 1
through: 1
brain: 1
without: 2
leaving: 2
slightest: 1
trace: 1
of: 69
meaning: 1
behind: 4
He: 21
waiting: 2
for: 15
call: 3
from: 12
President: 3
far: 2
distant: 1
country: 3
between: 1
wondering: 1
when: 6
wretched: 1
man: 8
would: 6
telephone: 3
trying: 3
to: 71
suppress: 1
unpleasant: 1
memories: 1
what: 5
had: 70
been: 14
very: 14
tiring: 1
difficult: 2
week: 6
there: 5
not: 17
much: 5
space: 1
head: 3
anything: 2
else: 2
The: 14
more: 6
he: 35
attempted: 1
focus: 1
on: 16
print: 1
page: 2
before: 5
him: 13
clearly: 1
could: 5
see: 7
gloating: 1
face: 4
one: 8
political: 1
opponents: 1
This: 1
particular: 1
opponent: 2
appeared: 3
news: 3
day: 4
only: 3
enumerate: 1
all: 9
terrible: 1
things: 2
happened: 2
last: 4
as: 16
though: 4
anyone: 2
needed: 1
reminding: 1
but: 8
also: 1


In [4]:
import string
from itertools import groupby

# Tokenizing lines
def tokenize(line):
    return line.strip().split()

# Function for checking if a word is English
def is_english(word, english_words):
    cleaned_word = ''.join(e for e in word if e.isalnum()).lower()
    return cleaned_word not in english_words

# Function to perform the Map step
def mapper(lines, english_words):
    word_counts = {}
    for line in lines:
        words = tokenize(line)
        for word in words:
            # Removing punctuation
            cleaned_word = ''.join(e for e in word if e.isalnum())
            if is_english(cleaned_word, english_words):
                yield (cleaned_word, 1)

# Function to perform the Reduce step
def reducer(word_counts):
    word_count = {}
    for word, count in word_counts:
        if word in word_count:
            word_count[word] += count
        else:
            word_count[word] = count
    return word_count

# Reading English words
with open('english3.txt', 'r') as english_words_file:
    english_words = set(english_words_file.read().splitlines())

# Reading the input file
with open('file2.txt', 'r') as input_file:
    input_lines = input_file.readlines()

# Performing Map
mapped_data = list(mapper(input_lines, english_words))

# Sorting and grouping the mapped data by key (word)
mapped_data.sort()
word_groups = [(word, [count for _, count in group]) for word, group in groupby(mapped_data, key=lambda x: x[0])]

# Performing the Reduce
word_counts = reducer(word_groups)

# Non-English words with their counts
for word, count in word_counts.items():
    print(f'{word}: {count}')


: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
ArryWith: [1]
Auror: [1]
Azkaban: [1]
Crookshanks: [1]
CurseBreaker: [1]
Delacour: [1]
Diagon: [1, 1]
Dont: [1]
DumbledoreIt: [1]
Eet: [1]
Georgeve: [1]
Gringotts: [1]
HarryLooking: [1]
HarryWhats: [1]
Harrys: [1, 1, 1, 1]
Headof: [1]
Hedwig: [1]
Hegroped: [1]
HermioneShe: [1]
Howre: [1]
ICome: [1]
Im: [1, 1]
Imean: [1, 1, 1, 1]
Ive: [1, 1, 1]
JK: [1, 1, 1]
Lestrange: [1]
MrsWeasley: [1, 1]
Muggles: [1]
PhlegmStop: [1]
PukingPastille: [1]
Ronrolling: [1]
Rons: [1, 1]
RowlingBed: [1]
RowlingWere: [1]
Rowlingnight: [1]
Shenever: [1]
Siriuss: [1]
SlughornOh: [1]
Slytherin: [1]
Thats: [1, 1, 1, 1]
Theyre: [1]
Theyve: [1]
Triwizard: [1]
Umbridge: [1, 1, 1]
Voldemort: [1]
Weasley: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
WeasleyHarry: [1]
YouKnowWho: [1]
YouNot: [1]
Youve: [1]
aboutMrs: [1]
againBill: [1]
againOh: [1]
allknow: [1]
amusedYeah: [1]
andGeorges