## Section A - Working with the RDD API

In [6]:
from pyspark.sql import SparkSession
from operator import add
import re

# Build (or reuse) the Spark session for this notebook, connecting to the
# standalone cluster master and enabling dynamic executor allocation.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.250:7077")
    .appName("Koushik_A3")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Handle to the low-level RDD API used by the rest of this notebook.
spark_context = spark_session.sparkContext

# Restrict Spark's logging to ERROR level.
spark_context.setLogLevel("ERROR")

## Question A1

#### Question A.1.1 Read the English transcripts with Spark, and count the number of lines.

In [7]:
# Number of lines in the English transcript.
# textFile() produces one RDD element per line of the file, so count() is the line count.
english_transcript = spark_context.textFile("hdfs://192.168.2.250:9000/europarl/europarl-v7.nl-en.en")
print(english_transcript.count())



1997775


                                                                                

#### Question A.1.2 Do the same with the other language

In [8]:
# Number of lines in the Dutch transcript.
# textFile() produces one RDD element per line of the file, so count() is the line count.
netherlands_transcript = spark_context.textFile("hdfs://192.168.2.250:9000/europarl/europarl-v7.nl-en.nl")
print(netherlands_transcript.count())



1997775


                                                                                

#### Question A.1.3 Verify that the line counts are the same for the two languages.

In [9]:
# Check that both transcripts contain the same number of lines.
# Every count() call launches a full Spark job over the input file, so each
# count is computed exactly once and reused for the comparison and the message.
english_line_count = english_transcript.count()
netherlands_line_count = netherlands_transcript.count()

if english_line_count == netherlands_line_count:
    print(f'Line counts are same: {english_line_count}')
else:
    print('not same')



Line counts are same: 1997775


                                                                                

#### Question A.1.4 Count the number of partitions

In [4]:
# Number of partitions of the English transcript RDD.
print(english_transcript.getNumPartitions())

3


In [5]:
# Number of partitions of the Dutch transcript RDD.
print(netherlands_transcript.getNumPartitions())

3


## Question A2

#### Question A.2.1 Pre-process the text from both RDDs by doing the following:
##### 1) Lowercase the text
##### 2) Tokenize the text


In [10]:
# Pre-processing helper: lowercase a line of text and split it into word tokens.
def lower_tokenize(line):
    """Return the lowercase word tokens of ``line``.

    Every character that is not a word character (letter, digit or
    underscore) is replaced by a space, the result is lowercased, and the
    text is split on whitespace. An empty or punctuation-only line yields
    an empty list.
    """
    without_punctuation = re.sub(r'[^\w]', " ", line)
    lowered = without_punctuation.lower()
    return lowered.split()
    

In [11]:

# Tokenize every English line; map() keeps one (possibly empty) token list per input line.
english_transcript_preprocessed = english_transcript.map(lower_tokenize)

In [12]:
# Tokenize every Dutch line; map() keeps one (possibly empty) token list per input line.
netherlands_transcript_preprocessed = netherlands_transcript.map(lower_tokenize)

#### Question A.2.2 Inspect 10 entries from each of your RDDs to verify your pre-processing.

In [108]:
# Inspect the first 10 tokenized English lines to verify the pre-processing.
for i in english_transcript_preprocessed.take(10):
    print(i)

[Stage 67:>                                                         (0 + 1) / 1]

['resumption', 'of', 'the', 'session']
['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period']
['although', 'as', 'you', 'will', 'have', 'seen', 'the', 'dreaded', 'millennium', 'bug', 'failed', 'to', 'materialise', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful']
['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days', 'during', 'this', 'part', 'session']
['in', 'the', 'meantime', 'i', 'should', 'like', 'to', 'observe', 'a', 'minute', 's', 'silence', 'as', 'a', 'number', 'of', 'members', 'have', 'requested', 'on', 'behalf', 'of', 'all', 'the', 'vi

                                                                                

In [109]:
# Inspect the first 10 tokenized Dutch lines to verify the pre-processing.
for i in netherlands_transcript_preprocessed.take(10):
    print(i)

[Stage 68:>                                                         (0 + 1) / 1]

['hervatting', 'van', 'de', 'zitting']
['ik', 'verklaar', 'de', 'zitting', 'van', 'het', 'europees', 'parlement', 'die', 'op', 'vrijdag', '17', 'december', 'werd', 'onderbroken', 'te', 'zijn', 'hervat', 'ik', 'wens', 'u', 'allen', 'een', 'gelukkig', 'nieuwjaar', 'en', 'hoop', 'dat', 'u', 'een', 'goede', 'vakantie', 'heeft', 'gehad']
['zoals', 'u', 'heeft', 'kunnen', 'constateren', 'is', 'de', 'grote', 'millenniumbug', 'uitgebleven', 'de', 'burgers', 'van', 'een', 'aantal', 'van', 'onze', 'lidstaten', 'zijn', 'daarentegen', 'door', 'verschrikkelijke', 'natuurrampen', 'getroffen']
['u', 'heeft', 'aangegeven', 'dat', 'u', 'deze', 'vergaderperiode', 'een', 'debat', 'wilt', 'over', 'deze', 'rampen']
['nu', 'wil', 'ik', 'graag', 'op', 'verzoek', 'van', 'een', 'aantal', 'collega', 's', 'een', 'minuut', 'stilte', 'in', 'acht', 'nemen', 'ter', 'nagedachtenis', 'van', 'de', 'slachtoffers', 'ik', 'doel', 'hiermee', 'met', 'name', 'op', 'de', 'slachtoffers', 'van', 'het', 'noodweer', 'dat', 'versc

                                                                                

#### Question A.2.3 Verify that the line counts still match after the pre-processing.

In [110]:
# Verify that the number of lines still matches after pre-processing.
# Every count() call launches a full Spark job (re-reading and re-tokenizing the
# file), so each count is computed exactly once and reused below.
english_preprocessed_count = english_transcript_preprocessed.count()
netherlands_preprocessed_count = netherlands_transcript_preprocessed.count()

if english_preprocessed_count == netherlands_preprocessed_count:
    print(f'Line counts are same: {english_preprocessed_count}')
else:
    print('not same')



Line counts are same: 1997775


                                                                                

## Question A.3

#### Question A.3.1 Use Spark to compute the 10 most frequently occurring words in the English language corpus. Repeat for the other language.

In [27]:
# Helper functions used by the map/reduce word-count pipelines.
def flat_list(l):
    """Identity function: return ``l`` unchanged.

    Passed to ``flatMap`` so that an RDD of lists is flattened into an RDD
    of the lists' individual elements.
    """
    return l
def map_key(word):
    """Pair ``word`` with an initial count of 1, ready for ``reduceByKey``."""
    initial_count = 1
    return word, initial_count

def sort_key(key):
    """Return the negated count of a ``(item, count)`` record.

    Sorting ascending on this value orders records from the highest count
    to the lowest.
    """
    count = key[1]
    return -count
    

In [112]:
# Word count for English: flatten token lists into words, emit (word, 1), then sum the ones per word.
mapped_english_transcript = english_transcript_preprocessed.flatMap(flat_list).map(map_key).reduceByKey(add)


In [113]:
# Word count for Dutch: flatten token lists into words, emit (word, 1), then sum the ones per word.
mapped_netherlands_transcript = netherlands_transcript_preprocessed.flatMap(flat_list).map(map_key).reduceByKey(add)

In [114]:
# Top 10 most frequently occurring words in the English transcript.
# sort_key returns the negated count, so takeOrdered yields the highest counts first.
print('top 10 most frequently occured word in English transcript')
for i,j in mapped_english_transcript.takeOrdered(10, sort_key):
    print(f'{i}: {j}')

top 10 most frequently occured word in English transcript




the: 3799961
of: 1799784
to: 1673342
and: 1426903
in: 1189698
that: 909311
a: 841063
is: 837897
for: 580560
we: 573706


                                                                                

In [115]:
# Top 10 most frequently occurring words in the Dutch transcript.
# sort_key returns the negated count, so takeOrdered yields the highest counts first.
print('top 10 most frequently occured word in Dutch transcript')
for i,j in mapped_netherlands_transcript.takeOrdered(10, sort_key):
    print(f'{i}: {j}')

top 10 most frequently occured word in Dutch transcript




de: 3862249
van: 2053837
het: 1726514
en: 1370030
in: 1003820
dat: 992734
een: 992302
te: 727012
is: 701511
voor: 595325


                                                                                

#### Question A.3.2 Verify that your results are reasonable.

I downloaded the English and Dutch transcripts from http://www.statmt.org/europarl/ and used the `grep` command to find the 10 most frequently occurring words in each language. Almost the same words came out as the most frequent ones, although the counts differed between the output of the `grep` command and the map-reduce result.

## Question A.4

#### A.4.1 Use this parallel corpus to mine some translations in the form of word pairs, for the two languages. Do this by pairing words found on short lines with the same number of words respectively. We (incorrectly) assume the words stay in the same order when translated.

#### Key the lines by their line number

In [13]:
# Attach a 0-based line index to every English token list: records become (tokens, index).
en_1 = english_transcript_preprocessed.zipWithIndex()
print(en_1.take(5))

                                                                                

[(['resumption', 'of', 'the', 'session'], 0), (['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period'], 1), (['although', 'as', 'you', 'will', 'have', 'seen', 'the', 'dreaded', 'millennium', 'bug', 'failed', 'to', 'materialise', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful'], 2), (['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days', 'during', 'this', 'part', 'session'], 3), (['in', 'the', 'meantime', 'i', 'should', 'like', 'to', 'observe', 'a', 'minute', 's', 'silence', 'as', 'a', 'number', 'of', 'members', 'have', 'requested', 'on', 'behalf

                                                                                

In [14]:


# Attach a 0-based line index to every Dutch token list: records become (tokens, index).
nl_1 = netherlands_transcript_preprocessed.zipWithIndex()
print(nl_1.take(5))

[Stage 8:>                                                          (0 + 1) / 1]

[(['hervatting', 'van', 'de', 'zitting'], 0), (['ik', 'verklaar', 'de', 'zitting', 'van', 'het', 'europees', 'parlement', 'die', 'op', 'vrijdag', '17', 'december', 'werd', 'onderbroken', 'te', 'zijn', 'hervat', 'ik', 'wens', 'u', 'allen', 'een', 'gelukkig', 'nieuwjaar', 'en', 'hoop', 'dat', 'u', 'een', 'goede', 'vakantie', 'heeft', 'gehad'], 1), (['zoals', 'u', 'heeft', 'kunnen', 'constateren', 'is', 'de', 'grote', 'millenniumbug', 'uitgebleven', 'de', 'burgers', 'van', 'een', 'aantal', 'van', 'onze', 'lidstaten', 'zijn', 'daarentegen', 'door', 'verschrikkelijke', 'natuurrampen', 'getroffen'], 2), (['u', 'heeft', 'aangegeven', 'dat', 'u', 'deze', 'vergaderperiode', 'een', 'debat', 'wilt', 'over', 'deze', 'rampen'], 3), (['nu', 'wil', 'ik', 'graag', 'op', 'verzoek', 'van', 'een', 'aantal', 'collega', 's', 'een', 'minuut', 'stilte', 'in', 'acht', 'nemen', 'ter', 'nagedachtenis', 'van', 'de', 'slachtoffers', 'ik', 'doel', 'hiermee', 'met', 'name', 'op', 'de', 'slachtoffers', 'van', 'het',

                                                                                

#### Swap the key and value - so that the line number is the key

In [15]:
def swap_element(rdd):
    """Return a new RDD in which the first two fields of each record are swapped.

    A record ``(value, key)`` becomes ``(key, value)``; it is used here to turn
    ``(tokens, line_number)`` into ``(line_number, tokens)``.
    """
    def _swapped(record):
        return (record[1], record[0])

    return rdd.map(_swapped)


# Re-key both corpora by line number: (tokens, index) -> (index, tokens).
en_2 = swap_element(en_1)
nl_2 = swap_element(nl_1)

In [16]:
# Inspect the first five (line_number, english_tokens) records.
print(en_2.take(5))

[(0, ['resumption', 'of', 'the', 'session']), (1, ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period']), (2, ['although', 'as', 'you', 'will', 'have', 'seen', 'the', 'dreaded', 'millennium', 'bug', 'failed', 'to', 'materialise', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful']), (3, ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days', 'during', 'this', 'part', 'session']), (4, ['in', 'the', 'meantime', 'i', 'should', 'like', 'to', 'observe', 'a', 'minute', 's', 'silence', 'as', 'a', 'number', 'of', 'members', 'have', 'requested', 'on', 'beh

                                                                                

In [17]:
# Inspect the first five (line_number, dutch_tokens) records.
print(nl_2.take(5))

[(0, ['hervatting', 'van', 'de', 'zitting']), (1, ['ik', 'verklaar', 'de', 'zitting', 'van', 'het', 'europees', 'parlement', 'die', 'op', 'vrijdag', '17', 'december', 'werd', 'onderbroken', 'te', 'zijn', 'hervat', 'ik', 'wens', 'u', 'allen', 'een', 'gelukkig', 'nieuwjaar', 'en', 'hoop', 'dat', 'u', 'een', 'goede', 'vakantie', 'heeft', 'gehad']), (2, ['zoals', 'u', 'heeft', 'kunnen', 'constateren', 'is', 'de', 'grote', 'millenniumbug', 'uitgebleven', 'de', 'burgers', 'van', 'een', 'aantal', 'van', 'onze', 'lidstaten', 'zijn', 'daarentegen', 'door', 'verschrikkelijke', 'natuurrampen', 'getroffen']), (3, ['u', 'heeft', 'aangegeven', 'dat', 'u', 'deze', 'vergaderperiode', 'een', 'debat', 'wilt', 'over', 'deze', 'rampen']), (4, ['nu', 'wil', 'ik', 'graag', 'op', 'verzoek', 'van', 'een', 'aantal', 'collega', 's', 'een', 'minuut', 'stilte', 'in', 'acht', 'nemen', 'ter', 'nagedachtenis', 'van', 'de', 'slachtoffers', 'ik', 'doel', 'hiermee', 'met', 'name', 'op', 'de', 'slachtoffers', 'van', 'he

#### Join the two RDDs together according to the line number key, so you have pairs of matching lines.

In [18]:
# Join on the line-number key: records become (line_number, (english_tokens, dutch_tokens)).
# The joined records are not returned in line-number order (see the keys printed below).
en_nl_join = en_2.join(nl_2)
print(en_nl_join.take(5))

[Stage 12:>                                                         (0 + 1) / 1]

[(933654, (['we', 'must', 'switch', 'over', 'to', 'the', 'digital', 'age', 'for', 'which', 'we', 'must', 'implement', 'e', 'inclusion', 'in', 'all', 'areas', 'and', 'for', 'everyone'], ['we', 'moeten', 'omschakelen', 'naar', 'het', 'digitale', 'tijdperk', 'om', 'dat', 'te', 'bereiken', 'moeten', 'we', 'zorgen', 'dat', 'alle', 'gebieden', 'en', 'alle', 'mensen', 'worden', 'opgenomen', 'in', 'de', 'e', 'samenleving'])), (1010370, (['travel', 'money', 'however', 'is', 'not', 'the', 'true', 'essence', 'of', 'economic', 'and', 'monetary', 'union', 'it', 'is', 'the', 'common', 'monetary', 'policy'], ['reisgeld', 'is', 'echter', 'niet', 'de', 'ware', 'essentie', 'van', 'de', 'economische', 'en', 'monetaire', 'unie', 'dat', 'is', 'het', 'gemeenschappelijk', 'monetair', 'beleid'])), (1025256, (['there', 'again', 'we', 'have', 'already', 'proposed', 'pilot', 'countries', 'involving', 'the', 'largest', 'number', 'of', 'member', 'states'], ['ook', 'op', 'dat', 'gebied', 'hebben', 'we', 'al', 'pilo

                                                                                

#### Filter to exclude line pairs that have an empty/missing “corresponding” sentence.

In [22]:
# Keep a line pair only when both the English and the Dutch token list are non-empty.
# Each record has the form (line_number, (english_tokens, dutch_tokens)).
en_nl_filter = en_nl_join.filter(
    lambda record: record[1][0] != [] and record[1][1] != []
)

print(en_nl_filter.take(5))

[Stage 20:>                                                         (0 + 1) / 1]

[(816642, (['barriers', 'also', 'exist', 'that', 'limit', 'mortgage', 'lenders', 'incentives', 'to', 'operate', 'cross', 'border', 'thus', 'preventing', 'new', 'and', 'innovative', 'products', 'and', 'processes', 'from', 'being', 'introduced', 'in', 'other', 'markets', 'across', 'europe'], ['er', 'bestaan', 'ook', 'belemmeringen', 'die', 'verhinderen', 'dat', 'kredietverstrekkers', 'gestimuleerd', 'worden', 'om', 'grensoverschrijdend', 'actief', 'te', 'zijn', 'daardoor', 'wordt', 'voorkomen', 'dat', 'nieuwe', 'en', 'innovatieve', 'producten', 'en', 'processen', 'op', 'andere', 'markten', 'in', 'europa', 'worden', 'geïntroduceerd'])), (817248, (['as', 'mrs', 'grossetête', 'has', 'indicated', 'the', 'construction', 'industry', 'also', 'creates', 'economic', 'activity', 'as', 'you', 'well', 'know'], ['zoals', 'mevrouw', 'grossetête', 'heeft', 'aangegeven', 'en', 'zoals', 'u', 'wel', 'weet', 'ontplooit', 'ook', 'de', 'bouwsector', 'economische', 'activiteiten'])), (818412, (['there', 'is',

                                                                                

#### Filter to leave only pairs of sentences with a small number of words per sentence; this should give a more reliable translation.

In [30]:
# Maximum number of words a sentence may have to be kept. Short sentences are
# more likely to translate word-for-word, which the pairing below relies on.
MAX_WORDS_PER_SENTENCE = 8

# Keep a line pair only when both sentences are at most MAX_WORDS_PER_SENTENCE words long.
# Each record has the form (line_number, (english_tokens, dutch_tokens)).
en_nl_len_filter = en_nl_filter.filter(
    lambda x: len(x[1][0]) <= MAX_WORDS_PER_SENTENCE
    and len(x[1][1]) <= MAX_WORDS_PER_SENTENCE
)

print(en_nl_len_filter.take(5))

[Stage 34:>                                                         (0 + 1) / 1]

[(982224, (['closing', 'of', 'the', 'session'], ['sluiting', 'van', 'de', 'zitting'])), (1363620, (['children', 'are', 'precious', 'they', 'must', 'be', 'protected'], ['kinderen', 'zijn', 'kwetsbaar', 'en', 'moeten', 'worden', 'beschermd'])), (1504842, (['i', 'find', 'that', 'regrettable'], ['ik', 'vind', 'dit', 'betreurenswaardig'])), (1611762, (['labelling', 'is', 'consumer', 'information', 'and', 'should', 'be', 'compulsory'], ['etikettering', 'is', 'consumentenvoorlichting', 'en', 'dient', 'verplicht', 'te', 'worden'])), (840936, (['mr', 'francis', 'wurtz', '48', 'votes'], ['de', 'heer', 'francis', 'wurtz', '48', 'stemmen']))]


                                                                                

#### Filter to leave only pairs of sentences with the same number of words in each sentence.

In [31]:
# Keep a line pair only when both sentences contain the same number of words,
# so that the words can be paired position by position.
en_nl_equal_len_filter = en_nl_len_filter.filter(
    lambda record: len(record[1][0]) == len(record[1][1])
)

print(en_nl_equal_len_filter.take(5))

[Stage 36:>                                                         (0 + 1) / 1]

[(81054, (['question', 'no', '46', 'by', 'h', '0600', '00'], ['vraag', 'nr', '46', 'van', 'h', '0600', '00'])), (246546, (['as', 'europeans', 'how', 'can', 'we', 'find', 'another', 'way'], ['welke', 'ander', 'antwoord', 'weten', 'wij', 'europeanen', 'te', 'geven'])), (276834, (['applause'], ['applaus'])), (279144, (['the', 'debate', 'is', 'closed'], ['het', 'debat', 'is', 'gesloten'])), (354714, (['applause'], ['applaus']))]


                                                                                

#### For each sentence pair, map so that you pair each (in order) word in the two sentences. We no longer need the line numbers.

In [32]:
# Pair the words of each sentence pair position by position and drop the line number:
# (line_number, (english_tokens, dutch_tokens)) -> [(english_word, dutch_word), ...].
en_nl_map = en_nl_equal_len_filter.map(
    lambda record: list(zip(record[1][0], record[1][1]))
)

print(en_nl_map.take(5))

[Stage 38:>                                                         (0 + 1) / 1]

[[('the', 'de'), ('minutes', 'notulen'), ('were', 'worden'), ('approved', 'goedgekeurd')], [('we', 'wij'), ('must', 'moeten'), ('wait', 'wachten'), ('until', 'tot'), ('2005', '2005')], [('the', 'het'), ('debate', 'debat'), ('is', 'is'), ('closed', 'gesloten')], [('welcome', 'welkomstwoord')], [('that', 'dat'), ('is', 'is'), ('the', 'de'), ('main', 'belangrijkste'), ('argument', 'reden')]]


                                                                                

#### Use reduce to count the number of occurrences of the word-translation-pairs.

In [33]:
# Count word-translation pairs: flatten the per-sentence pair lists, emit
# ((english_word, dutch_word), 1), then sum the ones per pair.
en_nl_map_reduce = en_nl_map.flatMap(flat_list).map(map_key).reduceByKey(add)
print(en_nl_map_reduce.take(10))



[(('in', 'in'), 1117), (('diversity', 'verscheidenheid'), 5), (('for', 'dank'), 7), (('it', 'het'), 1156), (('intervention', 'interventie'), 2), (('any', 'enkel'), 1), (('to', 'tegen'), 36), (('by', 'van'), 743), (('h', 'h'), 554), (('algeria', 'algerije'), 19)]


                                                                                

#### Print some of the most frequently occurring pairs of words.

In [34]:
# Print the 10 most frequent (english_word, dutch_word) pairs with their counts.
# sort_key returns the negated count, so takeOrdered yields the highest counts first.
for i,j in en_nl_map_reduce.takeOrdered(10,sort_key):
    print(f'{i[0]}:{i[1]} => {j}')

is:is => 12297
the:de => 8135
the:het => 6959
debate:debat => 5045
closed:gesloten => 4765
applause:applaus => 3714
that:dat => 3240
of:van => 2954
i:ik => 2837
a:een => 2719


### Do your translations seem reasonable? Use a dictionary to check a few.

I checked a few pairs with Google Translate, and the Dutch words matched the corresponding English words.

In [35]:
# Shut down the SparkContext now that the analysis is finished.
spark_context.stop()