## Importing modules and creating the Spark session

In [36]:
from pyspark.sql import SparkSession
from operator import add
import re

# Build (or reuse) the Spark session for this notebook. Dynamic allocation is
# switched on together with shuffle tracking, while the external shuffle
# service is switched off; idle executors time out after 30s.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.250:7077")
    .appName("Koushik_A3")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# The RDD API used in the rest of the notebook is reached through the
# session's SparkContext.
spark_context = spark_session.sparkContext

# Restrict Spark's log output to errors.
spark_context.setLogLevel("ERROR")

In [2]:
# Read the English (.en) file of the nl-en Europarl corpus from HDFS.
english_transcript = spark_context.textFile(
    "hdfs://192.168.2.250:9000/europarl/europarl-v7.nl-en.en"
)
# Print the number of records (lines) in the English file.
print(english_transcript.count())

[Stage 0:>                                                          (0 + 3) / 3]

1997775


                                                                                

In [3]:
# Read the Dutch (.nl) file of the nl-en Europarl corpus from HDFS.
netherlands_transcript = spark_context.textFile(
    "hdfs://192.168.2.250:9000/europarl/europarl-v7.nl-en.nl"
)
# Print the number of records (lines) in the Dutch file.
print(netherlands_transcript.count())



1997775


                                                                                

In [4]:
# Report how many partitions the English RDD is divided into.
english_partition_count = english_transcript.getNumPartitions()
print(english_partition_count)

3


In [5]:
# Report how many partitions the Dutch RDD is divided into.
netherlands_partition_count = netherlands_transcript.getNumPartitions()
print(netherlands_partition_count)

3


In [105]:
def lower_tokenize(line):
    """Split a line of text into lowercase word tokens.

    Every character that is not a regex word character is replaced by a
    space, the result is lowercased, and the text is split on whitespace.
    Punctuation therefore separates tokens (``"minute's"`` becomes
    ``['minute', 's']``), and a line without word characters yields ``[]``.

    Args:
        line: The text to tokenize.

    Returns:
        A list of lowercase tokens in their original order.
    """
    return re.sub(r'[^\w]', " ", line).lower().split()
    

In [106]:
# Tokenize every English line: each record becomes a list of lowercase word tokens.
english_transcript_preprocessed = english_transcript.map(lower_tokenize)

In [107]:
# Tokenize every Dutch line: each record becomes a list of lowercase word tokens.
netherlands_transcript_preprocessed = netherlands_transcript.map(lower_tokenize)

In [108]:

# Peek at the first ten tokenized English lines, one token list per output line.
for english_tokens in english_transcript_preprocessed.take(10):
    print(english_tokens)

[Stage 67:>                                                         (0 + 1) / 1]

['resumption', 'of', 'the', 'session']
['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period']
['although', 'as', 'you', 'will', 'have', 'seen', 'the', 'dreaded', 'millennium', 'bug', 'failed', 'to', 'materialise', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful']
['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days', 'during', 'this', 'part', 'session']
['in', 'the', 'meantime', 'i', 'should', 'like', 'to', 'observe', 'a', 'minute', 's', 'silence', 'as', 'a', 'number', 'of', 'members', 'have', 'requested', 'on', 'behalf', 'of', 'all', 'the', 'vi

                                                                                

In [109]:
# Peek at the first ten tokenized Dutch lines, one token list per output line.
for dutch_tokens in netherlands_transcript_preprocessed.take(10):
    print(dutch_tokens)

[Stage 68:>                                                         (0 + 1) / 1]

['hervatting', 'van', 'de', 'zitting']
['ik', 'verklaar', 'de', 'zitting', 'van', 'het', 'europees', 'parlement', 'die', 'op', 'vrijdag', '17', 'december', 'werd', 'onderbroken', 'te', 'zijn', 'hervat', 'ik', 'wens', 'u', 'allen', 'een', 'gelukkig', 'nieuwjaar', 'en', 'hoop', 'dat', 'u', 'een', 'goede', 'vakantie', 'heeft', 'gehad']
['zoals', 'u', 'heeft', 'kunnen', 'constateren', 'is', 'de', 'grote', 'millenniumbug', 'uitgebleven', 'de', 'burgers', 'van', 'een', 'aantal', 'van', 'onze', 'lidstaten', 'zijn', 'daarentegen', 'door', 'verschrikkelijke', 'natuurrampen', 'getroffen']
['u', 'heeft', 'aangegeven', 'dat', 'u', 'deze', 'vergaderperiode', 'een', 'debat', 'wilt', 'over', 'deze', 'rampen']
['nu', 'wil', 'ik', 'graag', 'op', 'verzoek', 'van', 'een', 'aantal', 'collega', 's', 'een', 'minuut', 'stilte', 'in', 'acht', 'nemen', 'ter', 'nagedachtenis', 'van', 'de', 'slachtoffers', 'ik', 'doel', 'hiermee', 'met', 'name', 'op', 'de', 'slachtoffers', 'van', 'het', 'noodweer', 'dat', 'versc

                                                                                

In [110]:
# Verify that both tokenized corpora have the same number of lines.
# count() is a Spark action and these RDDs are not cached, so every call
# re-runs the whole job; compute each count once and reuse the values instead
# of counting the English RDD a second time just to print it.
english_line_count = english_transcript_preprocessed.count()
netherlands_line_count = netherlands_transcript_preprocessed.count()

if english_line_count == netherlands_line_count:
    print(f'Line counts are same: {english_line_count}')
else:
    print('not same')



Line counts are same: 1997775


                                                                                

In [111]:
def flat_list(l):
    """Return ``l`` unchanged.

    Used as the function passed to ``flatMap`` in this notebook, so that each
    element of a list record becomes a record of its own.
    """
    return l
def map_key(word):
    """Pair ``word`` with a count of 1, giving a ``(word, 1)`` record to sum by key."""
    keyed = word, 1
    return keyed

def sort_key(key):
    """Return the negated second element of ``key``.

    Used as an ordering key for ``(item, count)`` records: negating the count
    makes an ascending sort list the largest counts first.
    """
    count = key[1]
    return -count
    

In [112]:
# English word count: one record per token, each paired with 1, then the 1s
# are summed per distinct word.
mapped_english_transcript = (
    english_transcript_preprocessed
    .flatMap(flat_list)
    .map(map_key)
    .reduceByKey(add)
)


In [113]:
# Dutch word count: one record per token, each paired with 1, then the 1s
# are summed per distinct word.
mapped_netherlands_transcript = (
    netherlands_transcript_preprocessed
    .flatMap(flat_list)
    .map(map_key)
    .reduceByKey(add)
)

In [114]:
print('top 10 most frequently occured word in English transcript')
# sort_key negates the count, so the ten "smallest" keys are the ten most
# frequent words.
english_top_words = mapped_english_transcript.takeOrdered(10, sort_key)
for i, j in english_top_words:
    print(f'{i}: {j}')

top 10 most frequently occured word in English transcript




the: 3799961
of: 1799784
to: 1673342
and: 1426903
in: 1189698
that: 909311
a: 841063
is: 837897
for: 580560
we: 573706


                                                                                

In [115]:
print('top 10 most frequently occured word in Dutch transcript')
# sort_key negates the count, so the ten "smallest" keys are the ten most
# frequent words.
netherlands_top_words = mapped_netherlands_transcript.takeOrdered(10, sort_key)
for i, j in netherlands_top_words:
    print(f'{i}: {j}')

top 10 most frequently occured word in Dutch transcript




de: 3862249
van: 2053837
het: 1726514
en: 1370030
in: 1003820
dat: 992734
een: 992302
te: 727012
is: 701511
voor: 595325


                                                                                

In [116]:
# Attach a 0-based position to every tokenized English line, giving
# (tokens, index) records.
en_1 = english_transcript_preprocessed.zipWithIndex()
en_1_sample = en_1.take(5)
print(en_1_sample)

                                                                                

[(['resumption', 'of', 'the', 'session'], 0), (['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period'], 1), (['although', 'as', 'you', 'will', 'have', 'seen', 'the', 'dreaded', 'millennium', 'bug', 'failed', 'to', 'materialise', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful'], 2), (['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days', 'during', 'this', 'part', 'session'], 3), (['in', 'the', 'meantime', 'i', 'should', 'like', 'to', 'observe', 'a', 'minute', 's', 'silence', 'as', 'a', 'number', 'of', 'members', 'have', 'requested', 'on', 'behalf

                                                                                

In [117]:


# Attach a 0-based position to every tokenized Dutch line, giving
# (tokens, index) records.
nl_1 = netherlands_transcript_preprocessed.zipWithIndex()
nl_1_sample = nl_1.take(5)
print(nl_1_sample)

                                                                                

[(['hervatting', 'van', 'de', 'zitting'], 0), (['ik', 'verklaar', 'de', 'zitting', 'van', 'het', 'europees', 'parlement', 'die', 'op', 'vrijdag', '17', 'december', 'werd', 'onderbroken', 'te', 'zijn', 'hervat', 'ik', 'wens', 'u', 'allen', 'een', 'gelukkig', 'nieuwjaar', 'en', 'hoop', 'dat', 'u', 'een', 'goede', 'vakantie', 'heeft', 'gehad'], 1), (['zoals', 'u', 'heeft', 'kunnen', 'constateren', 'is', 'de', 'grote', 'millenniumbug', 'uitgebleven', 'de', 'burgers', 'van', 'een', 'aantal', 'van', 'onze', 'lidstaten', 'zijn', 'daarentegen', 'door', 'verschrikkelijke', 'natuurrampen', 'getroffen'], 2), (['u', 'heeft', 'aangegeven', 'dat', 'u', 'deze', 'vergaderperiode', 'een', 'debat', 'wilt', 'over', 'deze', 'rampen'], 3), (['nu', 'wil', 'ik', 'graag', 'op', 'verzoek', 'van', 'een', 'aantal', 'collega', 's', 'een', 'minuut', 'stilte', 'in', 'acht', 'nemen', 'ter', 'nagedachtenis', 'van', 'de', 'slachtoffers', 'ik', 'doel', 'hiermee', 'met', 'name', 'op', 'de', 'slachtoffers', 'van', 'het',

                                                                                

In [118]:
def swap_element(rdd):
    """Return ``rdd`` mapped so that each record ``(a, b)`` becomes ``(b, a)``.

    Args:
        rdd: An object with a ``map`` method whose records are indexable at
            positions 0 and 1.

    Returns:
        Whatever ``rdd.map`` returns for the swapping function.
    """
    def _flip(record):
        # Second element first, first element second.
        return record[1], record[0]

    return rdd.map(_flip)


# Move the line index to the front of each record so it can serve as the key:
# (tokens, index) -> (index, tokens), for English first and then Dutch.
en_2, nl_2 = (swap_element(indexed_rdd) for indexed_rdd in (en_1, nl_1))

In [119]:
# Show the first five (index, tokens) records of the English RDD.
en_2_sample = en_2.take(5)
print(en_2_sample)

[(0, ['resumption', 'of', 'the', 'session']), (1, ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period']), (2, ['although', 'as', 'you', 'will', 'have', 'seen', 'the', 'dreaded', 'millennium', 'bug', 'failed', 'to', 'materialise', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful']), (3, ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days', 'during', 'this', 'part', 'session']), (4, ['in', 'the', 'meantime', 'i', 'should', 'like', 'to', 'observe', 'a', 'minute', 's', 'silence', 'as', 'a', 'number', 'of', 'members', 'have', 'requested', 'on', 'beh

In [120]:
# Show the first five (index, tokens) records of the Dutch RDD.
nl_2_sample = nl_2.take(5)
print(nl_2_sample)

[(0, ['hervatting', 'van', 'de', 'zitting']), (1, ['ik', 'verklaar', 'de', 'zitting', 'van', 'het', 'europees', 'parlement', 'die', 'op', 'vrijdag', '17', 'december', 'werd', 'onderbroken', 'te', 'zijn', 'hervat', 'ik', 'wens', 'u', 'allen', 'een', 'gelukkig', 'nieuwjaar', 'en', 'hoop', 'dat', 'u', 'een', 'goede', 'vakantie', 'heeft', 'gehad']), (2, ['zoals', 'u', 'heeft', 'kunnen', 'constateren', 'is', 'de', 'grote', 'millenniumbug', 'uitgebleven', 'de', 'burgers', 'van', 'een', 'aantal', 'van', 'onze', 'lidstaten', 'zijn', 'daarentegen', 'door', 'verschrikkelijke', 'natuurrampen', 'getroffen']), (3, ['u', 'heeft', 'aangegeven', 'dat', 'u', 'deze', 'vergaderperiode', 'een', 'debat', 'wilt', 'over', 'deze', 'rampen']), (4, ['nu', 'wil', 'ik', 'graag', 'op', 'verzoek', 'van', 'een', 'aantal', 'collega', 's', 'een', 'minuut', 'stilte', 'in', 'acht', 'nemen', 'ter', 'nagedachtenis', 'van', 'de', 'slachtoffers', 'ik', 'doel', 'hiermee', 'met', 'name', 'op', 'de', 'slachtoffers', 'van', 'he

                                                                                

In [121]:
# Join the two corpora on the line index, producing records of the form
# (index, (english_tokens, dutch_tokens)).
en_nl_join = en_2.join(nl_2)
en_nl_join_sample = en_nl_join.take(5)
print(en_nl_join_sample)

[Stage 83:>                                                         (0 + 1) / 1]

[(1069044, (['slavery', 'is', 'still', 'the', 'custom', 'there', 'which', 'means', 'people', 'are', 'deprived', 'of', 'their', 'cultural', 'and', 'religious', 'identity', 'and', 'their', 'personality'], ['slavernij', 'is', 'hier', 'nog', 'gewoon', 'wat', 'betekent', 'dat', 'mensen', 'worden', 'beroofd', 'van', 'hun', 'culturele', 'en', 'religieuze', 'identiteit', 'en', 'van', 'hun', 'persoonlijkheid'])), (1142934, (['i', 'also', 'urge', 'the', 'commission', 'and', 'member', 'states', 'to', 'provide', 'financing', 'for', 'the', 'measures', 'aimed', 'at', 'cutting', 'the', 'pollution', 'produced', 'by', 'coal', 'powered', 'installations'], ['ik', 'verzoek', 'de', 'commissie', 'en', 'de', 'lidstaten', 'bovendien', 'om', 'maatregelen', 'te', 'financieren', 'die', 'de', 'vervuiling', 'beperken', 'die', 'wordt', 'veroorzaakt', 'door', 'koleninstallaties'])), (1160646, (['as', 'mr', 'harbour', 'and', 'the', 'commissioner', 'have', 'pointed', 'out', 'simplification', 'is', 'part', 'of', 'this'

                                                                                

In [122]:
# Drop sentence pairs where either side has no tokens. Each side is a list of
# tokens, not a string, so emptiness has to be tested on the list itself:
# comparing a list with "" is always unequal and would let every pair through.
en_nl_filter = en_nl_join.filter(lambda x: len(x[1][0]) > 0 and len(x[1][1]) > 0)

print(en_nl_filter.take(5))

[Stage 85:>                                                         (0 + 1) / 1]

[(41994, (['mr', 'president', 'i', 'join', 'with', 'previous', 'speakers', 'in', 'demanding', 'the', 'immediate', 'and', 'unconditional', 'abolition', 'of', 'the', 'death', 'penalty', 'in', 'the', 'united', 'states', 'and', 'everywhere', 'else', 'in', 'the', 'world'], ['mijnheer', 'de', 'voorzitter', 'ik', 'steun', 'de', 'voorafgaande', 'sprekers', 'die', 'gepleit', 'hebben', 'voor', 'de', 'onmiddellijke', 'en', 'onvoorwaardelijke', 'afschaffing', 'van', 'de', 'doodstraf', 'in', 'de', 'verenigde', 'staten', 'en', 'de', 'rest', 'van', 'de', 'wereld'])), (180906, (['there', 'are', 'two', 'central', 'principles', 'if', 'the', 'code', 'is', 'to', 'be', 'fully', 'effective'], ['er', 'zijn', 'twee', 'kernprincipes', 'waaraan', 'de', 'code', 'moet', 'voldoen', 'om', 'volkomen', 'effectief', 'te', 'zijn'])), (205716, (['four', 'additional', 'amendments', 'have', 'been', 'tabled', 'including', 'one', 'by', 'my', 'group', 'and', 'three', 'by', 'the', 'greens'], ['er', 'zijn', 'nog', 'vier', 'ame

                                                                                

In [123]:
# Keep only pairs in which neither sentence is longer than 8 tokens, i.e. the
# longer of the two has at most 8 tokens.
en_nl_len_filter = en_nl_filter.filter(
    lambda x: max(len(x[1][0]), len(x[1][1])) <= 8
)

en_nl_len_sample = en_nl_len_filter.take(5)
print(en_nl_len_sample)

[Stage 87:>                                                         (0 + 1) / 1]

[(1175532, (['employing', 'compulsion', 'in', 'this', 'situation', 'is', 'completely', 'unacceptable'], ['dwang', 'uitoefenen', 'is', 'in', 'deze', 'context', 'absoluut', 'onaanvaardbaar'])), (947238, (['the', 'proposal', 'reeks', 'of', 'covert', 'protectionism'], ['dit', 'voorstel', 'riekt', 'naar', 'verkapt', 'protectionisme'])), (1014588, (['petitions', 'see', 'minutes'], ['verzoekschriften', 'zie', 'notulen'])), (1293018, (['so', 'what', 'is', 'the', 'reality'], ['wat', 'betekende', 'dit', 'in', 'de', 'praktijk'])), (1365744, (['written', 'statements', 'rule', '149'], ['schriftelijke', 'verklaringen', 'artikel', '149']))]


                                                                                

In [124]:
def _same_token_count(record):
    """Return True when the English and Dutch token lists have equal length."""
    english_tokens = record[1][0]
    dutch_tokens = record[1][1]
    return len(english_tokens) == len(dutch_tokens)


# Keep only pairs whose two sentences contain the same number of tokens.
en_nl_equal_len_filter = en_nl_len_filter.filter(_same_token_count)

en_nl_equal_len_sample = en_nl_equal_len_filter.take(5)
print(en_nl_equal_len_sample)

[Stage 89:>                                                         (0 + 1) / 1]

[(941406, (['report', 'inés', 'ayala', 'sender'], ['verslag', 'inés', 'ayala', 'sender'])), (956892, (['the', 'debate', 'is', 'closed'], ['het', 'debat', 'is', 'gesloten'])), (1001016, (['6'], ['6'])), (1219230, (['that', 'is', 'my', 'question'], ['dat', 'is', 'mijn', 'vraag'])), (1432200, (['what', 'a', 'difference'], ['wat', 'een', 'verschil']))]


                                                                                

In [125]:
# Pair the words of each sentence pair position by position; the line index
# is dropped, leaving one list of (english_word, dutch_word) tuples per record.
en_nl_map = en_nl_equal_len_filter.map(
    lambda x: [word_pair for word_pair in zip(x[1][0], x[1][1])]
)

en_nl_map_sample = en_nl_map.take(5)
print(en_nl_map_sample)

[Stage 91:>                                                         (0 + 1) / 1]

[[('it', 'het'), ('was', 'was'), ('a', 'een'), ('conference', 'conferentie'), ('on', 'over'), ('water', 'water')], [('the', 'de'), ('gsp', 'sap'), ('agreements', 'overeenkomsten'), ('must', 'moeten'), ('be', 'worden'), ('suspended', 'opgeschort')], [('otherwise', 'anders'), ('we', 'belanden'), ('are', 'we'), ('heading', 'in'), ('towards', 'een'), ('communicative', 'communicatieve'), ('chaos', 'chaos')], [('the', 'het'), ('united', 'verenigd'), ('kingdom', 'koninkrijk'), ('is', 'is'), ('a', 'een'), ('great', 'geweldig'), ('nation', 'land')], [('4', '4')]]


                                                                                

In [126]:
# Count how often each (english_word, dutch_word) pair occurs: one record per
# pair, each keyed with 1, then the 1s are summed per distinct pair.
en_nl_map_reduce = (
    en_nl_map
    .flatMap(flat_list)
    .map(map_key)
    .reduceByKey(add)
)
en_nl_map_reduce_sample = en_nl_map_reduce.take(10)
print(en_nl_map_reduce_sample)



[(('vote', 'daarom'), 4), (('this', 'dit'), 1820), (('the', 'het'), 6959), (('still', 'statuut'), 1), (('waiting', 'wachten'), 5), (('for', 'wij'), 10), (('this', 'nog'), 11), (('the', 'de'), 8135), (('these', 'doelstellingen'), 3), (('objectives', 'volledig'), 2)]


                                                                                

In [127]:
# Fetch the ten word pairs with the highest counts (sort_key negates the
# count), then print each as "english:dutch => count".
top_word_pairs = en_nl_map_reduce.takeOrdered(10, sort_key)
for i, j in top_word_pairs:
    print(f'{i[0]}:{i[1]} => {j}')

is:is => 12297
the:de => 8135
the:het => 6959
debate:debat => 5045
closed:gesloten => 4765
applause:applaus => 3714
that:dat => 3240
of:van => 2954
i:ik => 2837
a:een => 2719


In [128]:
# Shut down the SparkContext now that the notebook's work is complete.
spark_context.stop()