## Application settings

In [1]:
from pyspark.sql import SparkSession


spark_session = SparkSession.builder\
    .master("spark://192.168.2.70:7077") \
    .appName("emiresenov")\
    .config("spark.dynamicAllocation.enabled", True)\
    .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
    .config("spark.shuffle.service.enabled", True)\
    .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
    .config("spark.cores.max", 4)\
    .getOrCreate()


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/16 19:18:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Question A1

In [2]:
# Define Spark context and read files

spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

linesEng = spark_context.textFile('hdfs://192.168.2.70:9000/europarl/europarl-v7.sv-en.en')

linesSV = spark_context.textFile('hdfs://192.168.2.70:9000/europarl/europarl-v7.sv-en.sv')

In [3]:
# Count lines in both languages and control that they are the same

numLinesEng = linesEng.count()

print("Number of lines, English version:", numLinesEng)

numLinesSV = linesSV.count()

print("Number of lines, Swedish version:", numLinesSV)

assert(numLinesEng == numLinesSV)

                                                                                

Number of lines, English version: 1862234




Number of lines, Swedish version: 1862234


                                                                                

In [4]:
# Counting the number of partitions

partitionsEng = linesEng.getNumPartitions()
partitionsSV = linesSV.getNumPartitions()

print("Number of partitions for English RDD: ", partitionsEng)
print("Number of partitions for Swedish RDD: ", partitionsSV)

Number of partitions for English RDD:  2
Number of partitions for Swedish RDD:  3


## Question A2 

In [5]:
# Pre-processing function: lowercases the text and splits on space
def caseAndToken(lines):
    return lines.map(lambda line: line.lower().split(' '))

# Process text from RDDS
ppLinesEng = caseAndToken(linesEng)
ppLinesSV = caseAndToken(linesSV)

print(ppLinesEng.take(10))
print(ppLinesSV.take(10))

# Verify that line counts still match
assert(ppLinesSV.count() == ppLinesEng.count())

                                                                                

[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

                                                                                

[['återupptagande', 'av', 'sessionen'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'], ['som', 'ni', 'kunnat', 'konstatera', 'ägde', '"den', 'stora', 'år', '2000-buggen"', 'aldrig', 'rum.', 'däremot', 'har', 'invånarna', 'i', 'ett', 'antal', 'av', 'våra', 'medlemsländer', 'drabbats', 'av', 'naturkatastrofer', 'som', 'verkligen', 'varit', 'förskräckliga.'], ['ni', 'har', 'begärt', 'en', 'debatt', 'i', 'ämnet', 'under', 'sammanträdesperiodens', 'kommande', 'dagar.'], ['till', 'dess', 'vill', 'jag', 'att', 'vi,', 'som', 'ett', 'antal', 'kolleger', 'begärt,', 'håller', 'en', 'tyst', 'minut', 'för', 'offren', 'för', 'bl.a.', 'stormarna', 'i', 'de', 'länder', 'i', 'europeiska', 'unionen', 'som', 'drabbats.'], ['jag', 'ber', 'er', 'resa', 'er', 'för', 'en', 'tyst', 'minut.'], 

                                                                                

## Question A3

In [6]:
from operator import add

def wordFreq(ppLines):
    return (ppLines.filter(lambda x: len(x) > 0)       # filter out empty lines
        .flatMap(lambda x: x)                          # flatMap to single words
        .filter(lambda x: len(x) > 0)                  # filter out empty words
        .map(lambda x: (x,1))                          # create (word, 1) pairs
        .reduceByKey(add)                              # reduce the key-value pair by adding up
        .sortBy(lambda x: x[1])                        # sort by value
        .collect()                                     # collect the result
        [-10:])                                        # take the last 10 entries

print(wordFreq(ppLinesEng))
print(wordFreq(ppLinesSV))

                                                                                

[('we', 522849), ('for', 534242), ('is', 758050), ('a', 773522), ('that', 797516), ('in', 1085993), ('and', 1288401), ('to', 1539760), ('of', 1659758), ('the', 3498375)]


                                                                                

[('vi', 539797), ('en', 620310), ('är', 694381), ('av', 738068), ('för', 908680), ('som', 913276), ('det', 924866), ('i', 1050774), ('och', 1344830), ('att', 1706293)]


### A 3.2

#### English
The 10 most common words used in the English language is 

1. the
2. of
3. and
4. a
5. to
6. in
7. is
8. you
9. that
10. it

We can expect some deviation here as the texts are transcripts from the European Parliament, but overall, the result makes sense as we see a lot of the same words appearing in the top 10. Aditionally, all of the most common words collected appear in the top 40 list of the 100 most common English words. The word that deviates the most is the word "we" which appears at rank 36.

Source: https://www.espressoenglish.net/the-100-most-common-words-in-english/

#### Swedish

For the Swedish language, all of the words except for "vi" appears in the top 10 most common words used in the Swedish language where "vi" appears at rank 25. This makes sense as "vi" is the translation of the word "we", confirming that this slight outlier sees a lot of use in the transcripts for both languages.

Source: https://larare.at/svenska/moment/lingvistik/vanligaste_orden_i_svenska_spraket.html

## Question A.4

In [7]:
# 1 - Key the lines by their line number
en1 = ppLinesEng.zipWithIndex()
sv1 = ppLinesSV.zipWithIndex()

print(sv1.take(2))

[Stage 22:>                                                         (0 + 1) / 1]

[(['återupptagande', 'av', 'sessionen'], 0), (['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'], 1)]


                                                                                

In [8]:
# 2 - Swap the key and value
en2 = en1.map(lambda x: (x[1], x[0]))
sv2 = sv1.map(lambda x: (x[1], x[0]))

print(sv2.take(2))

[Stage 23:>                                                         (0 + 1) / 1]

[(0, ['återupptagande', 'av', 'sessionen']), (1, ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'])]


                                                                                

In [9]:
# 3 - Join the two RDDS
joined = en2.join(sv2)
print(joined.take(2))

[Stage 25:>                                                         (0 + 1) / 1]

[(1985, (['fourthly,', 'we', 'need', 'the', 'shipowner', 'to', 'bear', 'liability,', 'and', 'not', 'just', 'to', 'the', 'tune', 'of', 'a', 'derisory', 'usd', '12', 'million,', 'but', 'to', 'the', 'value', 'of', 'at', 'least', 'usd', '400', 'million,', 'to', 'be', 'covered', 'per', 'insurance', 'certificate.'], ['för', 'det', 'fjärde', 'måste', 'det', 'finnas', 'ett', 'ansvar', 'hos', 'fartygsägaren,', 'och', 'inte', 'bara', 'skrattretande', '12', 'miljoner', 'dollar,', 'utan', 'minst', '400', 'miljoner', 'dollar,', 'vilket', 'han', 'måste', 'styrka', 'med', 'hjälp', 'av', 'försäkringsbevis.'])), (26795, (['b5-0283/2000', 'by', 'mr', 'elles', 'and', 'others', 'on', 'behalf', 'of', 'the', 'group', 'of', 'the', 'european', "people'", 's', 'party', '(christian', 'democrats)', 'and', 'european', 'democrats;'], ['b5-0283/2000', 'av', 'elles', 'med', 'flera', 'för', 'europeiska', 'folkpartiets', 'grupp', '(kristdemokrater)', 'och', 'europademokrater;']))]


                                                                                

In [10]:
# 4 - Filter missing corresponding sentences
join2 = joined.filter(lambda x: len(x[1][0]) > 1)
join3 = join2.filter(lambda x: len(x[1][1]) > 1)

print(joined.count())
print(join2.count())
print(join3.count())

                                                                                

1862234


                                                                                

1840400




1835711


                                                                                

In [11]:
# 5 - Filter for sentences with less than 10 words
join4 = join3.filter(lambda x: len(x[1][0]) < 10 and len(x[1][1]) < 10)
print(join4.count())



188176


                                                                                

In [12]:
# 6 - Filter to leave only equally worded sentences
join5 = join4.filter(lambda x: len(x[1][0]) == len(x[1][1]))
print(join5.count())



63913


                                                                                

In [13]:
# 7 - Pair the words in the sentences
join6 = join5.map(lambda x: list(zip(x[1][0], x[1][1])))
join6.take(2)

                                                                                

[[('the', 'jag'),
  ('debate', 'förklarar'),
  ('is', 'debatten'),
  ('closed.', 'avslutad.')],
 [('that', 'detta'),
  ('is', 'är'),
  ('an', 'en'),
  ('important', 'viktig'),
  ('task.', 'uppgift.')]]

In [14]:
# 8 & 9 - Reduce to count word-translation pair occurrences, print most frequent word-pairs
join7 = (join6.flatMap(lambda x : x)
        .map(lambda x: (x,1))
        .reduceByKey(add)
        .sortBy(lambda x: x[1])
        .collect()
        [-10:])
join7

                                                                                

[(('not', 'inte'), 2650),
 (('that', 'det'), 2806),
 (('it', 'det'), 2866),
 (('and', 'och'), 2882),
 (('a', 'en'), 2888),
 (('closed.', 'avslutad.'), 2964),
 (('this', 'detta'), 3252),
 (('i', 'jag'), 5020),
 (('we', 'vi'), 5530),
 (('is', 'är'), 10040)]

## Result

The top 10 most frequently occuring word pairs are all correctly translated, concluding that the translations indeed seem reasonable.