# Prerequisites

In [None]:
# Update packages and install required java version
!apt-get update
!apt-get install openjdk-21-jdk-headless -qq > /dev/null

# download and unzip spark
!wget -nc -q https://downloads.apache.org/spark/spark-4.0.0/spark-4.0.0-bin-hadoop3.tgz
!tar xf spark-4.0.0-bin-hadoop3.tgz

# get data for labs
!wget -nc -O around_the_world_in_80_days.txt https://www.gutenberg.org/ebooks/103.txt.utf-8

# install findspark
!pip install -q findspark

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connecting to security.                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
                                                                               Get:3 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connectin                                                                               Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
0% [Waiting for headers] [Waiting for headers] [4 InRelease 6,555 B/6,555 B 1000% [Waiting for headers] [Waiting for headers] [Connected to ppa.launchpadconte                                                                               Hit:5 http://archi

In [None]:
import os
import findspark

# Tell the kernel where the Java and Spark installs from the setup cell live
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-21-openjdk-amd64",
        "SPARK_HOME": "/content/spark-4.0.0-bin-hadoop3",
    }
)

# Start findspark so the notebook can interact with Spark
findspark.init()


In [None]:
# what does findspark do? use the ?? magic command to find out
# Note 1: in colab, this may open in a side panel
# Note 2: this magic command is often helpful when encountering an object in a
# notebook that is unfamiliar. More information will be displayed if it exists
?? findspark

# 1. Word Count

Instructions:  
For each cell marked "double-click and add explanation here" please answer the question in your own words.  
In the section where you complete the code to perform basic NLP text cleaning and exploration tasks, the goal is to chain all of the transformations together in a single function. For learning and exploration purposes, it is acceptable to have each step separate, but the last cell in this section should be one function with all transformations chained together.  
For steps c and f, it is acceptable to use your favorite chatbot to generate a list of common stop words (c) and punctuation (f) for use in the code. As these are common steps in NLP/text-processing tasks, there are plenty of libraries to help with this, such as nltk, but there is no need to import extra dependencies for this lab unless you are already familiar with working with them.

In [None]:
# start a spark session and create spark context for making rdd
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("word_count")
    .getOrCreate()
)

# The Spark context of this session is what the cells below use to build RDDs
sc = spark.sparkContext

In [None]:
# Define the RDD from the text file downloaded in the setup cell
rdd = sc.textFile("/content/around_the_world_in_80_days.txt")

In [None]:
# View the first 20 lines of the RDD
rdd.take(num=20)

['The Project Gutenberg eBook of Around the World in Eighty Days',
 '    ',
 'This ebook is for the use of anyone anywhere in the United States and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever. You may copy it, give it away or re-use it under the terms',
 'of the Project Gutenberg License included with this ebook or online',
 'at www.gutenberg.org. If you are not located in the United States,',
 'you will have to check the laws of the country where you are located',
 'before using this eBook.',
 '',
 'Title: Around the World in Eighty Days',
 '',
 'Author: Jules Verne',
 '',
 'Release date: January 1, 1994 [eBook #103]',
 '                Most recently updated: October 29, 2024',
 '',
 'Language: English',
 '',
 '']

In [None]:
# Example lambda function: split every line on single spaces
words = rdd.flatMap(lambda line: line.split(" "))

In [None]:
# Exercise: note what evaluating the bare RDD variable displays, and explain
# that output in the markdown cell that follows.
words

PythonRDD[3] at RDD at PythonRDD.scala:56

double-click and add explanation here

<ADD EXPLANATION HERE>

In [None]:
# Exercise: note and explain the output of the following command, focusing on
# how it differs from the output of the bare `words` expression above.
words.collect()

['The',
 'Project',
 'Gutenberg',
 'eBook',
 'of',
 'Around',
 'the',
 'World',
 'in',
 'Eighty',
 'Days',
 '',
 '',
 '',
 '',
 '',
 'This',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'in',
 'the',
 'United',
 'States',
 'and',
 'most',
 'other',
 'parts',
 'of',
 'the',
 'world',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever.',
 'You',
 'may',
 'copy',
 'it,',
 'give',
 'it',
 'away',
 'or',
 're-use',
 'it',
 'under',
 'the',
 'terms',
 'of',
 'the',
 'Project',
 'Gutenberg',
 'License',
 'included',
 'with',
 'this',
 'ebook',
 'or',
 'online',
 'at',
 'www.gutenberg.org.',
 'If',
 'you',
 'are',
 'not',
 'located',
 'in',
 'the',
 'United',
 'States,',
 'you',
 'will',
 'have',
 'to',
 'check',
 'the',
 'laws',
 'of',
 'the',
 'country',
 'where',
 'you',
 'are',
 'located',
 'before',
 'using',
 'this',
 'eBook.',
 '',
 'Title:',
 'Around',
 'the',
 'World',
 'in',
 'Eighty',
 'Days',
 '',
 'Author:',
 'Jules'

double-click and add explanation here

In [None]:
# Nicer print: one word per line. Only a bounded sample is printed, because
# collecting and printing every word of the book exceeds the notebook's
# output limit and the display gets truncated.
for w in words.take(50):
    print(w)

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
was
drawing
near
his
last
turning-point.
The
bonds
were
quoted,
no
longer
at
a
hundred
below
par,
but
at
twenty,
at
ten,
and
at
five;
and
paralytic
old
Lord
Albemarle
bet
even
in
his
favour.

A
great
crowd
was
collected
in
Pall
Mall
and
the
neighbouring
streets
on
Saturday
evening;
it
seemed
like
a
multitude
of
brokers
permanently
established
around
the
Reform
Club.
Circulation
was
impeded,
and
everywhere
disputes,
discussions,
and
financial
transactions
were
going
on.
The
police
had
great
difficulty
in
keeping
back
the
crowd,
and
as
the
hour
when
Phileas
Fogg
was
due
approached,
the
excitement
rose
to
its
highest
pitch.

The
five
antagonists
of
Phileas
Fogg
had
met
in
the
great
saloon
of
the
club.
John
Sullivan
and
Samuel
Fallentin,
the
bankers,
Andrew
Stuart,
the
engineer,
Gauthier
Ralph,
the
director
of
the
Bank
of
England,
and
Thomas
Flanagan,
the
brewer,
one
and
all
waited
anxiously.

When


In [None]:
# Show the first 20 words
words.take(num=20)

['The',
 'Project',
 'Gutenberg',
 'eBook',
 'of',
 'Around',
 'the',
 'World',
 'in',
 'Eighty',
 'Days',
 '',
 '',
 '',
 '',
 '',
 'This',
 'ebook',
 'is',
 'for']

In [None]:
# Use cell magic command to help understand what the rdd.flatMap function is doing in the next cell.
# Insert a text/markdown cell and explain in your own words.

In [None]:
# Initialize a word counter by pairing every word with a count of 1.
# The pairs get their own name so that `words` keeps holding plain strings:
# later cells call string methods such as .lower() on its elements, which
# would fail on (word, 1) tuples after a Restart & Run All.
word_pairs = rdd.flatMap(lambda line: line.split(' ')) \
                .map(lambda word: (word, 1))

# Preview a bounded sample instead of printing every pair in the book
for pair in word_pairs.take(20):
    print(pair)

In [None]:
import string
from collections import Counter
# a. count the occurrences of each word
# Bring the data back into Python memory on the driver, then count it there.
# NOTE(review): fine for a single book; for larger data, counting inside
# Spark would avoid collecting every token.
words_list = words.collect()
words_counter = Counter(words_list)

# Show only the most frequent entries: printing the whole Counter floods the
# cell output and gets truncated
words_counter.most_common(20)


Counter({'the': 4316, '': 2182, 'of': 1875, 'and': 1792, 'to': 1690, 'a': 1261, 'in': 991, 'was': 974, 'his': 804, 'he': 720, 'at': 576, 'with': 550, 'not': 500, 'had': 484, 'The': 482, 'on': 482, 'that': 460, 'which': 414, 'for': 407, 'as': 396, 'by': 377, 'Mr.': 371, 'Fogg': 331, 'it': 322, 'be': 307, 'from': 302, 'were': 300, 'this': 292, 'is': 288, 'would': 266, 'have': 259, 'you': 243, 'an': 222, 'Passepartout': 219, 'but': 216, 'Phileas': 214, 'He': 208, 'I': 207, 'or': 185, 'they': 183, 'him': 181, 'who': 179, 'are': 169, 'been': 167, 'said': 155, 'her': 146, 'their': 145, 'It': 139, 'if': 138, 'could': 136, 'its': 133, 'all': 131, 'Fogg,': 130, 'one': 125, 'no': 124, 'Fix': 122, 'did': 119, 'Passepartout,': 117, '“I': 115, 'so': 112, 'upon': 112, 'more': 109, 'will': 108, 'about': 102, 'only': 96, 'when': 96, 'two': 91, 'up': 91, 'out': 90, 'my': 90, 'hundred': 90, 'some': 90, 'A': 89, 'replied': 89, 'now': 87, 'himself': 85, 'into': 84, 'after': 84, 'thousand': 83, 'without': 

In [None]:
# b. a common first step in text analysis: convert every word to lower case
words = words.map(lambda token: token.lower())
words.take(num=20)

['the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'around',
 'the',
 'world',
 'in',
 'eighty',
 'days',
 '',
 '',
 '',
 '',
 '',
 'this',
 'ebook',
 'is',
 'for']

In [None]:
# c. eliminate the stop words.
# Simplified stop-word collection (very frequent but uninformative words).
# The empty string is part of it, so blank tokens are treated as stop words.
stopwords = {
    "the", "and", "is", "in", "it", "of", "to",
    "a", "i", "that", "for", "on", "you", "with",
    "as", "at", "be", "this", "have", "or", "by",
    "an", "not", "are", "from", "but", "they", "",
}

# Filter the words, keeping only those that are not stop words
# (`stopwords` is a Python set)
filtered_words = words.filter(lambda token: token not in stopwords)
filtered_words.take(num=20)


['project',
 'gutenberg',
 'ebook',
 'around',
 'world',
 'eighty',
 'days',
 'ebook',
 'use',
 'anyone',
 'anywhere',
 'united',
 'states',
 'most',
 'other',
 'parts',
 'world',
 'no',
 'cost',
 'almost']

In [None]:
# d. sort in alphabetical order (each word is used as its own sort key)
sorted_alpha = filtered_words.sortBy(keyfunc=lambda token: token)
sorted_alpha.take(num=20)

['#103]',
 '$5,000)',
 '&c.,',
 '($1',
 '(801)',
 '(a)',
 '(and',
 '(any',
 '(b)',
 '(c)',
 '(does',
 '(if',
 '(japan),',
 '(or',
 '(or',
 '(or',
 '(saturday,',
 '(sort',
 '(sunday)',
 '(trademark/copyright)']

In [None]:
# e. sort descending by word frequency
# Count the occurrences: pair each word with 1, then add the 1s up per word
word_counts = (
    filtered_words
    .map(lambda w: (w, 1))
    .reduceByKey(lambda a, b: a + b)
)

# Sort by decreasing frequency (x[1] is the count of the (word, count) pair)
sorted_by_freq = word_counts.sortBy(lambda x: x[1], ascending=False)

# Display only the most frequent words: collecting and printing every
# distinct word floods the cell output
for word, count in sorted_by_freq.take(50):
    print(word, count)


was 986
he 930
his 855
had 503
which 490
mr. 373
fogg 365
were 303
would 274
phileas 250
passepartout 239
him 183
who 182
if 170
been 167
her 157
said 157
their 147
all 141
could 139
no 137
one 133
its 133
fogg, 132
fix 129
did 126
passepartout, 121
so 118
when 117
more 116
“i 115
upon 113
what 112
will 111
about 108
there 107
only 102
some 101
after 101
two 98
up 95
she 95
we 93
hundred 92
out 92
now 92
my 91
replied 89
himself 87
project 87
without 86
into 84
thousand 83
and, 83
time 82
before 81
made 81
like 80
any 76
chapter 74
than 74
train 74
your 73
them 73
fix, 72
very 71
going 70
these 70
do 70
where 69
seemed 69
great 68
being 67
found 66
twenty 66
then 66
took 66
fogg. 65
left 65
should 64
hong 63
other 62
must 62
aouda 61
hours 61
through 61
last 58
long 58
him, 58
master 58
between 57
still 57
english 57
steamer 56
might 55
passed 55
asked 55
“you 54
three 54
sir 54
gutenberg™ 54
miles 54
having 54
new 53
him. 53
returned 51
has 51
off 51
go 51
minutes 51
while 50
aouda, 5

In [None]:
# f. remove punctuations and blank spaces
import re

# Function that cleans a single word
def clean_word(w):
    """Return ``w`` without punctuation and without surrounding whitespace.

    Args:
        w: the token to clean (a string).

    Returns:
        The token with every character that is neither a word character nor
        whitespace removed, underscores removed as well, and leading and
        trailing whitespace stripped. The result may be the empty string.
    """
    # `\w` also matches the underscore, so `[^\w\s]` alone would leave
    # underscores in place; the `|_` alternative removes them too.
    w = re.sub(r'[^\w\s]|_', '', w)
    # Remove whitespace at the start and end
    return w.strip()

# Apply the cleaning function to every word, then drop empty results.
# Cleaning can turn a token such as "and," into the stop word "and", so the
# stop words are filtered out once more after the punctuation is removed.
cleaned_words = (
    filtered_words
    .map(clean_word)
    .filter(lambda w: w != "" and w not in stopwords)
)
cleaned_words.take(20)


# 2. What does the following cell block do?
Comment the code below line by line after the provided hash sign (`#`). You should be able to explain each line while respecting the PEP 8 style guide's limit of 79 characters per line!

In [None]:
 # Create an RDD of tuples (name, age)
dataRDD = sc.parallelize(
    [("Brooke", 20), ("Denny", 31), ("Jules", 30), ("TD", 35), ("Brooke", 25)]
)

# Build the average age per name in three chained steps
agesRDD = (
    dataRDD
    # (name, age) -> (name, (age, 1)): attach a count of 1 to every age
    .map(lambda rec: (rec[0], (rec[1], 1)))
    # Per name, add up the ages and the counts element-wise
    .reduceByKey(
        lambda left, right: (left[0] + right[0], left[1] + right[1])
    )
    # (name, (age_sum, count)) -> (name, age_sum / count)
    .map(lambda rec: (rec[0], rec[1][0] / rec[1][1]))
)

# Where to go from here

Further exploration for students who complete the lab before the end of the session or want to go further.

- Perform EDA on the original French version of the [book](https://www.gutenberg.org/ebooks/46541.txt.utf-8) and compare the two.
- Redo the exercises using the Docker install.
- Install Java and Spark directly onto your host machine and either re-explore this notebook or perform EDA on other data sets.
- Write a simple Python timer function to see how quickly your RDD pipeline runs as written. Change the order of the steps to make the pipeline run as efficiently as possible.