# Prerequisites

In [None]:
# Update packages and install required java version
!apt-get update
!apt-get install openjdk-21-jdk-headless -qq > /dev/null

# download and unzip spark
!wget -nc -q https://downloads.apache.org/spark/spark-4.0.0/spark-4.0.0-bin-hadoop3.tgz
!tar xf spark-4.0.0-bin-hadoop3.tgz

# get data for labs
!wget -nc -O around_the_world_in_80_days.txt https://www.gutenberg.org/ebooks/103.txt.utf-8

# install findspark
!pip install -q findspark

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Connecting to security.                                                                               Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://cli.github.com/packages stable/main amd64 Packages [346 B]
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,006 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubun

In [None]:
import os
import findspark

# Point the Java and Spark environment variables at the locations
# installed by the setup cell.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-21-openjdk-amd64",
        "SPARK_HOME": "/content/spark-4.0.0-bin-hadoop3",
    }
)

# Initialise findspark so the notebook can interact with Spark.
findspark.init()


In [None]:
# What does findspark do? Use the ?? magic command to find out.
# Note 1: in Colab, the result may open in a side panel.
# Note 2: this magic command is often helpful when a notebook uses an object
# that is unfamiliar. More information is displayed when it exists.
?? findspark

# 1. Word Count

Instructions:  
For each cell marked "double-click and add explanation here" please answer the question in your own words.  
In the section where you complete the code to perform basic NLP text cleaning and exploration tasks, the goal is to chain all of the transformations together in a single function. For learning and exploration purposes, it is acceptable to keep each step separate, but the last cell in this section should be one function with all transformations chained together.  
For steps c and f, it is acceptable to use your favorite chatbot to generate a list of common stop words (c) and punctuation (f) for use in the code. As these are common steps in NLP/text processing tasks, there are plenty of libraries to help with this, such as nltk, but there is no need to import extra dependencies for this lab unless you are already familiar with working with them.

In [None]:
# Start (or reuse) a Spark session, then expose its SparkContext, which is
# what the RDD cells below are built from.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("word_count")
    .getOrCreate()
)

sc = spark.sparkContext

In [None]:
# Define the RDD from the book text downloaded in the setup cell
rdd = sc.textFile('/content/around_the_world_in_80_days.txt')

In [None]:
# View the first 20 lines of the RDD
rdd.take(20)

In [None]:
# Example lambda function: split every line into words on single spaces.
# Splitting on ' ' yields empty strings for blank lines and repeated
# spaces; they are still present here and are removed later in step f.
words = rdd.flatMap(lambda lines: lines.split(' '))

In [None]:
# Note and explain the output of the command below
words

# Explaining the command

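Ending a cell with a bare variable name displays that object's representation, much like printing it.

Here it displays the type of the variable "words", which is an RDD object, and also where it comes from: "PythonRDD.scala:56".


No words are shown because `flatMap` is a lazy transformation: Spark has only recorded what to do, and nothing is computed until an action such as `collect()` or `take()` is called.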

In [None]:
# Note and explain the output of the following command, focusing on how it
# differs from the command above
words.collect()

# Explaining the command


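The collect() function returns a list containing all the elements in the RDD words. Unlike the previous cell, which only displayed the RDD object, collect() is an action: it triggers the computation and brings every element back to the notebook as a Python list.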

In [None]:
# Nicer print: one word per line.
# collect() returns every element of the RDD, so this prints the whole book.
for w in words.collect():
    print(w)

In [None]:
# Print the first 20 words
words.take(20)

['The',
 'Project',
 'Gutenberg',
 'eBook',
 'of',
 'Around',
 'the',
 'World',
 'in',
 'Eighty',
 'Days',
 '',
 '',
 '',
 '',
 '',
 'This',
 'ebook',
 'is',
 'for']

In [None]:
# Use the ?? magic command to help understand what the rdd.flatMap function
# is doing in the next cell.
# Insert a text/markdown cell and explain it in your own words.

?? rdd.flatMap

# rdd.flatMap

rdd.flatMap is a method that takes a function as its argument, applies it to every element of the RDD, and flattens the results into a single new RDD. In the cell below, it splits the text lines into individual words; `map` then creates a (word, 1) tuple for each word, and the loop prints all the tuples.

In [None]:
# Initialize a word counter: pair every word with a count of 1
words = (
    rdd
    .flatMap(lambda line: line.split(' '))
    .map(lambda token: (token, 1))
)

# Print every (word, 1) pair, one per line
for pair in words.collect():
    print(pair)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('was', 1)
('drawing', 1)
('near', 1)
('his', 1)
('last', 1)
('turning-point.', 1)
('The', 1)
('bonds', 1)
('were', 1)
('quoted,', 1)
('no', 1)
('longer', 1)
('at', 1)
('a', 1)
('hundred', 1)
('below', 1)
('par,', 1)
('but', 1)
('at', 1)
('twenty,', 1)
('at', 1)
('ten,', 1)
('and', 1)
('at', 1)
('five;', 1)
('and', 1)
('paralytic', 1)
('old', 1)
('Lord', 1)
('Albemarle', 1)
('bet', 1)
('even', 1)
('in', 1)
('his', 1)
('favour.', 1)
('', 1)
('A', 1)
('great', 1)
('crowd', 1)
('was', 1)
('collected', 1)
('in', 1)
('Pall', 1)
('Mall', 1)
('and', 1)
('the', 1)
('neighbouring', 1)
('streets', 1)
('on', 1)
('Saturday', 1)
('evening;', 1)
('it', 1)
('seemed', 1)
('like', 1)
('a', 1)
('multitude', 1)
('of', 1)
('brokers', 1)
('permanently', 1)
('established', 1)
('around', 1)
('the', 1)
('Reform', 1)
('Club.', 1)
('Circulation', 1)
('was', 1)
('impeded,', 1)
('and', 1)
('everywhere', 1)
('disputes,', 1)
('discussions,', 1)
('and'

In [None]:
# a. count the occurrence of each word

word_count = {}

# Tally on the driver: each collected (word, 1) pair adds one to its word.
# NOTE(review): this pulls the whole RDD into Python; a reduceByKey on the
# RDD would keep the counting inside Spark.
for token, _ in words.collect():
    word_count[token] = word_count.get(token, 0) + 1

for token, total in word_count.items():
    print(token, total)

The 482
Project 79
Gutenberg 22
eBook 4
of 1875
Around 4
the 4316
World 3
in 991
Eighty 3
Days 3
 2182
This 46
ebook 2
is 288
for 407
use 16
anyone 6
anywhere 4
United 23
States 10
and 1792
most 43
other 59
parts 4
world 30
at 576
no 124
cost 9
with 550
almost 19
restrictions 2
whatsoever. 2
You 31
may 38
copy 9
it, 37
give 17
it 322
away 16
or 185
re-use 2
under 41
terms 21
License 8
included 3
this 292
online 4
www.gutenberg.org. 4
If 31
you 243
are 169
not 500
located 7
States, 6
will 108
have 259
to 1690
check 4
laws 11
country 18
where 65
before 77
using 6
eBook. 2
Title: 1
Author: 1
Jules 2
Verne 2
Release 1
date: 1
January 1
1, 1
1994 1
[eBook 1
#103] 1
Most 2
recently 1
updated: 1
October 11
29, 1
2024 1
Language: 1
English 57
*** 4
START 1
OF 49
THE 64
PROJECT 4
GUTENBERG 3
EBOOK 2
AROUND 4
WORLD 2
IN 81
EIGHTY 2
DAYS 2
[Illustration] 1
by 377
Contents 1
CHAPTER 74
I. 2
WHICH 76
PHILEAS 36
FOGG 34
AND 17
PASSEPARTOUT 20
ACCEPT 2
EACH 4
OTHER, 2
ONE 2
AS 6
MASTER, 4
OTHER 3
MAN

In [None]:
# b. a common first step in text analysis, change all capital letters to lower case
words_lower = {}
for word, count in word_count.items():
    lowered = word.lower()
    # Several spellings share one lowercase form ("The", "THE", "the"), so
    # their counts are added together rather than overwriting each other.
    words_lower[lowered] = words_lower.get(lowered, 0) + count

for w in words_lower:
    print(w, words_lower[w])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
yellow. 1
barber’s 1
shaved 1
old, 2
permitted 5
wear 1
yellow, 2
imperial 1
colour. 1
why, 1
funny. 1
“carnatic,” 6
walking 4
disappointed. 2
bad,” 1
club!” 2
accosted 2
merry 2
chagrin. 2
inveigh 1
come! 1
escape, 1
manage 2
detain 3
america?” 1
teeth. 2
persuade 1
yourself 2
us. 2
berth.” 1
cabins 1
persons. 2
tickets, 1
informed 3
repairs 2
completed, 3
announced. 1
suit 1
better,” 1
know.” 1
move; 1
keeping 8
invited 4
tavern 2
eye 2
entering, 1
handsomely 1
decorated, 1
camp-bed 1
cushions. 1
bed 5
sleep. 3
customers 1
drinking 2
beer, 1
porter, 3
brandy; 1
smoking, 1
pipes 3
stuffed 1
balls 2
essence 1
rose. 1
smokers, 1
narcotic, 2
sots. 1
smoking-house 1
wretched, 1
cadaverous, 1
idiotic 1
miserable 2
drug 1
opium, 2
million 1
pounds—thousands 1
despicable 1
vices 1
afflict 1
humanity! 1
evil 1
stringent 1
laws. 2
exclusively 1
classes, 1
ravages 1
arrested. 1
smoked 1
empire; 1
accustomed 2
dispense 1
suffering 

In [None]:
# c. eliminate the stop words.

words_lower_stop_words = {}

stop_words = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being",
    "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't",
    "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during",
    "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't",
    "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here",
    "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i",
    "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's",
    "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself",
    "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought",
    "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she",
    "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than",
    "that", "that's", "the", "their", "theirs", "them", "themselves", "then",
    "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've",
    "this", "those", "through", "to", "too", "under", "until", "up", "very", "was",
    "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what",
    "what's", "when", "when's", "where", "where's", "which", "while", "who",
    "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you",
    "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
    "yourselves"
]

# Constant-time membership checks; `stop_words` itself stays a list.
stop_word_set = set(stop_words)

for w in words_lower:
    # The book uses typographic apostrophes (’) while the stop list uses
    # straight ones ('), so normalise before comparing; otherwise
    # contractions in the text are never recognised as stop words.
    if w.replace("’", "'") not in stop_word_set:
        words_lower_stop_words[w] = words_lower[w]

for w in words_lower_stop_words:
    print(w, words_lower_stop_words[w])


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
flower-boats, 1
floating 3
parterres. 1
noticed 1
yellow. 1
barber’s 1
shaved 1
old, 2
permitted 5
wear 1
yellow, 2
imperial 1
colour. 1
why, 1
funny. 1
“carnatic,” 6
walking 4
disappointed. 2
bad,” 1
club!” 2
accosted 2
merry 2
chagrin. 2
inveigh 1
come! 1
escape, 1
manage 2
detain 3
america?” 1
teeth. 2
persuade 1
us. 2
berth.” 1
cabins 1
persons. 2
tickets, 1
informed 3
repairs 2
completed, 3
announced. 1
suit 1
better,” 1
know.” 1
move; 1
keeping 8
invited 4
tavern 2
eye 2
entering, 1
handsomely 1
decorated, 1
camp-bed 1
cushions. 1
bed 5
sleep. 3
customers 1
drinking 2
beer, 1
porter, 3
brandy; 1
smoking, 1
pipes 3
stuffed 1
balls 2
essence 1
rose. 1
smokers, 1
narcotic, 2
sots. 1
smoking-house 1
wretched, 1
cadaverous, 1
idiotic 1
miserable 2
drug 1
opium, 2
million 1
pounds—thousands 1
despicable 1
vices 1
afflict 1
humanity! 1
evil 1
stringent 1
laws. 2
exclusively 1
classes, 1
ravages 1
arrested. 1
smoked 1
empir

In [None]:
# d. sort in alphabetical order

# Dicts keep insertion order, so inserting the keys in sorted order gives an
# alphabetically ordered mapping.
words_lower_stop_words_sorted = {
    word: words_lower_stop_words[word]
    for word in sorted(words_lower_stop_words)
}

for word, count in words_lower_stop_words_sorted.items():
    print(word, count)


 2182
#103] 1
$5,000) 1
&c., 1
($1 1
(801) 1
(a) 1
(and 1
(any 1
(b) 1
(c) 1
(does 1
(if 1
(japan), 1
(or 3
(saturday, 1
(sort 1
(sunday) 1
(trademark/copyright) 1
(www.gutenberg.org), 1
(“the 1
*** 4
- 3
------- 1
. 3
..... 1
........ 1
......... 1
............. 2
................. 1
................... 1
.................... 1
............................................ 1
1, 1
1,000. 1
1,170, 1
1. 1
1.a. 1
1.b. 1
1.c 1
1.c. 1
1.d. 1
1.e 1
1.e. 1
1.e.1 3
1.e.1. 2
1.e.2. 1
1.e.3. 1
1.e.4. 1
1.e.5. 1
1.e.6. 1
1.e.7 2
1.e.7. 1
1.e.8 2
1.e.8. 2
1.e.9. 3
1.f. 1
1.f.1. 1
1.f.2. 1
1.f.3, 3
1.f.3. 2
1.f.4. 1
1.f.5. 1
1.f.6. 1
10th, 1
11 1
11.40 1
117, 1
117. 1
11th 1
11th, 5
11th?” 1
12th 3
13 2
13th 2
13th, 1
14th 2
14th, 1
14th; 1
1500 1
158½; 1
15th, 1
16th 1
1756, 1
17th 2
1814. 1
1825; 1
1837, 1
1839 1
1842; 1
1843, 1
1845, 1
1849—a 1
1853, 1
1862 1
1867, 1
1872, 2
18th, 1
1994 1
19th 1
2. 1
20% 1
2001, 1
2024 1
20th 1
20th, 2
20th. 1
20th—thus 1
21st 11
22 1
22nd, 1
22nd; 1
23rd 4
25th

In [None]:
# e. sort descending by word frequency

# The sort is stable, so words with equal counts keep their previous
# (alphabetical) relative order.
words_lower_stop_words_sorted_freq = dict(
    sorted(
        words_lower_stop_words_sorted.items(),
        key=lambda item: item[1],
        reverse=True,
    )
)

# Show only the head of the ranking: the first 12 entries.
for word, count in list(words_lower_stop_words_sorted_freq.items())[:12]:
    print(word, count)

 2182
mr. 371
fogg 331
passepartout 219
phileas 214
said 155
fogg, 130
passepartout, 117
“i 115
hundred 90
replied 89
now 87


In [None]:
# f. remove punctuations and blank spaces

punctuations = [
    "!", '"', "#", "$", "£", "€", '•', "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/",
    ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|",
    "}", "~", " ", "“", "—", "”", "‘", "’"
]

# Start from the frequency-sorted counts of step e (not the alphabetical
# ones of step d) and merge tokens that become identical once punctuation
# is stripped, e.g. "fogg," and "fogg": their counts are summed instead of
# the last one silently overwriting the others.
cleaned_counts: dict[str, int] = {}
for w, freq in words_lower_stop_words_sorted_freq.items():
    cleaned = w
    for p in punctuations:
        cleaned = cleaned.replace(p, "")
    cleaned = cleaned.strip()
    # Tokens made only of punctuation/whitespace (including the empty
    # string) end up empty and are dropped here.
    if cleaned:
        cleaned_counts[cleaned] = cleaned_counts.get(cleaned, 0) + freq

# Merging changes the totals, so restore the descending-frequency order.
# NOTE(review): tokens such as "the," only turn into stop words after this
# cleaning, so a few stop words reappear here; running step c after step f
# would remove them.
words_lower_stop_words_sorted_freq_punc: dict[str, int] = dict(
    sorted(cleaned_counts.items(), key=lambda item: item[1], reverse=True)
)

for w, freq in words_lower_stop_words_sorted_freq_punc.items():
    print(w, freq)



103 1
5000 1
c 1
1 1
801 1
a 2
and 1
any 1
b 1
does 2
if 5
japan 2
or 2
saturday 1
sort 17
sunday 2
trademarkcopyright 1
wwwgutenbergorg 4
the 2
1000 1
1170 1
1a 1
1b 1
1c 1
1d 1
1e 1
1e1 2
1e2 1
1e3 1
1e4 1
1e5 1
1e6 1
1e7 1
1e8 2
1e9 3
1f 1
1f1 1
1f2 1
1f3 2
1f4 1
1f5 1
1f6 1
10th 1
11 1
1140 1
117 1
11th 1
12th 3
13 2
13th 1
14th 1
1500 1
158½ 1
15th 1
16th 1
1756 1
17th 2
1814 1
1825 1
1837 1
1839 1
1842 1
1843 1
1845 1
1849a 1
1853 1
1862 1
1867 1
1872 2
18th 1
1994 1
19th 1
2 1
20 1
2001 1
2024 1
20th 1
20ththus 1
21st 1
22 1
22nd 1
23rd 4
25th 1
28th 1
29 1
2nd 1
3 1
30 1
30th 2
3rd 1
4 1
43 1
43the 1
48 1
4th 2
5 1
50 1
501c3 1
5961887 1
5th 1
6 1
635 1
60 1
646221541 1
6th 1
7 1
720 2
723 1
7th 3
840 1
845 1
80 1
809 1
84116 1
8th 1
9 1
90 2
9th 2
ebook 2
illustration 1
daily 9
detective 1
eastward 1
eighty 1
illustrated 2
must 3
pall 2
railway 1
seventynine 1
standard 1
suez 4
tabledhôte 1
this 1
times 1
very 1
visa 2
visaed 2
viâ 4
westward 1
am 2
abandon 5
abandoned 5
aband

# 2. What does the following cell block do?
Comment the code below line by line after the provided hash sign (#). You should be able to explain each line while respecting the PEP 8 style guide limit of 79 characters or fewer per line!

In [None]:
 # Create an RDD of tuples (name, age)
dataRDD = sc.parallelize([("Brooke", 20), ("Denny", 31), ("Jules", 30),
("TD", 35), ("Brooke", 25)])

# Try to understand what this code does (line by line)
agesRDD = (dataRDD
  # Turn each (name, age) record into (name, (age, 1)) for the aggregation
  .map(lambda x: (x[0], (x[1], 1)))
  # Per name, add up the ages (x[0] + y[0]) and the counts (x[1] + y[1])
  .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
  # Average age per name: divide the summed age by the number of records
  .map(lambda x: (x[0], x[1][0]/x[1][1])))

print(agesRDD.collect())



[('Brooke', 22.5), ('Denny', 31.0), ('Jules', 30.0), ('TD', 35.0)]


# Where to go from here

Further exploration for students who complete the lab before the end of the session or want to go further.

- perform eda on the original french version of the [book](https://www.gutenberg.org/ebooks/46541.txt.utf-8) and compare the two
- redo the exercises using the Docker install
- install Java and Spark directly onto the host machine and either re-explore this notebook or perform EDA on other data sets
- write a simple python timer function for seeing how quickly your rdd runs as written. change the order of the steps in order to make the rdd run as optimally as possible