For help, see the Spark RDD Programming Guide:
https://spark.apache.org/docs/latest/rdd-programming-guide.html

In [2]:
# List the top-level DBFS entries to see which datasets come pre-loaded
dbfs_root = dbutils.fs.ls('dbfs:/')
display(dbfs_root)

path,name,size
dbfs:/FileStore/,FileStore/,0
dbfs:/databricks-datasets/,databricks-datasets/,0
dbfs:/databricks-results/,databricks-results/,0
dbfs:/tmp/,tmp/,0


In [3]:
# Build an RDD with one element per line of the Spark README
# (sc is the SparkContext provided by the notebook)
readme_path = "dbfs:/databricks-datasets/SPARK_README.md"
rdd = sc.textFile(readme_path)

In [4]:
# Preview the first 20 lines of the file
rdd.take(num=20)

In [5]:
# Example: lambda functions
# Each README line is split on single spaces; flatMap flattens the
# per-line lists into one RDD of words.
words = rdd.flatMap(lambda line: line.split(" "))

# Bring every word back to the driver and print it on its own line
all_words = words.collect()
for word in all_words:
  print(word)

In [6]:
# Building on the words RDD from the previous cell:
# 1. count all the words
# Pair each word with a count of one ...
rdd2 = words.map(lambda word: (word, 1))
# ... then add the counts together per distinct word
rdd3 = rdd2.reduceByKey(lambda total, extra: total + extra)
for pair in rdd3.collect():
  print(pair)

In [7]:
# 2. change all capital letters to lower case
def Func(lines):
  """Lowercase the given text and return its whitespace-separated tokens as a list."""
  return lines.lower().split()
# Func returns a *list* of tokens for each input, so map() would produce an
# RDD of lists (e.g. ['apache'], []). flatMap flattens those lists so that
# rdd4 holds plain lowercase strings, which is what the string-based
# stop-word comparison applied to rdd4 afterwards needs in order to match.
rdd4 = words.flatMap(Func)
rdd4.take(20)

In [8]:
# 3. eliminate stopwords
# Small hand-picked list of words to drop
stopwords = [
  'is',
  'am',
  'are',
  'the',
  'for',
  'a',
]
# Keep only the elements that are not equal to any entry of the list
rdd5 = rdd4.filter(lambda token: token not in stopwords)
rdd5.take(10)

In [9]:
# 4. sort in alphabetical order
# rdd3 holds (word, count) pairs, so sorting by key orders them by word
rdd6 = rdd3.sortByKey(ascending=True)
sorted_pairs = rdd6.collect()
for pair in sorted_pairs:
  print(pair)

In [10]:
!pip install nltk

In [11]:
# 5. sort from most to least frequent word
# Swap (word, count) -> (count, word) so that the count becomes the sort key
rdd7 = rdd6.map(lambda pair: (pair[1], pair[0]))
# Descending sort on the count, then show the top 20
rdd7.sortByKey(ascending=False).take(20)

In [12]:
# 6.** remove punctuation
def lower_clean_str(x):
  """Return x lowercased with every character listed in punc removed."""
  punc='!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~-'
  # A table mapping each punctuation character to None deletes them all
  # in a single pass over the string.
  drop_punc = str.maketrans('', '', punc)
  return x.lower().translate(drop_punc)
# Imports for the stop-word step, grouped at the top of the cell. The corpus
# reader is imported under an alias so that the name `stopwords` only ever
# refers to the list of words built below.
import nltk
from nltk.corpus import stopwords as nltk_stopwords

# Normalise every line (lowercase, punctuation removed), then split into words
rdd = rdd.map(lower_clean_str)
rdd = rdd.flatMap(lambda line: line.split(" "))
# Filter out the empty strings left behind by consecutive spaces
rdd = rdd.filter(lambda word: word != '')
# Pair each word with a count of 1
count = rdd.map(lambda word: (word, 1))
# Apply reduceByKey to add up the counts per word (sorted by word afterwards)
count_RBK = count.reduceByKey(lambda x, y: x + y).sortByKey()
# To sort by frequency, first switch the (key, val) pairs to (val, key) so
# that the count becomes the key.
count_RBK = count_RBK.map(lambda x: (x[1], x[0]))
# The most common entries are words such as "the": these are stop words,
# which bring no value to our analysis.
# Printed explicitly because a notebook cell only echoes its last expression.
print(count_RBK.sortByKey(False).take(10))

# Fetch the NLTK English stop-word list
nltk.download('stopwords')
stopwords = nltk_stopwords.words('english')
# A set gives constant-time membership tests inside the filter
stopword_set = set(stopwords)
# x is (count, word): drop the stop words, then sort once by count, descending
count_RBK = count_RBK.filter(lambda x: x[1] not in stopword_set).sortByKey(False)
count_RBK.take(20)

In [13]:
# Create an RDD of tuples (name, age)
people = [
  ("Brooke", 20),
  ("Denny", 31),
  ("Jules", 30),
  ("TD", 35),
  ("Brooke", 25),
]
dataRDD = sc.parallelize(people)

# Try to understand what this code does (step by step)
# (name, age) -> (name, (age, 1))
age_and_one = dataRDD.map(lambda rec: (rec[0], (rec[1], 1)))
# Per name: add up the first components and the second components separately
sum_and_count = age_and_one.reduceByKey(
  lambda left, right: (left[0] + right[0], left[1] + right[1]))
# (name, (sum, count)) -> (name, sum / count)
agesRDD = sum_and_count.map(lambda rec: (rec[0], rec[1][0]/rec[1][1]))