# DSCI 617 – Homework 02
**Felix Asare**

In [0]:
# Import libraries
from pyspark.sql import SparkSession
import pandas as pd
from string import punctuation

In [0]:
# Create a SparkSession and spark context
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

## Problem 1: Word Count

In [0]:
#1 Reading in data
ws_lines = sc.textFile('/FileStore/tables/shakespeare_complete.txt')

#2.Creating ws_words RDD
import string
ws_words = (
    ws_lines
    .flatMap(lambda line: line.split(' '))  
    .flatMap(lambda word: word.split('-'))   
    .flatMap(lambda word: word.split('_'))  
    .flatMap(lambda word: word.split('.'))    
    .flatMap(lambda word: word.split(',')) 
    .flatMap(lambda word: word.split(':'))   
    .flatMap(lambda word: word.split('|'))  
    .flatMap(lambda word: word.split('\t'))
    .map(lambda word: word.strip(string.punctuation)) 
    .map(lambda word: word.strip('0123456789'))
    .map(lambda word: word.replace("'", ''))   
    .map(lambda word: word.lower())         
    .filter(lambda word: word != '') 
)

ws_words.persist()
# Create dist_words RDD
dist_words = ws_words.distinct()

# Print ws_words and dist_words
print('Total Number of Words:    {}'.format(ws_words.count()))
print('Number of Distinct Words: {}'.format(dist_words.count()))


Total Number of Words:    887279
Number of Distinct Words: 25362


In [0]:
# Collecting sample of words
sample_words = ws_words.sample(withReplacement=False, fraction=0.001)
print(sample_words.collect())

['in', 'thou', 'so', 'own', 'have', 'with', 'the', 'mayst', 'of', 'my', 'grant', 'womb', 'their', 'when', 'hath', 'whom', 'yet', 'perfumes', 'is', 'never', 'those', 'a', 'a', 'florentine', 'scorn', 'so', 'youll', 'you', 'his', 'the', 'sword', 'two', 'in', 'to', 'rousillon', 'very', 'have', 'letters', 'treasure', 'you', 'which', 'is', 'midnight', 'him', 'her', 'our', 'my', 'ear', 'greater', 'brother', 'son', 'she', 'it', 'you', 'tawny', 'pleasure', 'take', 'this', 'speak', 'be', 'get', 'well', 'antony', 'noble', 'caesar', 'exercise', 'who', 'caesar', 'even', 'and', 'but', 'he', 'duty', 'nails', 'oertake', 'dercetas', 'fetch', 'purchase', 'have', 'your', 'say', 'do', 'go', 'when', 'and', 'you', 'to', 'if', 'i', 'exeunt', 'it', 'it', 'good', 'religiously', 'the', 'great', 'and', 'youth', 'mother', 'nothing', 'her', 'have', 'thy', 'my', 'woman', 'her', 'at', 'you', 'will', 'carries', 'thou', 'centaur', 'get', 'deaf', 'second', 'your', 'you', 'have', 'me', 'i', 'better', 'cause', 'he', 'or'

## Problem 2: Longest Words

In [0]:
#1 Function to find the longest word
def longest_word(x, y):
  if len(x) > len(y):
    return x
  elif len(y) > len(x):
    return y
  else:
    return max(x, y)

longest = dist_words.reduce(longest_word)
print(longest)

honorificabilitudinitatibus


In [0]:
# 20 longest words
sorted_words = dist_words.sortBy(lambda x: -len(x))
print(sorted_words.take(20))

['honorificabilitudinitatibus', 'anthropophaginian', 'undistinguishable', 'indistinguishable', 'northamptonshire', 'superserviceable', 'incomprehensible', 'prognostication', 'unreconciliable', 'interrogatories', 'gioucestershire', 'extraordinarily', 'particularities', 'praeclarissimus', 'impossibilities', 'misconstruction', 'flibbertigibbet', 'circumscription', 'disproportioned', 'uncomprehensive']


## Problem 3: Word Frequency

In [0]:
#1 Create RDD pairs
pairs = ws_words.map(lambda x: (x, 1))

#2 Using reducebykey to group pairs
word_counts = pairs.reduceByKey(lambda x, y: x + y).sortBy(lambda x: -x[1])
            


#3 first 20 words of word_counts
words_20 = word_counts.take(20)

#4 Display the first 20 words of word_counts in a Dataframe
df = pd.DataFrame(words_20, columns=["Word", "Count"])
df


Unnamed: 0,Word,Count
0,the,27379
1,and,26082
2,i,20717
3,to,19661
4,of,17474
5,a,14723
6,you,13630
7,my,12489
8,in,10996
9,that,10915


## Problem 4: Removing Stop Words

In [0]:
#1 Stop Words file
sw_rdd = sc.textFile('/FileStore/tables/stopwords.txt')

#2 Print the number of stopwords
print(sw_rdd.count())

#3 Sampling stop words
print(sw_rdd.sample(withReplacement=False, fraction=0.05).collect())

#4 storing sw_rdd in a list sw
sw = sw_rdd.collect()

668
['any', 'apparently', 'are', 'co', 'could', 'do', "doesn't", 'downwards', 'due', 'eg', 'hereby', 'hundred', 'immediate', "isn't", 'itd', 'l', 'like', 'looks', 'moreover', 'necessary', 'now', 'nowhere', 'obtained', 'own', 'right', 'slightly', 'somethan', 'something', 'soon', 'sufficiently', 'sup', '', 'there', 'therein', 'thereto', 'thereupon', 'thoughh', 'usefully', 'vols', "we've", 'which', 'whoever', 'yours']


In [0]:
# Removing stopwords
ws_words_f = ws_words.filter(lambda x: x not in sw)

dist_words_f = ws_words_f.distinct()
print("Number of Distinct Non-Stop Words: ", dist_words_f.count())

Number of Distinct Non-Stop Words:  24840


In [0]:
# Repeating problem 4 with ws_words_f
# Creating RDD pairs from ws_words_f
ws_words_pairs = ws_words_f.map(lambda x: (x, 1))

#Using reduceByKey to count the number of times each word appears in ws_words_f
ws_words_counts = ws_words_pairs.reduceByKey(lambda x, y: x + y).sortBy(lambda x: -x[1])

#first 20 elements of ws_words_counts
df = pd.DataFrame(ws_words_counts.take(20), columns=['Word', 'Count'])

df


Unnamed: 0,Word,Count
0,will,4977
1,thy,4034
2,thee,3180
3,lord,3062
4,king,2871
5,good,2834
6,sir,2763
7,well,2553
8,enter,2350
9,love,2109


## Problem 5: Diamonds Dataset

In [0]:
# Read Diamond data
diamond_raw = sc.textFile("/FileStore/tables/diamonds.txt")

# Print the number of elements in the RDD
print("Number of elements in the RDD: ", diamond_raw.count())

Number of elements in the RDD:  53941


In [0]:
# first 5 elements of diamond_raw
for i in diamond_raw.take(5):
    print(i)

carat	cut	color	clarity	depth	table	price	x	y	z
0.23	Ideal	E	SI2	61.5	55	326	3.95	3.98	2.43
0.21	Premium	E	SI1	59.8	61	326	3.89	3.84	2.31
0.23	Good	E	VS1	56.9	65	327	4.05	4.07	2.31
0.29	Premium	I	VS2	62.4	58	334	4.2	4.23	2.63


In [0]:
# Function to tokenize data
def process_row(row):
    item = row.split('\t')
    return [float(item[0]), str(item[1]), str(item[2]), str(item[3]), 
            float(item[4]), float(item[5]), int(item[6]), float(item[7]),
             float(item[8]), float(item[9])]
# Filtering header row from diamond_raw
diamonds = (diamond_raw
            .filter(lambda x: 'carat' not in x)
            .map(process_row))

#First 5 elements of diamonds
for i in diamonds.take(5):
    print(i)

[0.23, 'Ideal', 'E', 'SI2', 61.5, 55.0, 326, 3.95, 3.98, 2.43]
[0.21, 'Premium', 'E', 'SI1', 59.8, 61.0, 326, 3.89, 3.84, 2.31]
[0.23, 'Good', 'E', 'VS1', 56.9, 65.0, 327, 4.05, 4.07, 2.31]
[0.29, 'Premium', 'I', 'VS2', 62.4, 58.0, 334, 4.2, 4.23, 2.63]
[0.31, 'Good', 'J', 'SI2', 63.3, 58.0, 335, 4.34, 4.35, 2.75]


## Problem 6: Grouped Means

In [0]:
#1 cut_summary list
cut_summary = (
    diamonds
    .map(lambda x: (x[1], (x[0], x[4], 1)))  
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
    .map(lambda x: (x[0], x[1][2], round(x[1][0] / x[1][2], 2), round(x[1][1] / x[1][2], 2)))
    .collect() 
)

#2 Pandas to display cut_summary
cut_df = pd.DataFrame(cut_summary, columns=["Cut", "Count", "Mean_Carat", "Mean_Price"])
cut_df

Unnamed: 0,Cut,Count,Mean_Carat,Mean_Price
0,Premium,13791,0.89,61.26
1,Very Good,12082,0.81,61.82
2,Good,4906,0.85,62.37
3,Fair,1610,1.05,64.04
4,Ideal,21551,0.7,61.71
