In [10]:
from pyspark.sql import SparkSession 

spark = (SparkSession
            .builder
            .appName("Demo App")
            .config("spark.ui.port", "4050")
            .getOrCreate()
            )

In [12]:
spark.sparkContext

In [13]:
type(spark)

pyspark.sql.session.SparkSession

In [14]:
spark

In [15]:
spark.sparkContext.setLogLevel('ERROR')

In [16]:
spark.read

<pyspark.sql.readwriter.DataFrameReader at 0xffff75278e10>

In [19]:
[i for i in dir(spark.read) if not i.startswith("_")]

['csv',
 'format',
 'jdbc',
 'json',
 'load',
 'option',
 'options',
 'orc',
 'parquet',
 'schema',
 'table',
 'text']

In [21]:
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt'

In [22]:
book = spark.read.text(DATASET_PATH)

In [23]:
book.show()

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
|most other parts ...|
|whatsoever. You m...|
|of the Project Gu...|
|www.gutenberg.org...|
|will have to chec...|
|   using this eBook.|
|                    |
|Title: Pride and ...|
|                    |
| Author: Jane Austen|
|                    |
|Release Date: Nov...|
|                    |
|   Language: English|
|                    |
|Produced by: Chuc...|
|             http...|
+--------------------+
only showing top 20 rows



                                                                                

In [24]:
book.printSchema()

root
 |-- value: string (nullable = true)



In [28]:
book.show(10, truncate=False, vertical=False)

+------------------------------------------------------------------------+
|value                                                                   |
+------------------------------------------------------------------------+
|The Project Gutenberg eBook of Pride and prejudice, by Jane Austen      |
|                                                                        |
|This eBook is for the use of anyone anywhere in the United States and   |
|most other parts of the world at no cost and with almost no restrictions|
|whatsoever. You may copy it, give it away or re-use it under the terms  |
|of the Project Gutenberg License included with this eBook or online at  |
|www.gutenberg.org. If you are not located in the United States, you     |
|will have to check the laws of the country where you are located before |
|using this eBook.                                                       |
|                                                                        |
+------------------------

In [27]:
book.show(10, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------
 value | The Project Gutenberg eBook of Pride and prejudice, by Jane Austen       
-RECORD 1-------------------------------------------------------------------------
 value |                                                                          
-RECORD 2-------------------------------------------------------------------------
 value | This eBook is for the use of anyone anywhere in the United States and    
-RECORD 3-------------------------------------------------------------------------
 value | most other parts of the world at no cost and with almost no restrictions 
-RECORD 4-------------------------------------------------------------------------
 value | whatsoever. You may copy it, give it away or re-use it under the terms   
-RECORD 5-------------------------------------------------------------------------
 value | of the Project Gutenberg License included with this eBook or online at   
-REC

In [31]:
from pyspark.sql import functions as F 

lines = book.select(F.split(book.value, ' ').alias('line'))

## Different ways to select columns 

In [33]:
book.select(book.value).show(3)

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
+--------------------+
only showing top 3 rows



In [34]:
book.select(book['value']).show(3)

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
+--------------------+
only showing top 3 rows



In [35]:
book.select(F.col('value')).show(3)

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
+--------------------+
only showing top 3 rows



In [38]:
book.select('value').show(3)

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
+--------------------+
only showing top 3 rows



In [46]:
lines.show()

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[most, other, par...|
|[whatsoever., You...|
|[of, the, Project...|
|[www.gutenberg.or...|
|[will, have, to, ...|
|[using, this, eBo...|
|                  []|
|[Title:, Pride, a...|
|                  []|
|[Author:, Jane, A...|
|                  []|
|[Release, Date:, ...|
|                  []|
|[Language:, English]|
|                  []|
|[Produced, by:, C...|
|[, , , , , , , , ...|
+--------------------+
only showing top 20 rows



In [52]:
# lines = book.select(F.split(book.value, ' ').alias('line'))
book.show(10, False)

+------------------------------------------------------------------------+
|value                                                                   |
+------------------------------------------------------------------------+
|The Project Gutenberg eBook of Pride and prejudice, by Jane Austen      |
|                                                                        |
|This eBook is for the use of anyone anywhere in the United States and   |
|most other parts of the world at no cost and with almost no restrictions|
|whatsoever. You may copy it, give it away or re-use it under the terms  |
|of the Project Gutenberg License included with this eBook or online at  |
|www.gutenberg.org. If you are not located in the United States, you     |
|will have to check the laws of the country where you are located before |
|using this eBook.                                                       |
|                                                                        |
+------------------------

In [53]:
lines.show()

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[most, other, par...|
|[whatsoever., You...|
|[of, the, Project...|
|[www.gutenberg.or...|
|[will, have, to, ...|
|[using, this, eBo...|
|                  []|
|[Title:, Pride, a...|
|                  []|
|[Author:, Jane, A...|
|                  []|
|[Release, Date:, ...|
|                  []|
|[Language:, English]|
|                  []|
|[Produced, by:, C...|
|[, , , , , , , , ...|
+--------------------+
only showing top 20 rows



In [62]:
lines.select(F.explode(F.col('line')).alias('word')).show()

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     eBook|
|        of|
|     Pride|
|       and|
|prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



In [63]:
words = lines.select(F.explode(F.col('line')).alias('word'))

In [64]:
words.show()

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     eBook|
|        of|
|     Pride|
|       and|
|prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



In [65]:
# words.select(F.lower(F.col('word')).alias('word_lower')).show()
words.select(F.lower(F.col("word")).alias("word_lower")).show()

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
|prejudice,|
|        by|
|      jane|
|    austen|
|          |
|      this|
|     ebook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



In [67]:
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))

In [70]:
words_lower.show(2)

+----------+
|word_lower|
+----------+
|       the|
|   project|
+----------+
only showing top 2 rows



In [71]:
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))

In [73]:
words_clean.filter(F.col('word') != '').show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows



In [76]:
words_nonull = words_clean.filter(F.col('word') != '')

In [77]:
words_nonull.show(2)

+-------+
|   word|
+-------+
|    the|
|project|
+-------+
only showing top 2 rows



In [78]:
groups = words_nonull.groupby(F.col('word'))

In [79]:
groups

<pyspark.sql.group.GroupedData at 0xffff752db590>

In [81]:
groups.count().show()

+-----------+-----+
|       word|count|
+-----------+-----+
|     online|    5|
|       hope|  126|
| palpitated|    1|
|    solaced|    1|
|    elevate|    1|
|  solemnity|    5|
|     spared|    9|
|    courted|    2|
|ingratitude|    1|
|      parts|    8|
|   positive|    4|
|    highest|   10|
|      hurry|   11|
|      oddly|    1|
|   laughing|   10|
|     speedy|    2|
|   slightly|    4|
|      scorn|    1|
|      staff|    1|
|    explain|    7|
+-----------+-----+
only showing top 20 rows



                                                                                

In [82]:
results = groups.count()

In [83]:
results.show()

+-----------+-----+
|       word|count|
+-----------+-----+
|     online|    5|
|       hope|  126|
| palpitated|    1|
|    solaced|    1|
|    elevate|    1|
|  solemnity|    5|
|     spared|    9|
|    courted|    2|
|ingratitude|    1|
|      parts|    8|
|   positive|    4|
|    highest|   10|
|      hurry|   11|
|      oddly|    1|
|   laughing|   10|
|     speedy|    2|
|   slightly|    4|
|      scorn|    1|
|      staff|    1|
|    explain|    7|
+-----------+-----+
only showing top 20 rows



                                                                                

In [85]:
results.orderBy(F.col('count').desc()).show()

+----+-----+
|word|count|
+----+-----+
| the| 4803|
|  to| 4374|
|  of| 3949|
| and| 3685|
| her| 2254|
|   a| 2063|
|  in| 2024|
| was| 1870|
|   i| 1778|
| she| 1703|
|that| 1540|
|  it| 1533|
| not| 1505|
| you| 1317|
|  he| 1316|
| his| 1278|
|  be| 1278|
|  as| 1223|
| had| 1179|
|with| 1140|
+----+-----+
only showing top 20 rows



In [86]:
results_2 = results.orderBy(F.col('count').desc())

In [87]:
results_2.show()

[Stage 41:>                                                         (0 + 1) / 1]

+----+-----+
|word|count|
+----+-----+
| the| 4803|
|  to| 4374|
|  of| 3949|
| and| 3685|
| her| 2254|
|   a| 2063|
|  in| 2024|
| was| 1870|
|   i| 1778|
| she| 1703|
|that| 1540|
|  it| 1533|
| not| 1505|
| you| 1317|
|  he| 1316|
| his| 1278|
|  be| 1278|
|  as| 1223|
| had| 1179|
|with| 1140|
+----+-----+
only showing top 20 rows



                                                                                

In [88]:
PROCESSED_DATA_PATH = 's3://data-engg-suman/processed_data/book-1'

In [89]:
results_2.coalesce(1).write.csv(PROCESSED_DATA_PATH)

                                                                                

In [96]:
DATA_PATH = 's3://data-engg-suman/processed_data/book-1/part-00000-bb3b130d-9ec8-47c9-8727-9475ed33fee0-c000.csv'

In [97]:
results_2_verify = spark.read.csv(DATA_PATH)

In [98]:
results_2_verify.show()

+----+----+
| _c0| _c1|
+----+----+
| the|4803|
|  to|4374|
|  of|3949|
| and|3685|
| her|2254|
|   a|2063|
|  in|2024|
| was|1870|
|   i|1778|
| she|1703|
|that|1540|
|  it|1533|
| not|1505|
| you|1317|
|  he|1316|
| his|1278|
|  be|1278|
|  as|1223|
| had|1179|
|with|1140|
+----+----+
only showing top 20 rows



# SparkSession

In [18]:
from pyspark.sql import SparkSession 
from pyspark.sql import functions as F

spark = (SparkSession
            .builder
            .appName("Demo App")
            .config("spark.ui.port", "4050")
            .getOrCreate()
            )

spark.sparkContext.setLogLevel('ERROR')

# Exercise 2.2

In [3]:
exo2_2_df = spark.createDataFrame(
                                [["test", "more test", 10_000_000_000]], ["one", "two", "three"]
                    )

In [4]:
exo2_2_df.show()

+----+---------+-----------+
| one|      two|      three|
+----+---------+-----------+
|test|more test|10000000000|
+----+---------+-----------+



In [11]:
exo2_2_df.dtypes

[('one', 'string'), ('two', 'string'), ('three', 'bigint')]

## Solution 

In [17]:
count = sum([ 1 for i in exo2_2_df.dtypes if i[1] != 'string'])
print(count)

1


# Exercise 2.3

In [22]:
DATA_PATH = 's3://data-engg-suman/dataset/all-book-sataset/gutenberg_books/1342-0.txt'

exo2_3_df = ( spark
                .read
                .text(DATA_PATH)
                .select(F.length(F.col('value')))
                .withColumnRenamed("length(value)", "number_of_char")
) 

exo2_3_df.show(5)

+--------------+
|number_of_char|
+--------------+
|            66|
|             0|
|            64|
|            68|
|            67|
+--------------+
only showing top 5 rows



## Solution

In [25]:
DATA_PATH = 's3://data-engg-suman/dataset/all-book-sataset/gutenberg_books/1342-0.txt'

exo2_3_df = ( spark
                .read
                .text(DATA_PATH)
                .select(F.length(F.col('value')).alias('number_of_char'))
) 

exo2_3_df.show(5)

+--------------+
|number_of_char|
+--------------+
|            66|
|             0|
|            64|
|            68|
|            67|
+--------------+
only showing top 5 rows



# Exercise 2.4

In [3]:
from pyspark.sql import SparkSession 
from pyspark.sql import functions as F

spark = (SparkSession
            .builder
            .appName("Demo App")
            .config("spark.ui.port", "4050")
            .getOrCreate()
            )

spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [None]:
from pyspark.sql.functions import col, greatest
exo2_4_df = spark.createDataFrame(
[["key", 10_000, 20_000]], ["key", "value1", "value2"]
)
exo2_4_df.printSchema()
# root
# |-- key: string (containsNull = true)
# |-- value1: long (containsNull = true)
# |-- value2: long (containsNull = true)
# `greatest` will return the greatest value of the list of column names,
# skipping null value
# The following statement will return an error
from pyspark.sql.utils import AnalysisException
try:
exo2_4_mod = exo2_4_df.select(
greatest(col("value1"), col("value2")).alias("maximum_value")
).select("key", "max_value")
except AnalysisException as err:
print(err)

In [10]:
exo2_4_df = spark.createDataFrame(
[["key", 10_000, 20_000]], ["key", "value1", "value2"]
)

In [11]:
exo2_4_df.columns

['key', 'value1', 'value2']

In [12]:
exo2_4_df.show()

+---+------+------+
|key|value1|value2|
+---+------+------+
|key| 10000| 20000|
+---+------+------+



## Solution

In [22]:
from pyspark.sql.utils import AnalysisException
from pyspark.sql import functions as F 

try:
    exo2_4_mod = exo2_4_df.select(
                                    F.greatest(F.col("value1"), F.col("value2")).alias("maximum_value")
                                    ).select("key", "max_value")
except AnalysisException as err:
    print(err)

Column 'key' does not exist. Did you mean one of the following? [maximum_value];
'Project ['key, 'max_value]
+- Project [greatest(value1#39L, value2#40L) AS maximum_value#100L]
   +- LogicalRDD [key#38, value1#39L, value2#40L], false



In [23]:
from pyspark.sql.utils import AnalysisException
from pyspark.sql import functions as F 

try:
    exo2_4_mod = exo2_4_df.select((F.greatest(F.col("value1"), F.col("value2"))).alias("maximum_value"), F.col('key'))
except AnalysisException as err:
    print(err)

exo2_4_mod.show()

+-------------+---+
|maximum_value|key|
+-------------+---+
|        20000|key|
+-------------+---+



# Exercise 2.5

Let’s take our words_nonull data frame, available in the next listing.

a) Remove all of the occurrences of the word is.

b) (Challenge) Using the length function, keep only the words with more than three
characters.

In [26]:
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt' 

book = spark.read.text(DATASET_PATH)
lines = book.select(F.split(book.value, ' ').alias('line'))
words = lines.select(F.explode(F.col('line')).alias('word'))
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))
words_nonull = words_clean.filter(F.col('word') != '')

In [28]:
words_nonull.count()

127438

In [35]:
words_nonull.filter(F.col('word') == 'is').count()

921

In [36]:
words_without_is = words_nonull.filter(F.col('word') != 'is')
words_without_is.count() 

126517

## Solution (a)

In [41]:
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt' 

book = spark.read.text(DATASET_PATH)
lines = book.select(F.split(book.value, ' ').alias('line'))
words = lines.select(F.explode(F.col('line')).alias('word'))
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))
words_nonull = words_clean.filter(F.col('word') != '')

words_without_is = words_nonull.filter(F.col('word') != 'is')

words_without_is.filter(F.col('word') == 'is')

DataFrame[word: string]

## Solution (b)

In [46]:
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt' 

book = spark.read.text(DATASET_PATH)
lines = book.select(F.split(book.value, ' ').alias('line'))
words = lines.select(F.explode(F.col('line')).alias('word'))
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))
words_nonull = words_clean.filter(F.col('word') != '')

words_len_more_3 = words_nonull.filter(F.length(F.col('word')) > 3)

words_len_more_3.show()


+---------+
|     word|
+---------+
|  project|
|gutenberg|
|    ebook|
|    pride|
|prejudice|
|     jane|
|   austen|
|     this|
|    ebook|
|   anyone|
| anywhere|
|   united|
|   states|
|     most|
|    other|
|    parts|
|    world|
|     cost|
|     with|
|   almost|
+---------+
only showing top 20 rows



# Exercise 2.6

## Solution

In [59]:
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt' 
stopping_words = "is not the if".split(" ")

book = spark.read.text(DATASET_PATH)
lines = book.select(F.split(book.value, ' ').alias('line'))
words = lines.select(F.explode(F.col('line')).alias('word'))
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))
words_nonull = words_clean.filter(F.col('word') != '')

words_without_stopping_words = words_nonull.select(F.col('word')).where(~F.col('word').isin(stopping_words))
words_without_stopping_words.show(5)

+---------+
|     word|
+---------+
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
+---------+
only showing top 5 rows



# Exercise 2.7

In [60]:
from pyspark.sql.functions import col, split
from pyspark.sql import functions as F 

DATASET_PATH = 's3://data-engg-suman/dataset/all-book-sataset/gutenberg_books/1342-0.txt' 
 
try:
    book = spark.read.text(DATASET_PATH)
    book = book.printSchema()
    lines = book.select(F.split(book.value, " ").alias("line"))
    words = lines.select(F.explode(col("line")).alias("word"))
except AnalysisException as err:
    print(err)

root
 |-- value: string (nullable = true)



AttributeError: 'NoneType' object has no attribute 'select'

## Solution

In [65]:
from pyspark.sql.functions import col, split
from pyspark.sql import functions as F 

DATASET_PATH = 's3://data-engg-suman/dataset/all-book-sataset/gutenberg_books/1342-0.txt' 
 
try:
    book = spark.read.text(DATASET_PATH)
    # book = book.printSchema()
    lines = book.select(F.split(book.value, " ").alias("line"))
    words = lines.select(F.explode(col("line")).alias("word"))
except AnalysisException as err:
    print(err)

words.show()

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     EBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



# Exercise 3.1

In [72]:
words_nonull.show(3)

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
+---------+
only showing top 3 rows



## Solution

In [80]:
# Option b 
words_nonull.select(F.length(F.col('word')).alias('length')).groupBy(F.col('length')).count().show()


+------+-----+
|length|count|
+------+-----+
|    12|  852|
|     1| 3879|
|     6| 9629|
|     3|29354|
|     4|22598|
|     8| 5326|
|    11| 1443|
|    14|  122|
|    16|    6|
|     5|12417|
|    15|   35|
|     2|24356|
|    13|  396|
|    17|    3|
|     7| 9090|
|     9| 5357|
|    10| 2575|
+------+-----+



                                                                                

# Exercise 3.3

In [91]:
# Dataset for all files inside the dataset folder 
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt' 

book = spark.read.text(DATASET_PATH)
lines = book.select(F.split(book.value, ' ').alias('line'))
words = lines.select(F.explode(F.col('line')).alias('word'))
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))
words_nonull = words_clean.filter(F.col('word') != '')

groups = words_nonull.groupby(F.col('word'))
results = groups.count()

final_result = results.count()

print(final_result, type(final_result)) 

6893 <class 'int'>


# Exercise 3.4

## Solution

In [104]:
# Dataset for all files inside the dataset folder 
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt' 

book = spark.read.text(DATASET_PATH)
lines = book.select(F.split(book.value, ' ').alias('line'))
words = lines.select(F.explode(F.col('line')).alias('word'))
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))
words_nonull = words_clean.filter(F.col('word') != '')

groups = words_nonull.groupby(F.col('word'))
results = groups.count()

unique_words = results.orderBy('count').select(F.col('word')).where(F.col('count') == 1)




# Exercise 3.5

1 Using the substring function (refer to PySpark’s API or the pyspark shell if
needed), return the top five most popular first letters (keep only the first letter
of each word).
2 Compute the number of words starting with a consonant or a vowel. (Hint: The
isin() function might be useful.)

In [120]:
# Dataset for all files inside the dataset folder 
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt' 

book = spark.read.text(DATASET_PATH)
lines = book.select(F.split(book.value, ' ').alias('line'))
words = lines.select(F.explode(F.col('line')).alias('word'))
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))
words_nonull = words_clean.filter(F.col('word') != '')

groups = words_nonull.groupby(F.col('word'))
results = groups.count()

In [121]:
results.show()

+-----------+-----+
|       word|count|
+-----------+-----+
|     online|    5|
|       hope|  126|
| palpitated|    1|
|    solaced|    1|
|    elevate|    1|
|  solemnity|    5|
|     spared|    9|
|    courted|    2|
|ingratitude|    1|
|      parts|    8|
|   positive|    4|
|    highest|   10|
|      hurry|   11|
|      oddly|    1|
|   laughing|   10|
|     speedy|    2|
|   slightly|    4|
|      scorn|    1|
|      staff|    1|
|    explain|    7|
+-----------+-----+
only showing top 20 rows



In [129]:
results.withColumn('first_letter', F.substring(F.col('word'), 1, 1)).show(5)

+----------+-----+------------+
|      word|count|first_letter|
+----------+-----+------------+
|    online|    5|           o|
|      hope|  126|           h|
|palpitated|    1|           p|
|   solaced|    1|           s|
|   elevate|    1|           e|
+----------+-----+------------+
only showing top 5 rows



## Solution 

In [139]:
# Solution of Part 1 

# Dataset for all files inside the dataset folder 
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt' 

book = spark.read.text(DATASET_PATH)
lines = book.select(F.split(book.value, ' ').alias('line'))
words = lines.select(F.explode(F.col('line')).alias('word'))
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))
words_nonull = words_clean.filter(F.col('word') != '')

groups = words_nonull.groupby(F.col('word'))
results = groups.count()


results.withColumn('first_letter', F.substring(F.col('word'), 1, 1)).groupby(F.col('first_letter')).sum().orderBy('sum(count)', ascending=False).show()

+------------+----------+
|first_letter|sum(count)|
+------------+----------+
|           t|     16844|
|           a|     14358|
|           h|     10734|
|           w|      9388|
|           s|      9073|
|           i|      8570|
|           o|      7596|
|           m|      6886|
|           b|      6277|
|           c|      5104|
|           f|      4419|
|           d|      4019|
|           n|      3579|
|           e|      3524|
|           p|      3427|
|           l|      3239|
|           r|      2552|
|           y|      2203|
|           g|      2018|
|           u|      1105|
+------------+----------+
only showing top 20 rows



## Solution 

In [168]:
# Solution of Part 2 

# Dataset for all files inside the dataset folder 
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt' 

book = spark.read.text(DATASET_PATH)
lines = book.select(F.split(book.value, ' ').alias('line'))
words = lines.select(F.explode(F.col('line')).alias('word'))
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))
words_nonull = words_clean.filter(F.col('word') != '')

vowel = list('aeiou')

results_with_first_letter = words_nonull.withColumn('first_letter', F.substring(F.col('word'), 1, 1))

words_starting_WITH_vowel = results_with_first_letter.select(F.col('word'), F.col('first_letter')).where(F.col('first_letter').isin(vowel)).groupby(F.col('first_letter')).count()
words_starting_WITH_vowel.show()

words_starting_NOT_WITH_vowel = results_with_first_letter.select(F.col('word'), F.col('first_letter')).where(~F.col('first_letter').isin(vowel)).groupby(F.col('first_letter')).count()
words_starting_NOT_WITH_vowel.show()

+------------+-----+
|first_letter|count|
+------------+-----+
|           a|14358|
|           i| 8570|
|           u| 1105|
|           e| 3524|
|           o| 7596|
+------------+-----+

+------------+-----+
|first_letter|count|
+------------+-----+
|           l| 3239|
|           p| 3427|
|           d| 4019|
|           j|  588|
|           s| 9073|
|           f| 4419|
|           h|10734|
|           y| 2203|
|           q|  259|
|           b| 6277|
|           g| 2018|
|           k|  696|
|           v|  918|
|           r| 2552|
|           m| 6886|
|           n| 3579|
|           c| 5104|
|           t|16844|
|           x|   62|
|           w| 9388|
+------------+-----+



In [177]:
# Dataset for all files inside the dataset folder 
DATASET_PATH = 's3://data-engg-suman/dataset/book-1.txt' 

book = spark.read.text(DATASET_PATH)
lines = book.select(F.split(book.value, ' ').alias('line'))
words = lines.select(F.explode(F.col('line')).alias('word'))
words_lower = words.select(F.lower(F.col("word")).alias("word_lower"))
words_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), '[a-z]*', 0).alias('word'))
words_nonull = words_clean.filter(F.col('word') != '')

groups = words_nonull.groupby(F.col('word'))
results = groups.count()

results.withColumn('first_letter_vowel', F.substring(F.col('word'), 1, 1).isin(vowel)).groupby(F.col('first_letter_vowel')).sum().show()


+------------------+----------+
|first_letter_vowel|sum(count)|
+------------------+----------+
|              true|     35153|
|             false|     92285|
+------------------+----------+

