In [1]:
# NOTE(review): `spark` is never created in this notebook; presumably the
# notebook environment injects a ready-made SparkSession - confirm before
# running this notebook elsewhere.
spark

### `Step 1` : Dataset

In [2]:
# S3 location of the input text file; the output further down shows it is
# the Project Gutenberg edition of "Pride and Prejudice".
data_set = 's3://fcc-spark-example/dataset/gutenberg_books/1342-0.txt'

### `Step 2` : Loading the data 

In [5]:
# Read the text file into a DataFrame: every line of the file becomes one
# row of a single string column named `value`.
book_df = spark.read.text(data_set)

In [6]:
# With no arguments, show() prints the first 20 rows and truncates long values.
book_df.show()

                                                                                

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
|almost no restric...|
|re-use it under t...|
|with this eBook o...|
|                    |
|                    |
|Title: Pride and ...|
|                    |
| Author: Jane Austen|
|                    |
|Posting Date: Aug...|
|Release Date: Jun...|
|Last Updated: Mar...|
|                    |
|   Language: English|
|                    |
|Character set enc...|
|                    |
+--------------------+
only showing top 20 rows



In [7]:
# Print each column's name, type and nullability as a tree.
book_df.printSchema()

root
 |-- value: string (nullable = true)



In [8]:
# `dtypes` exposes the same schema as a list of (column name, type) tuples.
print(book_df.dtypes)

[('value', 'string')]


### Display the data 

In [10]:
# Show only 10 rows and cut each value off at 50 characters.
book_df.show(10, truncate=50)

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------------------------------------+
|                                             value|
+--------------------------------------------------+
|The Project Gutenberg EBook of Pride and Prejud...|
|                                                  |
|This eBook is for the use of anyone anywhere at...|
|almost no restrictions whatsoever.  You may cop...|
|re-use it under the terms of the Project Gutenb...|
|    with this eBook or online at www.gutenberg.org|
|                                                  |
|                                                  |
|                        Title: Pride and Prejudice|
|                                                  |
+--------------------------------------------------+
only showing top 10 rows



                                                                                

In [11]:
# Show 10 rows with the full, untruncated values.
book_df.show(10, truncate=False)

+--------------------------------------------------------------------+
|value                                                               |
+--------------------------------------------------------------------+
|The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen  |
|                                                                    |
|This eBook is for the use of anyone anywhere at no cost and with    |
|almost no restrictions whatsoever.  You may copy it, give it away or|
|re-use it under the terms of the Project Gutenberg License included |
|with this eBook or online at www.gutenberg.org                      |
|                                                                    |
|                                                                    |
|Title: Pride and Prejudice                                          |
|                                                                    |
+--------------------------------------------------------------------+
only showing top 10 rows

### `Step 3` : Moving from a sentence to a list of words

In [12]:
import pyspark.sql.functions as F

In [38]:
# Split every line of text on single spaces, giving an array-of-words
# column that is named `line` through alias().
lines = book_df.select(
    F.split(F.col('value'), ' ').alias('line')
)

In [39]:
# Each row now holds an array of words; blank lines of the book appear as [].
lines.show()

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title:, Pride, a...|
|                  []|
|[Author:, Jane, A...|
|                  []|
|[Posting, Date:, ...|
|[Release, Date:, ...|
|[Last, Updated:, ...|
|                  []|
|[Language:, English]|
|                  []|
|[Character, set, ...|
|                  []|
+--------------------+
only showing top 20 rows



#### Different ways to select the column 

Using any of these:

    - book_df.select("value")

    - book_df.select(F.col("value"))

    - book_df.select(book_df.value)

    - book_df.select(book_df["value"])


In [27]:
# Select the column by passing its name as a plain string.
book_df.select('value').show(2)

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
+--------------------+
only showing top 2 rows



#### Giving a name to a column 

Using any of these: 

    - alias() 
    - withColumnRenamed()

In [24]:
# Without alias(), Spark derives the column name from the expression itself
# (see the header of the output below).
book_df.select(F.split(F.col('value'), ' ')).show(1)

[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+
| split(value,  , -1)|
+--------------------+
|[The, Project, Gu...|
+--------------------+
only showing top 1 row



                                                                                

In [26]:
# Rename the auto-generated column after the fact; the existing name passed
# to withColumnRenamed() has to match the generated name exactly.
book_df.select(F.split(F.col('value'), ' ')).withColumnRenamed('split(value,  , -1)', 'line').show()

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title:, Pride, a...|
|                  []|
|[Author:, Jane, A...|
|                  []|
|[Posting, Date:, ...|
|[Release, Date:, ...|
|[Last, Updated:, ...|
|                  []|
|[Language:, English]|
|                  []|
|[Character, set, ...|
|                  []|
+--------------------+
only showing top 20 rows



In [30]:
# If the column to rename does not exist, withColumnRenamed() is a no-op:
# no error is raised and the DataFrame keeps its original column name,
# as the header of the output below shows.

book_df.select(F.split(F.col('value'), ' ')).withColumnRenamed('col_foo_bar', 'line').show()

+--------------------+
| split(value,  , -1)|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title:, Pride, a...|
|                  []|
|[Author:, Jane, A...|
|                  []|
|[Posting, Date:, ...|
|[Release, Date:, ...|
|[Last, Updated:, ...|
|                  []|
|[Language:, English]|
|                  []|
|[Character, set, ...|
|                  []|
+--------------------+
only showing top 20 rows



### `Step 4` : Exploding a list into rows

In [40]:
# explode() turns every element of the `line` array into its own row; the
# resulting column gets the default name `col` (see the output below).
words = lines.select(F.explode(F.col('line')))

In [41]:
# One word per row; the column still carries the default name `col`.
words.show()

+----------+
|       col|
+----------+
|       The|
|   Project|
| Gutenberg|
|     EBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



In [42]:
# Replace the default `col` name with the more descriptive `word`.
words = words.withColumnRenamed('col', 'word')

In [43]:
# Same rows as before, now under the `word` column name.
words.show()

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     EBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



#### Explode (a list to row)

<img src="../img/img1.png" alt="img1">


### `Step 5` : Cleaning the data

- Converting all words to lower case

In [51]:
# Lower-case every word so that e.g. 'The' and 'the' end up in the same
# group when the words are counted later on.
words_lower = words.select(F.lower(F.col('word')).alias('word_lower'))

In [52]:
# Check the lower-cased words; punctuation (e.g. 'prejudice,') is still present.
words_lower.show()

[Stage 20:>                                                         (0 + 1) / 1]

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
|prejudice,|
|        by|
|      jane|
|    austen|
|          |
|      this|
|     ebook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



                                                                                

- Remove any punctuation and other non-useful characters

- If you are interested in building your own regular expressions, RegExr (https://regexr.com/) is a handy tool for writing and testing them

In [56]:
# Keep only the first run of the letters a-z found in each word (group 0 is
# the whole match). This drops punctuation such as the trailing comma in
# 'prejudice,'; a value containing no letters at all becomes an empty string.
word_clean = words_lower.select(F.regexp_extract(F.col('word_lower'), "[a-z]+", 0).alias('word'))

In [57]:
# Punctuation is gone, but empty-string rows are still present.
word_clean.show()

[Stage 22:>                                                         (0 + 1) / 1]

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|         |
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
+---------+
only showing top 20 rows



                                                                                

### `Step 5` (continued) : Filtering rows

In [58]:
# Drop the rows whose word is an empty string. Note that, despite the
# variable name, the comparison removes '' values rather than NULLs.
word_with_no_null = word_clean.filter(F.col('word') != '')

In [59]:
# The empty-string rows no longer appear in the output.
word_with_no_null.show()

[Stage 23:>                                                         (0 + 1) / 1]

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows



                                                                                

### `Step 6` : Let's finally count the words

In [65]:
# Group identical words together, count the rows in each group and
# display the (unordered) result.
(
    word_with_no_null
    .groupBy(F.col('word'))
    .count()
    .show()
)

[Stage 27:>                                                         (0 + 1) / 1]

+-----------+-----+
|       word|count|
+-----------+-----+
|     online|    4|
|       hope|  122|
|    solaced|    1|
|    elevate|    1|
|  solemnity|    5|
|     spared|    8|
| reanimated|    1|
|    courted|    2|
|ingratitude|    1|
|      parts|    5|
|    highest|    9|
|      hurry|   11|
|      oddly|    1|
|   laughing|   10|
|   positive|    3|
|     speedy|    2|
|   slightly|    4|
|      scorn|    1|
|      staff|    1|
|    explain|    6|
+-----------+-----+
only showing top 20 rows



                                                                                

<img src="../img/img2.png" alt="img2">


In [66]:
# Keep the word counts in a DataFrame (columns `word` and `count`) so they
# can be ordered and saved in the following steps.
results = (
    word_with_no_null
    .groupBy(F.col('word'))
    .count()
)

In [67]:
# show() is the action that makes Spark actually run the queued grouping and counting.
results.show()

+-----------+-----+
|       word|count|
+-----------+-----+
|     online|    4|
|       hope|  122|
|    solaced|    1|
|    elevate|    1|
|  solemnity|    5|
|     spared|    8|
| reanimated|    1|
|    courted|    2|
|ingratitude|    1|
|      parts|    5|
|    highest|    9|
|      hurry|   11|
|      oddly|    1|
|   laughing|   10|
|   positive|    3|
|     speedy|    2|
|   slightly|    4|
|      scorn|    1|
|      staff|    1|
|    explain|    6|
+-----------+-----+
only showing top 20 rows



We learned that PySpark distributes the data across multiple nodes. When performing a
grouping function, such as `groupBy()`, each worker performs the work on its assigned
data. `groupBy()` and the `count()` applied to the grouped data are **transformations**, so PySpark will queue them **lazily**
until we request an action such as `show()`.

<img src="../img/img3.png" alt="img3">

#### Ordering the result based on `Count`

In [74]:
# Most frequent words first, using the F.desc() function to request descending order.
results.orderBy(F.desc(F.col('count'))).show()

[Stage 39:>                                                         (0 + 1) / 1]

+----+-----+
|word|count|
+----+-----+
| the| 4496|
|  to| 4235|
|  of| 3719|
| and| 3602|
| her| 2223|
|   i| 2052|
|   a| 1997|
|  in| 1920|
| was| 1844|
| she| 1703|
|that| 1582|
|  it| 1542|
| not| 1447|
| you| 1426|
|  he| 1334|
| his| 1267|
|  be| 1259|
|  as| 1191|
| had| 1176|
|with| 1099|
+----+-----+
only showing top 20 rows



                                                                                

In [75]:
# Same ordering as the previous cell, written with the Column.desc() method.
results.orderBy(F.col('count').desc()).show()

+----+-----+
|word|count|
+----+-----+
| the| 4496|
|  to| 4235|
|  of| 3719|
| and| 3602|
| her| 2223|
|   i| 2052|
|   a| 1997|
|  in| 1920|
| was| 1844|
| she| 1703|
|that| 1582|
|  it| 1542|
| not| 1447|
| you| 1426|
|  he| 1334|
| his| 1267|
|  be| 1259|
|  as| 1191|
| had| 1176|
|with| 1099|
+----+-----+
only showing top 20 rows



[Stage 42:>                                                         (0 + 1) / 1]                                                                                

### `Step 7` : Save this data on S3

In [78]:
output_location = 's3://fcc-spark-example/output/word_count/'

# No format is given, so Spark writes with its default data source.
# mode('overwrite') keeps this cell re-runnable: with the default save mode
# the write raises an error as soon as the output path already exists.
results.write \
        .mode('overwrite') \
        .save(output_location)

                                                                                

<img src="../img/img4.png" alt="img4">

In [81]:
# Write the word counts as CSV with a header row, replacing whatever is
# already stored at the output location.

(
    results.write
    .format('csv')
    .option('header', True)
    .mode('overwrite')
    .save(output_location)
)

<img src="../img/img5.png" alt="img5">

### Method Chaining

In [88]:
data_set = 's3://fcc-spark-example/dataset/gutenberg_books/1342-0.txt'

# The complete word-count pipeline from the previous steps, written as a
# single chained expression.
result = (
    spark.read.text(data_set)
    # one array of words per line of text
    .select(F.split(F.col('value'), ' ').alias('line'))
    # one row per word
    .select(F.explode(F.col('line')).alias('word'))
    # normalise the case
    .select(F.lower(F.col('word')).alias('word_lower'))
    # keep the first run of letters only
    .select(F.regexp_extract(F.col('word_lower'), "[a-z]+", 0).alias('word'))
    # drop the empty strings
    .filter(F.col('word') != '')
    # count the occurrences of each word
    .groupBy(F.col('word'))
    .count()
)

In [89]:
# The chained pipeline yields the same counts as the step-by-step version above.
result.show()

[Stage 57:>                                                         (0 + 1) / 1]

+-----------+-----+
|       word|count|
+-----------+-----+
|     online|    4|
|       hope|  122|
|    solaced|    1|
|    elevate|    1|
|  solemnity|    5|
|     spared|    8|
| reanimated|    1|
|    courted|    2|
|ingratitude|    1|
|      parts|    5|
|    highest|    9|
|      hurry|   11|
|      oddly|    1|
|   laughing|   10|
|   positive|    3|
|     speedy|    2|
|   slightly|    4|
|      scorn|    1|
|      staff|    1|
|    explain|    6|
+-----------+-----+
only showing top 20 rows



                                                                                

<img src="../img/img6.png" alt="img6">

## Let's now `scale up` our word frequency program

<img src="../img/img7.png" alt="img7">

In [91]:
# The glob pattern makes Spark read every .txt file in the folder; the rest
# of the pipeline is identical to the single-book version.
data_set = 's3://fcc-spark-example/dataset/gutenberg_books/*.txt'

result_all_books = (
    spark.read.text(data_set)
    # one array of words per line of text
    .select(F.split(F.col('value'), ' ').alias('line'))
    # one row per word
    .select(F.explode(F.col('line')).alias('word'))
    # normalise the case
    .select(F.lower(F.col('word')).alias('word_lower'))
    # keep the first run of letters only
    .select(F.regexp_extract(F.col('word_lower'), "[a-z]+", 0).alias('word'))
    # drop the empty strings
    .filter(F.col('word') != '')
    # count the occurrences of each word
    .groupBy(F.col('word'))
    .count()
)

In [96]:
# Yet another way to sort in descending order: orderBy()'s `ascending=False` keyword.
result_all_books.orderBy(F.col('count'), ascending=False).show()



+----+-----+
|word|count|
+----+-----+
| the|39188|
| and|24292|
|  of|21234|
|  to|20581|
|   i|15153|
|   a|14564|
|  in|12857|
|that| 9900|
|  it| 9451|
| was| 8939|
| her| 7951|
|  my| 7594|
| you| 6722|
| his| 6676|
|  he| 6604|
|with| 6595|
|  as| 6502|
| had| 5727|
| she| 5699|
| for| 5497|
+----+-----+
only showing top 20 rows



                                                                                

## Saving all the processed data in S3

In [93]:
output_location = 's3://fcc-spark-example/output/word_count_all_books/'

# No format is given, so Spark writes with its default data source.
# mode('overwrite') keeps this cell re-runnable: with the default save mode
# the write raises an error as soon as the output path already exists.
result_all_books.write \
                .mode('overwrite') \
                .save(output_location)

                                                                                