## RDD (Resilient Distributed Dataset)

In [1]:
import numpy as np
import pandas as pd
from pyspark import SparkConf, SparkContext
import pyspark.sql.functions as fn

# Built-in libraries
import collections
import itertools
import re
from typing import Union
import json

# Black formatter (optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Configurations
conf = SparkConf().setMaster("local").setAppName("RDD_examples")

# Spark Context
sc = SparkContext(conf=conf)

23/07/05 13:53:49 WARN Utils: Your hostname, Chinedus-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.151 instead (on interface en0)
23/07/05 13:53:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/05 13:53:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Functions

### `.map()`

- Return a new RDD by applying a function to each element of this RDD.

```Python
fp = "some/data/filepath"
rdd = sc.textFile(fp)
# Return a new RDD by applying a function to each element of this RDD.
result = rdd.map(lambda row: row[0]) # Select the 0-th index
result = rdd.map(lambda row: row[1]) # Select the 1-st index
```
<br>

### `.countByValue()`

- Return the count of each unique value in this RDD as a dictionary of (value, count) pairs.
  
```Python
fp = "some/data/filepath"
rdd = sc.textFile(fp)
result = rdd.map(lambda row: row[0]) # Select the 0-th index
# return the count of each unique value in this RDD as a dictionary of (value, count) pairs.
result.countByValue()  # returns {'3': 27145, '1': 6110, '2': 11370, '4': 34174, '5': 21201}
```
<br>

### `rdd.collect()`

- Return a list that contains all of the elements in this RDD.

<br>

### `.mapValues()`

- This transformation applies a function to the values of each key-value pair in an RDD. It also retains the original RDD's partitioning.

<br>


In [3]:
# Read the file and return it as an RDD of strings
fp = "../data/ml-100k/u.data"
lines = sc.textFile(fp)

# Extract the ratings which is the 3rd field (2nd index)
# columns: ['user id', 'movie id', 'rating', 'timestamp']
ratings = lines.map(lambda x: x.split()[2])

# Groupby the keys. i.e. (count the values of each rating)
result = ratings.countByValue()

sorted_results = collections.OrderedDict(sorted(result.items()))
for key, value in sorted_results.items():
    print(f"Rating: {key}, Count: {value}")

[Stage 0:>                                                          (0 + 1) / 1]

Rating: 1, Count: 6110
Rating: 2, Count: 11370
Rating: 3, Count: 27145
Rating: 4, Count: 34174
Rating: 5, Count: 21201


                                                                                

## Spark DataFrame

In [4]:
# PySpark Modules
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as func
from pyspark.sql.types import (
    ArrayType,
    StringType,
    IntegerType,
    FloatType,
    StructType,
    StructField,
    LongType,
)

In [5]:
# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

### Load The Data As An RDD

In [6]:
# Read the file and return it as an RDD of strings
fp = "../data/ml-100k/u.data"
raw_file = spark.sparkContext.textFile(fp)


def movie_mapper(row: str) -> Row:
    """this returns a Row object."""
    data = row.split()
    # ['user id', 'movie id', 'rating', 'timestamp']
    result = Row(
        user_id=int(data[0]),
        movie_id=int(data[1]),
        rating=int(data[2]),
        timestamp=data[3],
    )
    return result


# Map the RDD
rdd = raw_file.map(movie_mapper)

# Convert the RDD to DataFrame and infer schema
df = spark.createDataFrame(rdd).cache()
print(f"Schema: {df}\n")

# Register the DataFrame as a table
df.createOrReplaceTempView("movies")


df.show(10)

Schema: DataFrame[user_id: bigint, movie_id: bigint, rating: bigint, timestamp: string]

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    196|     242|     3|881250949|
|    186|     302|     3|891717742|
|     22|     377|     1|878887116|
|    244|      51|     2|880606923|
|    166|     346|     1|886397596|
|    298|     474|     4|884182806|
|    115|     265|     2|881171488|
|    253|     465|     5|891628467|
|    305|     451|     3|886324817|
|      6|      86|     3|883603013|
+-------+--------+------+---------+
only showing top 10 rows



                                                                                

### SQL Queries

In [7]:
# Select all the movies with a rating of 4
result = spark.sql("SELECT * FROM movies WHERE rating = 4;")
result.show()

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    298|     474|     4|884182806|
|    291|    1042|     4|874834944|
|    119|     392|     4|886176814|
|    167|     486|     4|892738452|
|    299|     144|     4|877881320|
|    308|       1|     4|887736532|
|     63|     277|     4|875747401|
|    301|      98|     4|882075827|
|    225|     193|     4|879539727|
|    290|      88|     4|880731963|
|    157|     274|     4|886890835|
|      7|      32|     4|891350932|
|     10|      16|     4|877888877|
|    284|     304|     4|885329322|
|    251|     100|     4|886271884|
|    260|     322|     4|890618898|
|     87|     384|     4|879877127|
|    292|     515|     4|881103977|
|    201|     219|     4|884112673|
|    246|     919|     4|884920949|
+-------+--------+------+---------+
only showing top 20 rows



In [8]:
result = spark.sql(
    """
    SELECT rating, COUNT(*) AS num_votes 
        FROM movies 
    GROUP BY rating
    ORDER BY rating ASC;
"""
)
result.show()

+------+---------+
|rating|num_votes|
+------+---------+
|     1|     6110|
|     2|    11370|
|     3|    27145|
|     4|    34174|
|     5|    21201|
+------+---------+



### Using Functions

In [9]:
# Select all the movies with a rating of 4
df.filter((df["rating"] == 4)).show()

# OR
# df.filter(func.col("rating") == 4).show()

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    298|     474|     4|884182806|
|    291|    1042|     4|874834944|
|    119|     392|     4|886176814|
|    167|     486|     4|892738452|
|    299|     144|     4|877881320|
|    308|       1|     4|887736532|
|     63|     277|     4|875747401|
|    301|      98|     4|882075827|
|    225|     193|     4|879539727|
|    290|      88|     4|880731963|
|    157|     274|     4|886890835|
|      7|      32|     4|891350932|
|     10|      16|     4|877888877|
|    284|     304|     4|885329322|
|    251|     100|     4|886271884|
|    260|     322|     4|890618898|
|     87|     384|     4|879877127|
|    292|     515|     4|881103977|
|    201|     219|     4|884112673|
|    246|     919|     4|884920949|
+-------+--------+------+---------+
only showing top 20 rows



In [10]:
# Returns a DF
df.select("user_id")

# Returns a column
df["user_id"]

Column<'user_id'>

In [11]:
def friends_mapper(row: str) -> Row:
    """This is used to map the data and it returns a Row object."""
    _data = row.split(",")
    result = Row(
        user_id=(_data[0]),
        name=_data[1],
        age=(_data[2]),
        num_friends=(_data[3]),
    )
    return result

In [12]:
# Load the data as an RDD
fp = "../data/fakefriends-header.csv"
data = spark.sparkContext.textFile(fp)

# Map the RDD
rdd = data.map(friends_mapper)
# Load as DF and infer schema
friends_df = spark.createDataFrame(rdd).cache()
friends_df.show()

+-------+--------+---+-----------+
|user_id|    name|age|num_friends|
+-------+--------+---+-----------+
| userID|    name|age|    friends|
|      0|    Will| 33|        385|
|      1|Jean-Luc| 26|          2|
|      2|    Hugh| 55|        221|
|      3|  Deanna| 40|        465|
|      4|   Quark| 68|         21|
|      5|  Weyoun| 59|        318|
|      6|  Gowron| 37|        220|
|      7|    Will| 54|        307|
|      8|  Jadzia| 38|        380|
|      9|    Hugh| 27|        181|
|     10|     Odo| 53|        191|
|     11|     Ben| 57|        372|
|     12|   Keiko| 54|        253|
|     13|Jean-Luc| 56|        444|
|     14|    Hugh| 43|         49|
|     15|     Rom| 36|         49|
|     16|  Weyoun| 22|        323|
|     17|     Odo| 35|         13|
|     18|Jean-Luc| 45|        455|
+-------+--------+---+-----------+
only showing top 20 rows



In [13]:
# friends_df.groupBy(func.col("age"))

# Register the DataFrame as a table
friends_df.createOrReplaceTempView("friends_df")

result = spark.sql(
    """
    SELECT age, ROUND(AVG(num_friends), 2) as avg_num_friends
        FROM friends_df
    GROUP BY age
    ORDER BY avg_num_friends DESC;

"""
)

result.show(5)

+---+---------------+
|age|avg_num_friends|
+---+---------------+
| 63|          384.0|
| 21|         350.88|
| 18|         343.38|
| 52|         340.64|
| 33|         325.33|
+---+---------------+
only showing top 5 rows



In [14]:
# Load the data as a Spark DataFrame
fp = "../data/fakefriends-header.csv"
friends_df = spark.read.option("header", "true").option("InferSchema", "true").csv(fp)
friends_df.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- friends: integer (nullable = true)



In [15]:
friends_df.groupBy(["age"]).agg(
    func.round(func.avg("friends"), 2).alias("avg_num_friends")
).sort("avg_num_friends", ascending=False).show(5)

+---+---------------+
|age|avg_num_friends|
+---+---------------+
| 63|          384.0|
| 21|         350.88|
| 18|         343.38|
| 52|         340.64|
| 33|         325.33|
+---+---------------+
only showing top 5 rows



In [16]:
fp = "../data/titanic_data.csv"
titanic_df = (
    spark.read.option("header", "true")
    .option("inferSchema", "true")
    .format("csv")
    .load(fp)
)
titanic_df.printSchema()

root
 |-- pclass: integer (nullable = true)
 |-- survived: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: double (nullable = true)
 |-- sibsp: integer (nullable = true)
 |-- parch: integer (nullable = true)
 |-- ticket: string (nullable = true)
 |-- fare: double (nullable = true)
 |-- cabin: string (nullable = true)
 |-- embarked: string (nullable = true)
 |-- boat: string (nullable = true)
 |-- body: integer (nullable = true)
 |-- home.dest: string (nullable = true)



In [17]:
titanic_df.show(5)

+------+--------+--------------------+------+------+-----+-----+------+--------+-----+--------+----+----+--------------------+
|pclass|survived|                name|   sex|   age|sibsp|parch|ticket|    fare|cabin|embarked|boat|body|           home.dest|
+------+--------+--------------------+------+------+-----+-----+------+--------+-----+--------+----+----+--------------------+
|     1|       1|Allen, Miss. Elis...|female|  29.0|    0|    0| 24160|211.3375|   B5|       S|   2|null|        St Louis, MO|
|     1|       1|Allison, Master. ...|  male|0.9167|    1|    2|113781|  151.55|  C22|       S|  11|null|Montreal, PQ / Ch...|
|     1|       0|Allison, Miss. He...|female|   2.0|    1|    2|113781|  151.55|  C22|       S|null|null|Montreal, PQ / Ch...|
|     1|       0|Allison, Mr. Huds...|  male|  30.0|    1|    2|113781|  151.55|  C22|       S|null| 135|Montreal, PQ / Ch...|
|     1|       0|Allison, Mrs. Hud...|female|  25.0|    1|    2|113781|  151.55|  C22|       S|null|null|Montre

### Functions

#### `func.explode()`

- Returns a new row for each element in the given array or map.

<br>



## User Defined Functions (UDF)

In [18]:
@func.udf(returnType=ArrayType(StringType()))
def normalize_text(text: str) -> str:
    """Docs!"""
    pattern = re.compile(r"\W+")  # Select characters that are not words
    # Split the text using the pattern and convert to lowercase
    return pattern.split(text.lower())

In [19]:
# Count the number of characters
words_flattened = titanic_df.select(
    func.explode(normalize_text("name")).alias("normalized_words"), "name"
)

words_flattened.show(5)

[Stage 21:>                                                         (0 + 1) / 1]

+----------------+--------------------+
|normalized_words|                name|
+----------------+--------------------+
|           allen|Allen, Miss. Elis...|
|            miss|Allen, Miss. Elis...|
|       elisabeth|Allen, Miss. Elis...|
|          walton|Allen, Miss. Elis...|
|         allison|Allison, Master. ...|
+----------------+--------------------+
only showing top 5 rows



                                                                                

In [20]:
words_flattened.groupBy(["normalized_words"]).agg(
    func.count("normalized_words").alias("frequency")
).sort("frequency", ascending=False).show(5)

+----------------+---------+
|normalized_words|frequency|
+----------------+---------+
|              mr|      763|
|                |      263|
|            miss|      260|
|             mrs|      201|
|         william|       87|
+----------------+---------+
only showing top 5 rows



### Without UDFs

In [21]:
res = titanic_df.select(
    func.explode(func.split(func.lower(func.col("name")), pattern=r"\W+")).alias(
        "normalized_words"
    ),
    "name",
)

res.show(5)

+----------------+--------------------+
|normalized_words|                name|
+----------------+--------------------+
|           allen|Allen, Miss. Elis...|
|            miss|Allen, Miss. Elis...|
|       elisabeth|Allen, Miss. Elis...|
|          walton|Allen, Miss. Elis...|
|         allison|Allison, Master. ...|
+----------------+--------------------+
only showing top 5 rows



In [22]:
res.groupBy(["normalized_words"]).agg(
    func.count("normalized_words").alias("frequency")
).sort("frequency", ascending=False).show(5)

+----------------+---------+
|normalized_words|frequency|
+----------------+---------+
|              mr|      763|
|                |      263|
|            miss|      260|
|             mrs|      201|
|         william|       87|
+----------------+---------+
only showing top 5 rows



In [23]:
titanic_df.show(5)

+------+--------+--------------------+------+------+-----+-----+------+--------+-----+--------+----+----+--------------------+
|pclass|survived|                name|   sex|   age|sibsp|parch|ticket|    fare|cabin|embarked|boat|body|           home.dest|
+------+--------+--------------------+------+------+-----+-----+------+--------+-----+--------+----+----+--------------------+
|     1|       1|Allen, Miss. Elis...|female|  29.0|    0|    0| 24160|211.3375|   B5|       S|   2|null|        St Louis, MO|
|     1|       1|Allison, Master. ...|  male|0.9167|    1|    2|113781|  151.55|  C22|       S|  11|null|Montreal, PQ / Ch...|
|     1|       0|Allison, Miss. He...|female|   2.0|    1|    2|113781|  151.55|  C22|       S|null|null|Montreal, PQ / Ch...|
|     1|       0|Allison, Mr. Huds...|  male|  30.0|    1|    2|113781|  151.55|  C22|       S|null| 135|Montreal, PQ / Ch...|
|     1|       0|Allison, Mrs. Hud...|female|  25.0|    1|    2|113781|  151.55|  C22|       S|null|null|Montre

In [24]:
# Select the youngest passenger
min_temp = titanic_df.select(func.min("age")).first()[0]

titanic_df.filter(func.col("age") == min_temp).show(5)

+------+--------+--------------------+------+------+-----+-----+---------+------+-----+--------+----+----+--------------------+
|pclass|survived|                name|   sex|   age|sibsp|parch|   ticket|  fare|cabin|embarked|boat|body|           home.dest|
+------+--------+--------------------+------+------+-----+-----+---------+------+-----+--------+----+----+--------------------+
|     3|       1|Dean, Miss. Eliza...|female|0.1667|    1|    2|C.A. 2315|20.575| null|       S|  10|null|Devon, England Wi...|
+------+--------+--------------------+------+------+-----+-----+---------+------+-----+--------+----+----+--------------------+



In [25]:
# Add a new column
titanic_df.withColumn("LOWER", func.lower(func.col("name"))).select(
    ["name", "LOWER"]
).show(5)

+--------------------+--------------------+
|                name|               LOWER|
+--------------------+--------------------+
|Allen, Miss. Elis...|allen, miss. elis...|
|Allison, Master. ...|allison, master. ...|
|Allison, Miss. He...|allison, miss. he...|
|Allison, Mr. Huds...|allison, mr. huds...|
|Allison, Mrs. Hud...|allison, mrs. hud...|
+--------------------+--------------------+
only showing top 5 rows

