## RDD (Resilient Distributed Dataset)

In [1]:
import numpy as np
import pandas as pd
from pyspark import SparkConf, SparkContext
import pyspark.sql.functions as fn

# Built-in libraries
import collections
import itertools
import re
from typing import Union
import json

# Black formatter (optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Configurations: local master, application name "RDD_examples"
conf = SparkConf().setMaster("local").setAppName("RDD_examples")

# Spark Context
# getOrCreate() hands back the already-running context when this cell is
# executed again; SparkContext(conf=conf) would fail on a re-run with
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

23/07/05 10:24:26 WARN Utils: Your hostname, Chinedus-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.151 instead (on interface en0)
23/07/05 10:24:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/05 10:24:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Functions

### `.map()`

- Return a new RDD by applying a function to each element of this RDD.

```Python
fp = "some/data/filepath"
rdd = sc.textFile(fp)
# Return a new RDD by applying a function to each element of this RDD.
result = rdd.map(lambda row: row.split()[0]) # Select the 0-th field of each line
result = rdd.map(lambda row: row.split()[1]) # Select the 1-st field of each line
```
<br>

### `.countByValue()`

- Return the count of each unique value in this RDD as a dictionary of (value, count) pairs.
  
```Python
fp = "some/data/filepath"
rdd = sc.textFile(fp)
result = rdd.map(lambda row: row.split()[2]) # Select the field at index 2 of each line
# return the count of each unique value in this RDD as a dictionary of (value, count) pairs.
result.countByValue()  # returns {'3': 27145, '1': 6110, '2': 11370, '4': 34174, '5': 21201}
```
<br>

### `rdd.collect()`

- Return a list that contains all of the elements in this RDD.

<br>

### `.mapValues()`

- This transformation applies a function to the values of each key-value pair in an RDD. It also retains the original RDD's partitioning.

<br>


In [3]:
# Load the ratings file as an RDD holding one string per line
fp = "../data/ml-100k/u.data"
lines = sc.textFile(fp)

# Each line is: user id, movie id, rating, timestamp (whitespace separated),
# so the rating is the field at index 2.
ratings = lines.map(lambda line: line.split()[2])

# Tally how often each distinct rating value occurs
result = ratings.countByValue()

# Report the tallies ordered by rating
sorted_results = collections.OrderedDict(sorted(result.items()))
for rating, count in sorted_results.items():
    print(f"Rating: {rating}, Count: {count}")

[Stage 0:>                                                          (0 + 1) / 1]

Rating: 1, Count: 6110
Rating: 2, Count: 11370
Rating: 3, Count: 27145
Rating: 4, Count: 34174
Rating: 5, Count: 21201


                                                                                

## Spark DataFrame

In [4]:
# PySpark Modules
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as func
from pyspark.sql.types import (
    ArrayType,
    StringType,
    IntegerType,
    FloatType,
    StructType,
    StructField,
    LongType,
)

In [5]:
# Get the existing SparkSession or create a new one named "SparkSQL"
builder = SparkSession.builder.appName("SparkSQL")
spark = builder.getOrCreate()

In [9]:
# Load the ratings file through the session's SparkContext: one string per line
fp = "../data/ml-100k/u.data"
raw_file = spark.sparkContext.textFile(name=fp)


def movie_mapper(row: str) -> Row:
    """Parse one whitespace-delimited ratings line into a ``Row``.

    Fields are read by position: user id, movie id, rating, timestamp.
    The first three are converted to ``int``; the timestamp is kept as the
    string found in the line.
    """
    fields = row.split()
    user_id, movie_id, rating = (int(fields[idx]) for idx in range(3))
    return Row(
        user_id=user_id,
        movie_id=movie_id,
        rating=rating,
        timestamp=fields[3],
    )


# Run the parser over every line, giving an RDD of Row objects
rdd = raw_file.map(movie_mapper)

# Build a DataFrame from the Rows (column types are inferred) and cache it
schema_ = spark.createDataFrame(rdd)
schema_.cache()
print(f"Schema: {schema_}\n")

# Make the DataFrame queryable from SQL under the view name "movies"
schema_.createOrReplaceTempView("movies")


# Preview the first 10 rows
schema_.show(n=10)

Schema: DataFrame[user_id: bigint, movie_id: bigint, rating: bigint, timestamp: string]

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    196|     242|     3|881250949|
|    186|     302|     3|891717742|
|     22|     377|     1|878887116|
|    244|      51|     2|880606923|
|    166|     346|     1|886397596|
|    298|     474|     4|884182806|
|    115|     265|     2|881171488|
|    253|     465|     5|891628467|
|    305|     451|     3|886324817|
|      6|      86|     3|883603013|
+-------+--------+------+---------+
only showing top 10 rows



### SQL Queries

In [10]:
# Every row of the "movies" view whose rating is exactly 4
four_star_sql = "SELECT * FROM movies WHERE rating = 4;"
result = spark.sql(four_star_sql)
result.show()

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    298|     474|     4|884182806|
|    291|    1042|     4|874834944|
|    119|     392|     4|886176814|
|    167|     486|     4|892738452|
|    299|     144|     4|877881320|
|    308|       1|     4|887736532|
|     63|     277|     4|875747401|
|    301|      98|     4|882075827|
|    225|     193|     4|879539727|
|    290|      88|     4|880731963|
|    157|     274|     4|886890835|
|      7|      32|     4|891350932|
|     10|      16|     4|877888877|
|    284|     304|     4|885329322|
|    251|     100|     4|886271884|
|    260|     322|     4|890618898|
|     87|     384|     4|879877127|
|    292|     515|     4|881103977|
|    201|     219|     4|884112673|
|    246|     919|     4|884920949|
+-------+--------+------+---------+
only showing top 20 rows



In [17]:
# Number of votes per rating value, listed from the lowest rating upwards
votes_per_rating_sql = """
    SELECT rating, COUNT(*) AS num_votes 
        FROM movies 
    GROUP BY rating
    ORDER BY rating ASC;
"""
result = spark.sql(votes_per_rating_sql)
result.show()

+------+---------+
|rating|num_votes|
+------+---------+
|     1|     6110|
|     2|    11370|
|     3|    27145|
|     4|    34174|
|     5|    21201|
+------+---------+



In [None]:
fp = "../data/fakefriends-header.csv"
data = sc.textFile(fp)


def parse_age_friends(row: str) -> list:
    """Turn one CSV line into ``[(age, num_friends)]``.

    The age is read from column index 2 and the friend count from column
    index 3; both are converted to ``int`` so that they can be summed.
    A line whose columns are missing or are not integers yields ``[]`` and
    is therefore dropped by ``flatMap``.
    """
    fields = row.split(",")
    try:
        return [(int(fields[2]), int(fields[3]))]
    except (IndexError, ValueError):
        # NOTE(review): presumably this is what skips the header line (the
        # file name suggests there is one) — confirm against the data.
        return []


res = data.flatMap(parse_age_friends)  # age, num_friends

# Pair every friend count with a 1 so one reduce yields both the sum and the count
result = res.mapValues(lambda num_friends: (num_friends, 1))

# The reducer has to return the same (sum, count) shape it receives: its output
# is fed back in as an input whenever a key has more than two values, so
# returning a bare int fails with "'int' object is not subscriptable".
result.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])).collect()

In [None]:
# Count how many lines carry each distinct value of the field at index 1
# (the 'movie id' column)
res = lines.map(lambda line: line.split()[1])
res.countByValue()

In [None]:
# Preview a few elements only: collect() would pull every element of the RDD
# to the driver and print all of them into the notebook output.
res.take(10)