### Reading JSON data with an inferred schema

In [1]:
from pyspark.sql import SparkSession  # entry point for Spark SQL / DataFrame work
# Wildcard import: later cells call col, explode, array_distinct, map_keys,
# map_values and explode_outer without a module prefix.
from pyspark.sql.functions import *

# Build the session used by every cell below (or reuse one that already exists).
spark = (SparkSession.builder
         .appName("nested-dataframe")  # application name
         .master("spark://spark-master:7077")  # Spark master URL
         .config("spark.executor.memory", "512m")  # executor memory setting
         .getOrCreate())  # create a session or return the existing one

spark.sparkContext.setLogLevel("ERROR")  # set the log level to ERROR

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/19 11:52:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Load the dataset and let Spark infer the (nested) schema from the file.
df = (spark.read.format("json")  # read the source as JSON
      .option("multiLine", "true")  # accept JSON that spans multiple lines
      .load("../data/Stanford Question Answering Dataset.json"))

                                                                                

In [8]:
df.printSchema()  # print the DataFrame's schema tree

root
 |-- paragraphs: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- context: string (nullable = true)
 |    |    |-- qas: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- answers: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- answer_start: long (nullable = true)
 |    |    |    |    |    |    |-- text: string (nullable = true)
 |    |    |    |    |-- id: string (nullable = true)
 |    |    |    |    |-- question: string (nullable = true)
 |-- title: string (nullable = true)



In [9]:
# Flatten two levels of nesting: first one row per paragraph, then one row per
# question ("qas" entry) inside each paragraph.
df_exploded = (
    df.select(
        "title",
        explode("paragraphs").alias("paragraphs"))  # array -> one row per element
    .select(
        "title",
        col("paragraphs.context").alias("context"),
        explode(col("paragraphs.qas")).alias("questions")))  # array -> one row per element

df_exploded.show()  # display the resulting rows

[Stage 2:>                                                          (0 + 1) / 1]

+-------------+--------------------+--------------------+
|        title|             context|           questions|
+-------------+--------------------+--------------------+
|Super_Bowl_50|Super Bowl 50 was...|{[{177, Denver Br...|
|Super_Bowl_50|Super Bowl 50 was...|{[{249, Carolina ...|
|Super_Bowl_50|Super Bowl 50 was...|{[{403, Santa Cla...|
|Super_Bowl_50|Super Bowl 50 was...|{[{177, Denver Br...|
|Super_Bowl_50|Super Bowl 50 was...|{[{488, gold}, {4...|
|Super_Bowl_50|Super Bowl 50 was...|{[{487, "golden a...|
|Super_Bowl_50|Super Bowl 50 was...|{[{334, February ...|
|Super_Bowl_50|Super Bowl 50 was...|{[{133, American ...|
|Super_Bowl_50|Super Bowl 50 was...|{[{487, "golden a...|
|Super_Bowl_50|Super Bowl 50 was...|{[{133, American ...|
|Super_Bowl_50|Super Bowl 50 was...|{[{334, February ...|
|Super_Bowl_50|Super Bowl 50 was...|{[{177, Denver Br...|
|Super_Bowl_50|Super Bowl 50 was...|{[{355, Levi's St...|
|Super_Bowl_50|Super Bowl 50 was...|{[{403, Santa Cla...|
|Super_Bowl_50

                                                                                

In [12]:
# Pull the question fields out of the "questions" struct and de-duplicate the
# answers array of each question.
df_array_distinct = (
    df_exploded.select(
        "title",
        "context",
        col("questions.id").alias("question_id"),
        col("questions.question").alias("question_text"),
        array_distinct("questions.answers").alias("answers")))  # drop repeated answers

df_array_distinct.show()  # display the resulting rows

+-------------+--------------------+--------------------+--------------------+--------------------+
|        title|             context|         question_id|       question_text|             answers|
+-------------+--------------------+--------------------+--------------------+--------------------+
|Super_Bowl_50|Super Bowl 50 was...|56be4db0acb800140...|Which NFL team re...|[{177, Denver Bro...|
|Super_Bowl_50|Super Bowl 50 was...|56be4db0acb800140...|Which NFL team re...|[{249, Carolina P...|
|Super_Bowl_50|Super Bowl 50 was...|56be4db0acb800140...|Where did Super B...|[{403, Santa Clar...|
|Super_Bowl_50|Super Bowl 50 was...|56be4db0acb800140...|Which NFL team wo...|[{177, Denver Bro...|
|Super_Bowl_50|Super Bowl 50 was...|56be4db0acb800140...|What color was us...|[{488, gold}, {52...|
|Super_Bowl_50|Super Bowl 50 was...|56be8e613aeaaa140...|What was the them...|[{487, "golden an...|
|Super_Bowl_50|Super Bowl 50 was...|56be8e613aeaaa140...|What day was the ...|[{334, February 7...|


In [13]:
# Show the text of the first answer for each question; without an alias the
# column is named after the expression.
(df_array_distinct
 .select(
     "title",
     "context",
     "question_text",
     col("answers").getItem(0).getField("text"))  # first array element -> its "text" field
 .show())

+-------------+--------------------+--------------------+--------------------+
|        title|             context|       question_text|     answers[0].text|
+-------------+--------------------+--------------------+--------------------+
|Super_Bowl_50|Super Bowl 50 was...|Which NFL team re...|      Denver Broncos|
|Super_Bowl_50|Super Bowl 50 was...|Which NFL team re...|   Carolina Panthers|
|Super_Bowl_50|Super Bowl 50 was...|Where did Super B...|Santa Clara, Cali...|
|Super_Bowl_50|Super Bowl 50 was...|Which NFL team wo...|      Denver Broncos|
|Super_Bowl_50|Super Bowl 50 was...|What color was us...|                gold|
|Super_Bowl_50|Super Bowl 50 was...|What was the them...|"golden anniversary"|
|Super_Bowl_50|Super Bowl 50 was...|What day was the ...|    February 7, 2016|
|Super_Bowl_50|Super Bowl 50 was...|What is the AFC s...|American Football...|
|Super_Bowl_50|Super Bowl 50 was...|What was the them...|"golden anniversary"|
|Super_Bowl_50|Super Bowl 50 was...|What does AFC st

### Large number of rows with explode 

In [14]:
# Same projection as before, but the extracted field gets a readable column name.
(df_array_distinct
 .select(
     "title",
     "context",
     "question_text",
     col("answers").getItem(0).getField("text").alias('answer'))  # first answer's text
 .show())

+-------------+--------------------+--------------------+--------------------+
|        title|             context|       question_text|              answer|
+-------------+--------------------+--------------------+--------------------+
|Super_Bowl_50|Super Bowl 50 was...|Which NFL team re...|      Denver Broncos|
|Super_Bowl_50|Super Bowl 50 was...|Which NFL team re...|   Carolina Panthers|
|Super_Bowl_50|Super Bowl 50 was...|Where did Super B...|Santa Clara, Cali...|
|Super_Bowl_50|Super Bowl 50 was...|Which NFL team wo...|      Denver Broncos|
|Super_Bowl_50|Super Bowl 50 was...|What color was us...|                gold|
|Super_Bowl_50|Super Bowl 50 was...|What was the them...|"golden anniversary"|
|Super_Bowl_50|Super Bowl 50 was...|What day was the ...|    February 7, 2016|
|Super_Bowl_50|Super Bowl 50 was...|What is the AFC s...|American Football...|
|Super_Bowl_50|Super Bowl 50 was...|What was the them...|"golden anniversary"|
|Super_Bowl_50|Super Bowl 50 was...|What does AFC st

### Nested data with null values 

In [15]:
# Keep only the rows whose first answer has a non-null "text" field.
(df_array_distinct
 .filter(col("answers").getItem(0).getField("text").isNotNull())
 .show())

+-------------+--------------------+--------------------+--------------------+--------------------+
|        title|             context|         question_id|       question_text|             answers|
+-------------+--------------------+--------------------+--------------------+--------------------+
|Super_Bowl_50|Super Bowl 50 was...|56be4db0acb800140...|Which NFL team re...|[{177, Denver Bro...|
|Super_Bowl_50|Super Bowl 50 was...|56be4db0acb800140...|Which NFL team re...|[{249, Carolina P...|
|Super_Bowl_50|Super Bowl 50 was...|56be4db0acb800140...|Where did Super B...|[{403, Santa Clar...|
|Super_Bowl_50|Super Bowl 50 was...|56be4db0acb800140...|Which NFL team wo...|[{177, Denver Bro...|
|Super_Bowl_50|Super Bowl 50 was...|56be4db0acb800140...|What color was us...|[{488, gold}, {52...|
|Super_Bowl_50|Super Bowl 50 was...|56be8e613aeaaa140...|What was the them...|[{487, "golden an...|
|Super_Bowl_50|Super Bowl 50 was...|56be8e613aeaaa140...|What day was the ...|[{334, February 7...|


                                                                                

### `array_contains()`

In [16]:
from pyspark.sql.functions import array_contains

# Three rows, each holding a single array-of-strings column named "fruits".
df = spark.createDataFrame(
    [(["apple", "orange", "banana"],),
     (["grape", "kiwi", "melon"],),
     (["pear", "apple", "pineapple"],)],
    ["fruits"])

# Add a boolean column telling whether the array contains "apple".
(df.select(
    "fruits",
    array_contains("fruits", "apple").alias("contains_apple"))
 .show(truncate=False))


[Stage 10:>                                                         (0 + 1) / 1]

+------------------------+--------------+
|fruits                  |contains_apple|
+------------------------+--------------+
|[apple, orange, banana] |true          |
|[grape, kiwi, melon]    |false         |
|[pear, apple, pineapple]|true          |
+------------------------+--------------+



                                                                                

### `map_keys()` and `map_values()`

In [17]:
# One record per user; every record carries a single dict-valued field, "user_info".
data = [
    {"user_info": {"name": name, "age": age, "email": email}}
    for name, age, email in (
        ("Alice", 28, "alice@example.com"),
        ("Bob", 35, "bob@example.com"),
        ("Charlie", 42, "charlie@example.com"),
    )
]

df = spark.createDataFrame(data)
df.show(truncate=False)  # print full cell contents instead of truncating them


+----------------------------------------------------------+
|user_info                                                 |
+----------------------------------------------------------+
|{name -> Alice, email -> alice@example.com, age -> 28}    |
|{name -> Bob, email -> bob@example.com, age -> 35}        |
|{name -> Charlie, email -> charlie@example.com, age -> 42}|
+----------------------------------------------------------+



In [19]:
# Split the "user_info" map into an array of its keys and an array of its values.
(df
 .select(
     "user_info",
     map_keys("user_info").alias("user_info_keys"),
     map_values("user_info").alias("user_info_values"))
 .show(truncate=False))

+----------------------------------------------------------+------------------+----------------------------------+
|user_info                                                 |user_info_keys    |user_info_values                  |
+----------------------------------------------------------+------------------+----------------------------------+
|{name -> Alice, email -> alice@example.com, age -> 28}    |[name, email, age]|[Alice, alice@example.com, 28]    |
|{name -> Bob, email -> bob@example.com, age -> 35}        |[name, email, age]|[Bob, bob@example.com, 35]        |
|{name -> Charlie, email -> charlie@example.com, age -> 42}|[name, email, age]|[Charlie, charlie@example.com, 42]|
+----------------------------------------------------------+------------------+----------------------------------+



### `explode_outer()`

In [7]:
# The last record has no array at all (None) to show how explode_outer treats it.
data = [
    {"words": ["hello", "world"]},
    {"words": ["foo", "bar", "baz"]},
    {"words": None}
]

df = spark.createDataFrame(data)

# One output row per array element.
(df.select(explode_outer("words").alias("word"))
 .show(truncate=False))

+-----+
|word |
+-----+
|hello|
|world|
|foo  |
|bar  |
|baz  |
|null |
+-----+



### `posexplode()`

In [8]:
# Same sample data as the explode_outer example, including a record with no array.
data = [
    {"words": ["hello", "world"]},
    {"words": ["foo", "bar", "baz"]},
    {"words": None}
]

df = spark.createDataFrame(data)

# posexplode yields two columns per element: its position in the array and its value.
# The SQL expression must be passed as a clean string; a comment inside the
# literal would be sent to Spark's SQL parser as part of the expression.
df.selectExpr("posexplode(words) as (pos, word)").show(truncate=False)

+---+-----+
|pos|word |
+---+-----+
|0  |hello|
|1  |world|
|0  |foo  |
|1  |bar  |
|2  |baz  |
+---+-----+



In [20]:
spark.stop()  # stop the Spark session and clean up its resources