### Reading parquet data with an inferred schema

In [6]:
from pyspark.sql import SparkSession  # Entry point for Spark SQL / DataFrame work

# Build (or reuse) the SparkSession used by every cell in this notebook.
# The whole builder chain is wrapped in parentheses so each step can sit on
# its own line; the closing parenthesis must stay outside any comment.
spark = (
    SparkSession.builder
    .appName("read-parquet-data")             # application name
    .master("spark://spark-master:7077")      # Spark master URL
    .config("spark.executor.memory", "512m")  # memory setting for executors
    .getOrCreate()                            # create a session or return the existing one
)

# Only surface ERROR-level log messages.
spark.sparkContext.setLogLevel("ERROR")

In [8]:
# Read the parquet file into a DataFrame. No schema is supplied here, so the
# reader takes it from the parquet data itself.
df = (
    spark.read.format("parquet")       # read using the Parquet format
    .load("../data/recipes.parquet")   # path of the file to load
)

                                                                                

In [9]:
df.printSchema()  # Print the DataFrame's schema structure

root
 |-- RecipeId: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- AuthorId: integer (nullable = true)
 |-- AuthorName: string (nullable = true)
 |-- CookTime: string (nullable = true)
 |-- PrepTime: string (nullable = true)
 |-- TotalTime: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- RecipeCategory: string (nullable = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientQuantities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientParts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- AggregatedRating: double (nullable = true)
 |-- ReviewCount: integer (nullable = true)
 |-- Calories: double (nullable = true)
 |-- FatContent: double (nullable = true)
 |-- SaturatedFatContent: double (nullable = true)
 |-- CholesterolContent: double (nullable = true)
 |-- SodiumContent: double (nullable = true)
 |-- Carbohydr

In [10]:
# Print the contents of the DataFrame
df.show()  # Display the DataFrame's rows

# Alternatively

# df.show(50)  # Print the first 50 rows
# df.show(10, truncate=False)  # Print the first 10 rows without truncation

                                                                                

+--------+--------------------+----------+------------+--------+--------+---------+--------------------+--------------+--------------------+--------------------------+---------------------+----------------+-----------+--------+----------+-------------------+------------------+-------------+-------------------+------------+------------+--------------+--------------+-----------+--------------------+--------------------+-------------+
|RecipeId|                Name|  AuthorId|  AuthorName|CookTime|PrepTime|TotalTime|         Description|RecipeCategory|            Keywords|RecipeIngredientQuantities|RecipeIngredientParts|AggregatedRating|ReviewCount|Calories|FatContent|SaturatedFatContent|CholesterolContent|SodiumContent|CarbohydrateContent|FiberContent|SugarContent|ProteinContent|RecipeServings|RecipeYield|  RecipeInstructions|              Images|DatePublished|
+--------+--------------------+----------+------------+--------+--------+---------+--------------------+--------------+---------

### Reading partitioned data

In [11]:
# Read the whole partitioned dataset by pointing the reader at its root
# directory rather than at a single file.
df_partitioned = (
    spark.read.format("parquet")          # read using the Parquet format
    .load("../data/partitioned_recipes")  # root directory of the partitioned data
)

                                                                                

In [12]:
df_partitioned.printSchema()  # Print the DataFrame's schema structure

root
 |-- RecipeId: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- AuthorId: integer (nullable = true)
 |-- AuthorName: string (nullable = true)
 |-- CookTime: string (nullable = true)
 |-- PrepTime: string (nullable = true)
 |-- TotalTime: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- RecipeCategory: string (nullable = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientQuantities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientParts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- AggregatedRating: double (nullable = true)
 |-- ReviewCount: integer (nullable = true)
 |-- Calories: double (nullable = true)
 |-- FatContent: double (nullable = true)
 |-- SaturatedFatContent: double (nullable = true)
 |-- CholesterolContent: double (nullable = true)
 |-- SodiumContent: double (nullable = true)
 |-- Carbohydr

In [13]:
# Read only the partition directories whose name matches the glob
# DatePublished=2020-01*. This rebinds df_partitioned, replacing the
# DataFrame loaded from the dataset root.
# NOTE(review): when partition directories are passed directly (here via a
# glob), Spark's partition discovery presumably will not add DatePublished as
# a column unless .option("basePath", "../data/partitioned_recipes") is set —
# confirm whether that column is needed downstream.
df_partitioned = (
    spark.read.format("parquet")  # read using the Parquet format
    .load("../data/partitioned_recipes/DatePublished=2020-01*")  # glob over partition dirs
)

In [14]:
df_partitioned.printSchema()  # Print the DataFrame's schema structure

root
 |-- RecipeId: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- AuthorId: integer (nullable = true)
 |-- AuthorName: string (nullable = true)
 |-- CookTime: string (nullable = true)
 |-- PrepTime: string (nullable = true)
 |-- TotalTime: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- RecipeCategory: string (nullable = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientQuantities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientParts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- AggregatedRating: double (nullable = true)
 |-- ReviewCount: integer (nullable = true)
 |-- Calories: double (nullable = true)
 |-- FatContent: double (nullable = true)
 |-- SaturatedFatContent: double (nullable = true)
 |-- CholesterolContent: double (nullable = true)
 |-- SodiumContent: double (nullable = true)
 |-- Carbohydr

### Schema Merging

In [15]:
# Read the partitioned dataset with the Parquet reader's "mergeSchema" option
# switched on, so the resulting schema is merged across the files read.
df_merged_schema = (
    spark.read.format("parquet")          # read using the Parquet format
    .option("mergeSchema", "true")        # enable schema merging
    .load("../data/partitioned_recipes")  # root directory of the partitioned data
)

                                                                                

In [16]:
df_merged_schema.printSchema()  # Print the DataFrame's schema structure

root
 |-- RecipeId: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- AuthorId: integer (nullable = true)
 |-- AuthorName: string (nullable = true)
 |-- CookTime: string (nullable = true)
 |-- PrepTime: string (nullable = true)
 |-- TotalTime: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- RecipeCategory: string (nullable = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientQuantities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientParts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- AggregatedRating: double (nullable = true)
 |-- ReviewCount: integer (nullable = true)
 |-- Calories: double (nullable = true)
 |-- FatContent: double (nullable = true)
 |-- SaturatedFatContent: double (nullable = true)
 |-- CholesterolContent: double (nullable = true)
 |-- SodiumContent: double (nullable = true)
 |-- Carbohydr

In [17]:
# Stop the Spark Session
spark.stop()  # Shut down the Spark session to clean up its resources