In [1]:
from pyspark.sql import SparkSession  # Spark SQL 작업을 위한 SparkSession 임포트
from pyspark.sql.functions import  # Spark SQL 함수들 임포트 array_contains, col, explode

spark = (SparkSession.builder  # SparkSession 빌더 패턴 시작
         .appName("filter-data")  # 애플리케이션 이름 설정
         .master("spark://spark-master:7077")  # Spark 마스터 URL 설정
         .config("spark.executor.memory", "512m")  # Spark 설정 옵션
         .getOrCreate()  # SparkSession 생성 또는 기존 세션 반환)

spark.sparkContext.setLogLevel("ERROR")  # 로그 레벨을 ERROR로 설정

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/04 16:28:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
df = (spark.read.format("csv")  # CSV 형식으로 데이터 읽기
      .option("header", "true")  # 첫 번째 행을 헤더로 사용
      .option("nullValue", "null")
      .option("dateFormat",  # 날짜 형식 지정 "LLLL d, y")
      .load(  # 파일 로드"../data/netflix_titles.csv"))

                                                                                

In [3]:
filtered_df = df.filter(  # 데이터 필터링col("release_year") > 2020)
filtered_df.show()  # DataFrame 내용 출력

+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s2|TV Show|       Blood & Water|                null|Ama Qamata, Khosi...|        South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|     Julien Leclercq|Sami Bouajila, Tr...|                null|September 24, 2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...|To protect his fa...|
|     s4|TV Show|Jailbirds New Orl.

In [4]:
filtered_df = (
    df.filter(  # 데이터 필터링
        (col(  # 컬럼 참조"country") == "United States")
        & (col(  # 컬럼 참조"release_year") > 2020)))

filtered_df.show()  # DataFrame 내용 출력

[Stage 2:>                                                          (0 + 1) / 1]

+-------+-------+--------------------+--------------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|    s10|  Movie|        The Starling|      Theodore Melfi|Melissa McCarthy,...|United States|September 24, 2021|        2021| PG-13|  104 min|    Comedies, Dramas|A woman adjusting...|
|    s16|TV Show|   Dear White People|                null|Logan Browning, B...|United States|September 22, 2021|        2021| TV-MA|4 Seasons|TV Comedies, TV D...|"Students of colo...|
|    s41|TV Show|He-Man and the Ma...|                null|Yuri Lowent

                                                                                

In [5]:
filtered_df = (
    df.filter(  # 데이터 필터링
        col(  # 컬럼 참조"country")
        .isin(["United States", "United Kingdom",  "India"])))
filtered_df.show(3)

+-------+-------+--------------------+---------------+--------------------+--------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|       country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+--------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null| United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s5|TV Show|        Kota Factory|           null|Mayur More, Jiten...|         India|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|In a city of coac...|
|     s9|TV Show|The Great British...|Andy Devonshire|Mel Giedroyc, Sue...|United Kingdom|

### Filtering on string

In [6]:
# filter the DataFrame based on a substring match
filtered_df = df.filter(  # 데이터 필터링col("listed_in").like("%Crime%"))

# display the filtered DataFrame
filtered_df.show()  # DataFrame 내용 출력

+-------+-------+--------------------+--------------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s3|TV Show|           Ganglands|     Julien Leclercq|Sami Bouajila, Tr...|         null|September 24, 2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...|To protect his fa...|
|    s11|TV Show|Vendetta: Truth, ...|                null|                null|         null|September 24, 2021|        2021| TV-MA| 1 Season|Crime TV Shows, D...|"Sicily boasts a ...|
|    s12|TV Show|    Bangkok Breaking|   Kongkiat Komesiri|Sukollawat 

In [7]:
# filter the DataFrame based on a regular expression match
filtered_df = df.filter(  # 데이터 필터링col("listed_in").rlike("(Crime|Thrillers)"))

# display the filtered DataFrame
filtered_df.show()  # DataFrame 내용 출력

+-------+-------+--------------------+-----------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|         director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+-----------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s3|TV Show|           Ganglands|  Julien Leclercq|Sami Bouajila, Tr...|                null|September 24, 2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...|To protect his fa...|
|    s11|TV Show|Vendetta: Truth, ...|             null|                null|                null|September 24, 2021|        2021| TV-MA| 1 Season|Crime TV Shows, D...|"Sicily boasts a ...|
|    s12|TV Show|    Bangkok Breaking|Kongkiat Kom

### Filtering on Data Ranges

In [8]:
# filter the DataFrame based on a date range
filtered_df = df.filter(  # 데이터 필터링(col("date_added") >= "2021-09-05") & (col("date_added") <= "2021-09-01"))

# display the filtered DataFrame
filtered_df.show()  # DataFrame 내용 출력

+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
|show_id|type|title|director|cast|country|date_added|release_year|rating|duration|listed_in|description|
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+



In [9]:
# filter the DataFrame based on a date range
filtered_df = df.filter(  # 데이터 필터링(col("date_added").between("2021-02-01","2021-03-01")))

# display the filtered DataFrame
filtered_df.show()  # DataFrame 내용 출력

+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
|show_id|type|title|director|cast|country|date_added|release_year|rating|duration|listed_in|description|
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+



### Filter on Arrays

In [10]:
# 읽기 parquet file into a DataFrame
df_recipes = (spark.read
      .format("parquet")
      .load(  # 파일 로드"../data/recipes.parquet"))

# filter the DataFrame based on a value in the array column
filtered_df = df_recipes.filter(  # 데이터 필터링array_contains(col("RecipeIngredientParts"), "apple"))

# display the filtered DataFrame
filtered_df.show()  # DataFrame 내용 출력


[Stage 9:>                                                          (0 + 1) / 1]

+--------+--------------------+----------+-----------------+--------+--------+---------+--------------------+--------------+--------------------+--------------------------+---------------------+----------------+-----------+--------+----------+-------------------+------------------+-------------+-------------------+------------+------------+--------------+--------------+-----------+--------------------+--------------------+-------------+
|RecipeId|                Name|  AuthorId|       AuthorName|CookTime|PrepTime|TotalTime|         Description|RecipeCategory|            Keywords|RecipeIngredientQuantities|RecipeIngredientParts|AggregatedRating|ReviewCount|Calories|FatContent|SaturatedFatContent|CholesterolContent|SodiumContent|CarbohydrateContent|FiberContent|SugarContent|ProteinContent|RecipeServings|RecipeYield|  RecipeInstructions|              Images|DatePublished|
+--------+--------------------+----------+-----------------+--------+--------+---------+--------------------+---------

                                                                                

### Filtering on map columns

In [11]:
# 읽기 JSON file into a DataFrame
df_nobel_prizes = (spark.read
      .format("json")
      .option("multiLine", "true")  # 여러 줄 JSON 처리
      .load(  # 파일 로드"../data/nobel_prizes.json"))

df_nobel_prizes_exploded = (
    df_nobel_prizes
    .withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"laureates",explode(col("laureates"))) # Explode the laureates array column into rows
    .select(  # 컬럼 선택col("category")
            , col(  # 컬럼 참조"year")
            , col(  # 컬럼 참조"overallMotivation")
            , col(  # 컬럼 참조"laureates"))) # Use dot notion for columns in the STRUCT field

filtered_df = (
    df_nobel_prizes_exploded
    .filter(  # 데이터 필터링
        (col(  # 컬럼 참조"laureates").getItem("firstname") == "Albert") 
        & (col(  # 컬럼 참조"laureates").getItem("surname") == "Einstein")))

filtered_df.show(truncate=False)

+--------+----+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------+
|category|year|overallMotivation|laureates                                                                                                                                    |
+--------+----+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------+
|physics |1921|null             |{Albert, 26, "for his services to Theoretical Physics, and especially for his discovery of the law of the photoelectric effect", 1, Einstein}|
+--------+----+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------+



In [12]:
spark.stop()  # Spark 세션 종료 - 리소스 정리