In [None]:
%pyspark

from pyspark.sql import SparkSession
# Build (or reuse) the session named "spark-dataframe"; the bare name on the
# last line leaves the session object as the cell's final expression.
spark = (
    SparkSession.builder
    .appName("spark-dataframe")
    .getOrCreate()
)
spark

In [1]:
%pyspark

# Location of the Titanic training CSV.
filepath = "file:////home/ubuntu/working/spark-examples/data/titanic_train.csv"

# Read the CSV with the first row as the header and with schema inference on.
titanic_sdf = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(filepath)
)

In [2]:
%pyspark

# Print the first rows as a text table (show()'s default arguments written out).
titanic_sdf.show(n=20, truncate=True)

In [3]:
%pyspark

# Display the raw DataFrame through the notebook's `z` display helper
# (presumably the Zeppelin context provided by the %pyspark interpreter).
z.show(titanic_sdf)

In [4]:
%pyspark

# Register the raw DataFrame as the SQL view "titanic" so it can be queried
# with spark.sql. Later cells re-register other DataFrames under this same name.
titanic_sdf.createOrReplaceTempView("titanic")

In [5]:
%pyspark
# Sanity check: select every row and column of the "titanic" view and display it.
query = """
    SELECT * FROM titanic
"""
z.show(spark.sql(query))

### DataLake -> DataWarehouse
- Survived, Pclass, Sex, Age, Fare


In [7]:
%pyspark

# "Warehouse" step: keep only the five columns used by the rest of the
# notebook (Survived, Sex, Pclass, Fare, Age).
query = """
    SELECT
        t.Survived, t.Sex, t.Pclass, t.Fare, t.Age
    FROM titanic t 
"""

# titanic_wh is the projected DataFrame that the outlier-removal cell reads.
titanic_wh = spark.sql(query)
z.show(titanic_wh)

In [8]:
%pyspark
# Re-point the "titanic" view at the five-column projection; the view that
# was registered before this line is replaced.
titanic_wh.createOrReplaceTempView("titanic")

## DataWarehouse -> DataMart

In [None]:
## Fare 요금 이상치 제거

In [10]:
%pyspark
import pyspark.sql.functions as F

# Approximate first and third quartiles of Fare.
# approxQuantile accepts several probabilities in one call, so both quartiles
# come from a single pass over the data instead of two separate ones.
# The last argument (0.05) is the relative error allowed for the approximation.
q1, q3 = titanic_wh.approxQuantile("Fare", [0.25, 0.75], 0.05)

# Interquartile range.
iqr = q3 - q1

# Lower and upper fences for outlier removal: 1.5 * IQR beyond each quartile.
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Keep only rows whose Fare lies within the fences (both ends inclusive).
filtered_df = titanic_wh.filter((F.col("Fare") >= lower_bound) & (F.col("Fare") <= upper_bound))
filtered_df.show()

In [11]:
%pyspark

# Re-point the "titanic" view at the outlier-filtered DataFrame, replacing
# the view registered earlier under the same name.
filtered_df.createOrReplaceTempView("titanic")

In [12]:
%pyspark
# Display the full contents of the "titanic" view as currently registered.
query = """
    SELECT *
    FROM titanic
"""

z.show(spark.sql(query))

In [None]:
## 결측치 확인 후 제거

In [13]:
%pyspark


# Drop rows with missing values: keep only rows where Age, Survived, Sex and
# Pclass are all non-null.
query = """

    SELECT *
    FROM titanic
    WHERE Age is not null
      AND Survived is not null
      AND Sex is not null
      AND Pclass is not null
"""
titanic_result = spark.sql(query)
z.show(titanic_result)

In [None]:
%pyspark 

# Re-point the "titanic" view at the null-filtered result so the following
# SQL cells read the cleaned rows.
titanic_result.createOrReplaceTempView("titanic")

In [None]:
## 나이대 컬럼 추가

In [14]:
%pyspark

def age_grade(age):
    """Return the decade bucket an age falls into (for example 27 -> 20).

    Args:
        age: The age as a number, or None when the value is missing.

    Returns:
        The age divided by ten, truncated to an integer and multiplied by ten
        again, as an int. Returns None when ``age`` is None, so a missing
        input produces a missing output instead of raising TypeError.
    """
    # A missing age has no bucket; pass the missing value through unchanged.
    if age is None:
        return None
    return int(age / 10) * 10

In [15]:
%pyspark
from pyspark.sql.types import LongType

# Make the Python function callable from Spark SQL as age_grade(...), with a
# LongType result.
spark.udf.register(name='age_grade', f=age_grade, returnType=LongType())

In [16]:
%pyspark

# Add an Age_grade column by applying the age_grade SQL function to Age,
# keeping every existing column of the view.
query = """
    SELECT
        *, age_grade(Age) as Age_grade
    FROM titanic
"""
# Note: this rebinds titanic_result, which previously held the null-filtered rows.
titanic_result = spark.sql(query)
z.show(titanic_result)

In [17]:
%pyspark 
# DataMart: publish the final result (including the Age_grade column) as the
# "titanic" view; the EDA queries below read from this view.

titanic_result.createOrReplaceTempView("titanic")

# EDA 계획

- 생존한 남녀 수 비교 (count)
- 남녀의 생존 비율
- Pclass 별 Fare 
- Pclass 별 생존한 사람 수
- 나이대 컬럼 생성해 나이대 별 생존한 사람 수

In [19]:
%pyspark
# Total number of passengers by sex (count).
# The counting is done in SQL so the displayed table is the comparison itself
# rather than the raw rows of the view.

query = """
    SELECT Sex, COUNT(*) AS passenger_count
    FROM titanic
    GROUP BY Sex
    ORDER BY Sex
"""
z.show(spark.sql(query))

In [20]:
%pyspark
# Number of survivors by sex.
# Survived is 1 for a survivor and 0 otherwise, so summing it per group counts
# the survivors and still lists a group that has none.

query = """
    SELECT Sex, SUM(Survived) AS survivor_count
    FROM titanic
    GROUP BY Sex
    ORDER BY Sex
"""
z.show(spark.sql(query))

In [21]:
%pyspark
# Survival rate by sex.
# - AVG(Survived): survival is 1 and death is 0, so the average equals
#   survivors / all passengers = number of 1s / number of rows.
# The average is computed in SQL so the displayed table already holds the rate.

query = """
    SELECT Sex, AVG(Survived) AS survival_rate
    FROM titanic
    GROUP BY Sex
    ORDER BY Sex
"""
z.show(spark.sql(query))

In [22]:
%pyspark

# Fare by Pclass: project only these two columns for display.
# No aggregation is done in SQL here; the rows are shown as they are.

query = """
    SELECT t.Pclass, t.Fare
    FROM titanic t
"""
z.show(spark.sql(query))

In [23]:
%pyspark
# Display every row of the "titanic" view.
# NOTE(review): this cell states no purpose; presumably it backs one of the
# per-Pclass items in the EDA plan — confirm and label it.
query = """
    SELECT *
    FROM titanic 
"""
z.show(spark.sql(query))

In [24]:
%pyspark
# Display every row of the "titanic" view.
# NOTE(review): this cell runs the same query as the cell before it and states
# no purpose — confirm what it is meant to show, or remove it.
query = """
    SELECT *
    FROM titanic 
"""
z.show(spark.sql(query))

In [25]:
%pyspark
# Number of survivors per age group.
# Survived is 1 for a survivor and 0 otherwise, so SUM(Survived) per Age_grade
# is the survivor count; age groups with no survivors stay in the result as 0.
query = """
    SELECT Age_grade, SUM(Survived) AS survivor_count
    FROM titanic
    GROUP BY Age_grade
    ORDER BY Age_grade
"""
z.show(spark.sql(query))

In [26]:
%pyspark
# Shut the Spark session down at the end of the notebook. Cells that use
# `spark` will not work after this runs until a session is created again.
spark.stop()