In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.master("local").appName("learn_sql").getOrCreate()

In [3]:
stocks = [
    ('Google', 'GOOGL', 'USA', 2984, 'USD'),
    ('Netflix', 'NFLX', 'USA', 645, 'USD'),
    ('Amazon', 'AMZN', 'USA', 3518, 'USD'),
    ('Tesla', 'TSLA', 'USA', 1222, 'USD'),
    ('Tencent', '0700', 'Hong Kong', 483, 'HKD'),
    ('Toyota', '7203', 'Japan', 2006, 'JPY'),
    ('Samsung', '005930', 'Korea', 70600, 'KRW'),
    ('Kakao', '035720', 'Korea', 125000, 'KRW'),
]

In [4]:
stocks_schema = ["name", "ticker", "country", "price", "currency"]

In [5]:
df = spark.createDataFrame(stocks, schema=stocks_schema)

In [7]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- ticker: string (nullable = true)
 |-- country: string (nullable = true)
 |-- price: long (nullable = true)
 |-- currency: string (nullable = true)



In [11]:
df.show()

+-------+------+---------+------+--------+
|   name|ticker|  country| price|currency|
+-------+------+---------+------+--------+
| Google| GOOGL|      USA|  2984|     USD|
|Netflix|  NFLX|      USA|   645|     USD|
| Amazon|  AMZN|      USA|  3518|     USD|
|  Tesla|  TSLA|      USA|  1222|     USD|
|Tencent|  0700|Hong Kong|   483|     HKD|
| Toyota|  7203|    Japan|  2006|     JPY|
|Samsung|005930|    Korea| 70600|     KRW|
|  Kakao|035720|    Korea|125000|     KRW|
+-------+------+---------+------+--------+



In [12]:
# tempview 등록
df.createOrReplaceTempView("stocks")

In [13]:
spark.sql("""
SELECT
    name
FROM
    stocks
WHERE
    price >= 2000
""").show()

+-------+
|   name|
+-------+
| Google|
| Amazon|
| Toyota|
|Samsung|
|  Kakao|
+-------+



In [17]:
query = """
SELECT
    *
FROM
    stocks
WHERE
    country = "USA" and
    price >= 2000
"""

In [18]:
stocks_df = spark.sql(query)
stocks_df.show()

+------+------+-------+-----+--------+
|  name|ticker|country|price|currency|
+------+------+-------+-----+--------+
|Google| GOOGL|    USA| 2984|     USD|
|Amazon|  AMZN|    USA| 3518|     USD|
+------+------+-------+-----+--------+



In [59]:
earnings = [
    ('Google', 27.99, 'USD'),
    ('Netflix', 2.56, 'USD'),
    ('Amazon', 6.12, 'USD'),
    ('Tesla', 1.86, 'USD'),
    ('Tencent', 11.01, 'HKD'),
    ('Toyota', 224.82, 'JPY'),
    ('Samsung', 1780., 'KRW'),
    ('Kakao', 705., 'KRW'),
]

In [60]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType

In [61]:
earning_schema = StructType([
    StructField("name",StringType()),
    StructField("eps",FloatType()),
    StructField("currency",StringType())
])

In [62]:
earnings_df = spark.createDataFrame(earnings, earning_schema)

In [63]:
earnings_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- eps: float (nullable = true)
 |-- currency: string (nullable = true)



In [64]:
earnings_df.dtypes

[('name', 'string'), ('eps', 'float'), ('currency', 'string')]

In [65]:
earnings_df.show()

+-------+------+--------+
|   name|   eps|currency|
+-------+------+--------+
| Google| 27.99|     USD|
|Netflix|  2.56|     USD|
| Amazon|  6.12|     USD|
|  Tesla|  1.86|     USD|
|Tencent| 11.01|     HKD|
| Toyota|224.82|     JPY|
|Samsung|1780.0|     KRW|
|  Kakao| 705.0|     KRW|
+-------+------+--------+



In [66]:
earnings_df.createOrReplaceTempView("earnings")

In [67]:
spark.sql("select * from earnings").show()

+-------+------+--------+
|   name|   eps|currency|
+-------+------+--------+
| Google| 27.99|     USD|
|Netflix|  2.56|     USD|
| Amazon|  6.12|     USD|
|  Tesla|  1.86|     USD|
|Tencent| 11.01|     HKD|
| Toyota|224.82|     JPY|
|Samsung|1780.0|     KRW|
|  Kakao| 705.0|     KRW|
+-------+------+--------+



In [80]:
# 두 테이블 조인하기
query = """
SELECT
    s.name,
    ticker,
    country,
    price,
    eps,
    s.currency
    
FROM
    stocks s
JOIN
    earnings e
ON
    s.name = e.name
"""

In [81]:
comb = spark.sql(query)
comb.show()

+-------+------+---------+------+------+--------+
|   name|ticker|  country| price|   eps|currency|
+-------+------+---------+------+------+--------+
| Amazon|  AMZN|      USA|  3518|  6.12|     USD|
| Google| GOOGL|      USA|  2984| 27.99|     USD|
|  Kakao|035720|    Korea|125000| 705.0|     KRW|
|Netflix|  NFLX|      USA|   645|  2.56|     USD|
|Samsung|005930|    Korea| 70600|1780.0|     KRW|
|Tencent|  0700|Hong Kong|   483| 11.01|     HKD|
|  Tesla|  TSLA|      USA|  1222|  1.86|     USD|
| Toyota|  7203|    Japan|  2006|224.82|     JPY|
+-------+------+---------+------+------+--------+



In [82]:
query = """
SELECT
    s.name,
    (s.price / e.eps)
FROM
    stocks s
JOIN
    earnings e
ON
    s.name = e.name
"""

In [83]:
spark.sql(query).show()

+-------+------------------+
|   name|     (price / eps)|
+-------+------------------+
| Amazon| 574.8366120563447|
| Google| 106.6095042658442|
|  Kakao| 177.3049645390071|
|Netflix| 251.9531306315913|
|Samsung|39.662921348314605|
|Tencent| 43.86920889728746|
|  Tesla|  656.989242258975|
| Toyota| 8.922693419839167|
+-------+------------------+



In [84]:
comb.createOrReplaceTempView("comb")

In [88]:
# 테슬라 가격보다 비싼 미국 주식 가지고 오기

query = """
SELECT
    *
FROM
    comb
WHERE
    country = "USA" and
    price >= (SELECT price FROM comb WHERE name = "Tesla")
ORDER BY
    price
    
"""

spark.sql(query).show()

+------+------+-------+-----+-----+--------+
|  name|ticker|country|price|  eps|currency|
+------+------+-------+-----+-----+--------+
| Tesla|  TSLA|    USA| 1222| 1.86|     USD|
|Google| GOOGL|    USA| 2984|27.99|     USD|
|Amazon|  AMZN|    USA| 3518| 6.12|     USD|
+------+------+-------+-----+-----+--------+

