In [1]:
# Initialise findspark before the pyspark imports in the following cells.
# NOTE(review): findspark is expected to locate the local Spark installation
# and put pyspark on sys.path — confirm SPARK_HOME is set in this environment.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# Entry point for the DataFrame and SQL APIs; getOrCreate() reuses an existing
# session when one is already running, so re-running this cell is safe.
spark = SparkSession.builder.getOrCreate()

In [3]:
from pyspark import SparkContext
# Handle for the low-level RDD API (textFile / parallelize are used below);
# getOrCreate() reuses the already-running context when there is one.
sc = SparkContext.getOrCreate()

In [4]:
sc

In [5]:
spark

In [6]:
# Read the CSV with default reader options: no header row and no type
# inference, so columns come back as _c0, _c1, ... and every value is a string.
df = spark.read.format("csv").load("sample/ages.csv")

In [7]:
df.dtypes

[('_c0', 'string'), ('_c1', 'string')]

## 데이터 프레임으로 자동으로 만들기
- 파일을 읽을 때 스키마(컬럼 이름과 타입)를 자동으로 추론해서 데이터 프레임을 만든다

In [8]:
# Load the JSON file straight into a DataFrame; column names and types are
# inferred from the records.
df = spark.read.json("sample/people.json")

In [9]:
df.dtypes

[('age', 'bigint'), ('name', 'string')]

## RDD로 읽어온 다음에 데이터 프레임으로 생성

In [10]:
# Load the file as an RDD of plain strings: each line of the file becomes one
# element (in this sample, one JSON document per line).
rdd = sc.textFile("sample/people.json")

In [11]:
rdd.collect()

['{"name":"Michael"}',
 '{"name":"Andy", "age":30}',
 '{"name":"Justin", "age":19}']

In [12]:
# Build a DataFrame from the RDD of JSON strings; the schema is inferred from
# the parsed documents.
df = spark.read.json(rdd)

In [13]:
df.dtypes

[('age', 'bigint'), ('name', 'string')]

In [14]:
# Three hand-written JSON documents. Each string is one complete record, and
# the whitespace inside the literals is part of the string.
katie_json = """
    {
        "id":"123",
        "name": "Katie",
        "age": 19,
        "eyecolor":"brown"
    }
    """
michael_json = """{
        "id":"234",
        "name": "Michael",
        "age": 22,
        "eyecolor":"green"
        }
    """
simone_json = """{
        "id":"345",
        "name": "Simone",
        "age": 23,
        "eyecolor":"blue"
        }
    """

# Distribute the documents as an RDD with one element per JSON string.
rdd = sc.parallelize((katie_json, michael_json, simone_json))

In [15]:
rdd.collect()

['\n    {\n        "id":"123",\n        "name": "Katie",\n        "age": 19,\n        "eyecolor":"brown"\n    }\n    ',
 '{\n        "id":"234",\n        "name": "Michael",\n        "age": 22,\n        "eyecolor":"green"\n        }\n    ',
 '{\n        "id":"345",\n        "name": "Simone",\n        "age": 23,\n        "eyecolor":"blue"\n        }\n    ']

In [16]:
# Each RDD element is parsed as one JSON document, so the three multi-line
# strings become three rows with an inferred schema.
df = spark.read.json(rdd)

In [17]:
df

DataFrame[age: bigint, eyecolor: string, id: string, name: string]

## 임시 뷰 생성 (SQL에서 view 테이블을 만드는 것과 유사)

In [18]:
# Register df as the temporary view "test" so it can be queried via spark.sql.
df.createOrReplaceTempView("test")

## 데이터 프레임 데이터 가져오기
- 함수 df.show()
- sql 문

In [19]:
df.show()  # The DataFrame has a schema, so rows are printed as a table with column headers.

+---+--------+---+-------+
|age|eyecolor| id|   name|
+---+--------+---+-------+
| 19|   brown|123|  Katie|
| 22|   green|234|Michael|
| 23|    blue|345| Simone|
+---+--------+---+-------+



In [20]:
spark.sql("select * from test").collect()  # Fetches every row of the temp view "test" as a list of Row objects.

[Row(age=19, eyecolor='brown', id='123', name='Katie'),
 Row(age=22, eyecolor='green', id='234', name='Michael'),
 Row(age=23, eyecolor='blue', id='345', name='Simone')]

## Schema 확인

In [21]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- eyecolor: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



## Schema 생성

In [22]:
# Plain Python tuples with no column names or types attached yet; the
# positional order is (id, name, age, eye colour).
people_rows = [
    (123, "Katie", 19, "brown"),
    (234, "Michael", 22, "green"),
    (345, "Simone", 23, "blue"),
]
rdd = sc.parallelize(people_rows)

In [23]:
# Import only the type classes needed to declare the schema below. A wildcard
# import would pull every name from pyspark.sql.types into the notebook
# namespace and hide where StructType / StructField / LongType / StringType
# come from.
from pyspark.sql.types import LongType, StringType, StructField, StructType

In [24]:
# Explicit schema for the tuple RDD: one nullable field per tuple position,
# declared in the same order as the tuple elements.
schema = (
    StructType()
    .add("id", LongType(), True)
    .add("name", StringType(), True)
    .add("age", LongType(), True)
    .add("eyeColor", StringType(), True)
)

In [25]:
# Apply the explicit schema to the RDD of tuples instead of letting Spark
# infer names and types; fields line up with the tuple positions in order.
df = spark.createDataFrame(rdd, schema=schema)

In [26]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)



In [27]:
# Register the schema-backed DataFrame as the temporary view "test2".
df.createOrReplaceTempView("test2")

In [28]:
spark.sql("select * from test2")

DataFrame[id: bigint, name: string, age: bigint, eyeColor: string]

In [29]:
spark.sql("select * from test2").collect()

[Row(id=123, name='Katie', age=19, eyeColor='brown'),
 Row(id=234, name='Michael', age=22, eyeColor='green'),
 Row(id=345, name='Simone', age=23, eyeColor='blue')]

In [30]:
df.show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [31]:
spark.sql("select * from test2").show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [32]:
df.count()  # Returns the number of rows in the DataFrame.

3

In [34]:
# SQL counterpart of df.count(). `df` is registered as the temp view "test2",
# so count that view rather than the older "test" view, which was built from a
# different DataFrame and only happens to hold the same three people.
spark.sql("select count(*) from test2").show()

+--------+
|count(1)|
+--------+
|       3|
+--------+



***

In [36]:
df.select("id", "age").show()

+---+---+
| id|age|
+---+---+
|123| 19|
|234| 22|
|345| 23|
+---+---+



In [37]:
# SQL counterpart of df.select("id", "age"). Query "test2" (the view backed by
# `df`) so both forms read the same data; in the older "test" view `id` is a
# string column rather than a long.
spark.sql("select id, age from test2").show()

+---+---+
| id|age|
+---+---+
|123| 19|
|234| 22|
|345| 23|
+---+---+



***

## 같은 조회·필터링을 여러 방법으로 표현할 수 있다 (문자열 조건식, SQL 문, 컬럼 객체)

In [43]:
df.select("id", "age").filter("age=22").show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [44]:
# SQL counterpart of the DataFrame filter on age. Query "test2" (the view
# backed by `df`) so the SQL and DataFrame-API versions run against the same
# data instead of the older "test" view.
spark.sql("select id, age from test2 where age=22").show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [45]:
df.select(df.id, df.age).filter(df.age==22).show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



***

## 예제 (항공편 지연 분석)

In [46]:
# Relative paths to the sample datasets used in the flight-delay example.
flightpath = "sample/departuredelays.csv"  # read below as comma-separated with a header row
airportpath = "sample/airport-codes-na.txt"  # read below as tab-separated with a header row

In [48]:
# Only header=True is set, so every column (including delay and distance) is
# loaded as a string — see the printSchema() output below.
# NOTE(review): the later sum over delay therefore relies on Spark's implicit
# string-to-number cast, which is why the totals print as 159086.0. Consider an
# explicit schema if integer typing matters; blanket inferSchema would
# presumably also turn `date` into a number and drop its leading zero — confirm.
flight = spark.read.csv(flightpath, header=True)

In [54]:
# Tab-separated file with a header row; inferSchema=True asks Spark to detect
# column types (all four columns turn out to be strings for this file).
airport = spark.read.csv(airportpath, header=True, inferSchema=True, sep="\t")

In [59]:
flight.take(1)

[Row(date='01011245', delay='6', distance='602', origin='ABE', destination='ATL')]

In [57]:
airport.take(1)

[Row(City='Abbotsford', State='BC', Country='Canada', IATA='YXX')]

In [60]:
airport.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- IATA: string (nullable = true)



In [61]:
flight.printSchema()

root
 |-- date: string (nullable = true)
 |-- delay: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [62]:
# Expose both DataFrames to Spark SQL under the names used by the query below.
flight.createOrReplaceTempView("flight")
airport.createOrReplaceTempView("airport")

## 공항 위치, 그에 따른 공항 코드, 지연 시간의 합
- 워싱턴 주(State = "WA")에서 출발하는 항공편의 지연 시간 합계

In [65]:
# Total departure delay per origin airport in Washington state: join flights
# to airports on the IATA code, keep rows whose airport State is "WA", sum the
# delay per (City, origin) and list the largest totals first.
# f.delay is a string column, so sum() works through an implicit numeric cast
# and the totals are displayed as doubles.
spark.sql(
    """
        select a.City, f.origin, sum(f.delay) as delay
            from flight f 
                join airport a
                    on a.IATA = f.origin
        where a.State = "WA"
        group by a.City, f.origin 
        order by delay desc
    """
).show()

+-------+------+--------+
|   City|origin|   delay|
+-------+------+--------+
|Seattle|   SEA|159086.0|
|Spokane|   GEG| 12404.0|
|  Pasco|   PSC|   949.0|
+-------+------+--------+



In [77]:
# F.sum is used below, so the functions module must be in scope before this
# cell runs. Importing it here keeps the cell working on a fresh kernel
# (Restart & Run All) instead of depending on an import executed later.
from pyspark.sql import functions as F

# DataFrame-API equivalent of the SQL query above: join airports to flights on
# the IATA code, keep Washington-state ("WA") airports, sum the delay per
# (City, origin) and sort by the total, largest first.
(
    airport.join(flight, airport.IATA == flight.origin)
    .where(airport.State == "WA")
    .select(airport.City, flight.origin, flight.delay)
    .groupBy(airport.City, flight.origin)
    .agg(F.sum(flight.delay))
    # "sum(delay)" is the column name Spark generates for the un-aliased aggregate.
    .orderBy("sum(delay)", ascending=False)
    .show()
)

+-------+------+----------+
|   City|origin|sum(delay)|
+-------+------+----------+
|Seattle|   SEA|  159086.0|
|Spokane|   GEG|   12404.0|
|  Pasco|   PSC|     949.0|
+-------+------+----------+



In [78]:
from pyspark.sql import functions as F  # Column functions (e.g. F.sum) for writing aggregations with the DataFrame API.