### Spark 세션을 생성하기 위해 다음 코드를 실행한다

In [64]:
import pyspark
from pyspark import SparkContext, SparkConf 
from pyspark.sql import SparkSession 
# Application name plus a local master running 2 worker threads.
conf = SparkConf() \
    .setAppName('appName') \
    .setMaster('local[2]')
# Go through the builder's getOrCreate() instead of calling the SparkContext and
# SparkSession constructors directly: re-running this cell then reuses the live
# session rather than failing because a SparkContext already exists.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep `sc` available for the RDD examples that follow.
sc = spark.sparkContext

### 세션을 종료하려면 다음 코드를 실행한다

In [90]:
# Shut down the SparkContext when the work is done. The cell number (90) shows
# this was executed after the example cells that follow it in the notebook.
sc.stop()

### RDD 만들기

In [47]:
# Create an RDD from a list: this cell only builds the source list,
# the RDD itself is created from it in the next cell.
data = [1, 2, 3, 4, 5]
data

[1, 2, 3, 4, 5]

In [48]:
# Distribute the list as an RDD; the second argument (4) is the requested number of partitions.
rdd = sc.parallelize(data, 4)
# Echo the RDD object itself; no elements are fetched here.
rdd

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [49]:
# Default parallelism of this context; the recorded value 2 matches the 'local[2]' master set at startup.
sc.defaultParallelism

2

In [50]:
# Transformation: derive a new RDD in which every element is doubled.
rdd1 = rdd.map(lambda value: value * 2)

In [51]:
# Action: bring all elements of rdd1 back as a Python list.
rdd1.collect()

[2, 4, 6, 8, 10]

In [52]:
# Transformation: keep only the even elements of the source RDD.
rdd2 = rdd.filter(lambda value: value % 2 == 0)

In [53]:
# Action: fetch the filtered (even) elements as a Python list.
rdd2.collect()

[2, 4]

In [54]:
# RDD containing a duplicate value (2 appears twice).
rdd3 = sc.parallelize([1, 4, 2, 2, 3])
# distinct() drops the duplicate; note the recorded result is neither in input order nor sorted.
rdd3.distinct().collect()

[4, 2, 1, 3]

In [55]:
# Small RDD used to contrast map() with flatMap().
rdd4 = sc.parallelize([1, 2, 3])
# map() emits exactly one output per input, so each two-item list stays nested.
rdd4.map(lambda value: [value, value + 5]).collect()

[[1, 6], [2, 7], [3, 8]]

In [56]:
# flatMap() applies the same function but flattens each returned list into the result.
rdd4.flatMap(lambda value: [value, value + 5]).collect()

[1, 6, 2, 7, 3, 8]

### Action
reduce(func)  
take(n)  
collect()  
takeOrdered(n, key=func) 

In [57]:
# Rebinds the name `rdd` (previously the five-element RDD) to a new three-element RDD.
rdd = sc.parallelize([1, 2, 3])
# reduce() folds the elements pairwise; with multiplication this gives 1 * 2 * 3.
rdd.reduce(lambda left, right: left * right)

6

In [58]:
# Action: return only the first two elements.
rdd.take(2)

[1, 2]

In [59]:
# Action: return every element as a Python list.
rdd.collect()

[1, 2, 3]

In [60]:
# Four unsorted values to demonstrate ordered retrieval.
rdd5 = sc.parallelize([5, 3, 1, 2])
# Negating each value in the key makes takeOrdered return the three largest values, largest first.
rdd5.takeOrdered(3, key=lambda value: -1 * value)

[5, 3, 2]

In [61]:
# Echo the RDD object: takeOrdered() returned a list and left rdd5 itself unchanged.
rdd5

ParallelCollectionRDD[16] at parallelize at PythonRDD.scala:195

### 스파크 데이터 타입

- ByteType : int, long -128 ~ 127 사이의 값  
- ShortType : int, long -32768 ~ 32767 사이의 값  
- IntegerType : int, long 4바이트 크기  
- LongType : long 8바이트 크기
- FloatType : float 4바이트 크기  
- DoubleType : float  
- DecimalType : decimal.Decimal 
- StringType : string  
- BinaryType : bytearray  
- BooleanType : bool  
- TimestampType : datetime.datetime  
- DateType : datetime.date  
- ArrayType : list, tuple, array  


### DataFrame
Row 타입의 레코드(테이블의 로우 같은)와 각 레코드에 수행할 연산 표현식을  
나타내는 여러 컬럼(스프레드시트의 컬럼 같은)으로 구성됩니다.  
스키마는 각 컬럼명과 데이터 타입을 정의합니다.

In [65]:
# Load the JSON flight summary into a DataFrame. No schema is passed here,
# so the column types come from the reader (inspected in the next cell).
df = spark.read.format("json").load("./data/flight-data/json/2015-summary.json")

In [66]:
# Print the column names, types and nullability as a tree.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [67]:
# Read the same file again and look at its schema as a StructType object,
# i.e. the programmatic form of what printSchema() displays.
spark.read.format("json").load("./data/flight-data/json/2015-summary.json").schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

```
StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),
StructField(ORIGIN_COUNTRY_NAME,StringType,true),
StructField(count,LongType,true)))
```

In [68]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

##### 스키마를 지정해서 데이터 가져오기  ( .schema )

In [69]:
# Hand-written schema for the flight summary: two string columns and one long
# column; the last field also carries a small metadata dictionary.
myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), nullable=True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True),
    StructField("count", LongType(), nullable=False, metadata={"hello": "world"}),
])
# Load the JSON file with the explicit schema instead of letting the reader derive one.
df = (
    spark.read.format("json")
    .schema(myManualSchema)
    .load("./data/flight-data/json/2015-summary.json")
)

In [70]:
# Print the schema of the DataFrame loaded with the manual schema.
# NOTE: `count` is reported as nullable = true in the recorded output even
# though the manual schema declared that field with nullable False.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [71]:
# Fetch the first three records; they come back as a list of Row objects.
df.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

#### 얘는 왜 했는가: col / column 은 컬럼을 이름으로 참조하는 Column 객체를 만든다 (두 함수의 출력은 같다)

In [72]:
from pyspark.sql.functions import col, column

In [73]:
# Build a column reference by name with both helpers; the recorded output
# shows that col() and column() print the same Column representation.
print(col("someColumnName"))
print(column("someColumnName"))

Column<b'someColumnName'>
Column<b'someColumnName'>


In [74]:
# Confirm the Python type of the loaded object.
type(df)

pyspark.sql.dataframe.DataFrame

In [75]:
# Column names as a plain Python list.
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [76]:
from pyspark.sql import Row
# A Row built positionally from four values of different types (str, None, int, bool).
myRow = Row("Hello", None, 1, False)

In [77]:
# Row fields can be read by position; index 0 is the string.
myRow[0]

'Hello'

In [78]:
# Index 1 holds the None that was passed when the Row was built.
myRow[1]
# No output is shown! (the value is None, which the notebook does not echo)

In [79]:
# Index 2 is the integer field.
myRow[2]

1

In [80]:
# Index 3 is the boolean field.
myRow[3]

False

In [81]:
# Reload the flight data with the manually defined schema.
df = (
    spark.read.format("json")
    .schema(myManualSchema)
    .load("./data/flight-data/json/2015-summary.json")
)

In [82]:
# Register the DataFrame under the name 'dfTable' so that it can be queried
# with SQL (the SQL snippets in this notebook select from dfTable).
df.createOrReplaceTempView('dfTable')

In [83]:
# Rebinds `myManualSchema` to a three-column schema for a hand-made DataFrame:
# two nullable string columns and one non-nullable long column.
myManualSchema = (
    StructType()
    .add("some", StringType(), True)
    .add("col", StringType(), True)
    .add("names", LongType(), False)
)
# One positional Row whose values line up with the three schema fields.
myRow = Row("Hello", None, 1)
# Build a single-row DataFrame from the Row using the explicit schema.
myDf = spark.createDataFrame([myRow], myManualSchema)

In [84]:
# Print the one-row DataFrame as a text table; the None field is rendered as null.
myDf.show()

+-----+----+-----+
| some| col|names|
+-----+----+-----+
|Hello|null|    1|
+-----+----+-----+



In [85]:
# The hand-made object is the same DataFrame type as the one loaded from JSON.
type(myDf)

pyspark.sql.dataframe.DataFrame

In [86]:
# Project a single column by name and print the first two rows.
df.select("DEST_COUNTRY_NAME").show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



```SQL
SELECT DEST_COUNTRY_NAME FROM dfTable LIMIT 2
```

In [87]:
# Project two columns by name and print the first two rows.
df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows



```SQL
SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME FROM dfTable LIMIT 2
```

In [88]:
# selectExpr takes SQL expression strings: keep every column and add a
# boolean column that is true when destination and origin are the same country.
df.selectExpr(
    "*", # all original columns
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry"
    ).show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



```SQL
SELECT *, (DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry
FROM dfTable
LIMIT 2
```

In [89]:
# Aggregate over the whole DataFrame: average of `count` and the number of
# distinct destination countries. The result is a single row, so show(2)
# prints just that one row.
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



```SQL
SELECT avg(count), count(distinct(DEST_COUNTRY_NAME))
FROM dfTable
LIMIT 2
```