In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 32 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 54.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=af013ca5a1ea04ffffee1d5fb9f563d28bca6dc1f0a2bad6734add417279c211
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


[참고 사이트](https://spark.apache.org/docs/latest/sql-getting-started.html)


### Starting Point : SparkSession
- spark 이용을 위한 entry point 역할

In [4]:
from pyspark.sql import SparkSession

# SparkSession is the entry point for the DataFrame / SQL API.
# The chain is wrapped in parentheses instead of backslash continuations:
# a backslash followed by trailing whitespace is a SyntaxError.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()  # reuse an already-running session if there is one
)


In [7]:
df = spark.read.json('./people.json') # load the JSON file into a DataFrame

# Print the DataFrame contents as a text table
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [8]:
# Print the column names, types and nullability of the DataFrame
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



### 1. `select` : 칼럼 선택

In [10]:
# Select a single column by name
df.select('name').show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [12]:
# Select multiple columns; a column expression (age + 1) can be used in place of a plain column
df.select(df['age']+1,df['name']).show()

+---------+-------+
|(age + 1)|   name|
+---------+-------+
|     null|Michael|
|       31|   Andy|
|       20| Justin|
+---------+-------+



### 2. `filter`

In [13]:
# Keep only the rows whose age is greater than 21
df.filter(df['age']>21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



### 3. `groupBy`



In [15]:
# Count the number of rows for each distinct age value
df.groupBy('age').count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



이외에도 다양한 명령어들이 있습니다!

[더 많은 명령어 보기(head, isnull 등등 dataframe과 매우 유사한 함수 체계)](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis)

## SQL Query 날리기
- Dataframe 등록
- spark sql

In [17]:
# Register the DataFrame as a temporary view so it can be queried with SQL
df.createOrReplaceTempView("people")


# spark.sql returns the query result as a DataFrame
sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



## Global Temporary View

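- Spark SQL의 temporary view는 해당 Session 안에서만 유효하며, Session이 종료되면 사라짐.
- 여러 Session에서 공유하려면 global temporary view를 사용. 시스템 데이터베이스 `global_temp`에 등록되므로 `global_temp.people`처럼 접두어를 붙여 조회하며, Spark application이 종료될 때까지 유지됨.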

In [None]:
# Register `df` as a global temporary view.
# createOrReplaceGlobalTempView keeps this cell re-runnable: createGlobalTempView
# raises an error when a global view named "people" already exists.
df.createOrReplaceGlobalTempView("people")

# Global temporary views live in the system database `global_temp`,
# so the view name must be qualified when querying it.
spark.sql("SELECT * FROM global_temp.people").show()


## txt(또는 csv) 파일로부터 schema 뽑아내기
Reflection 사용하여 schema 추론하기

In [20]:
from pyspark.sql import Row

sc = spark.sparkContext

# Read the comma-separated text file as an RDD of lines
lines = sc.textFile('./people.txt')

# Split each line on commas, then build one Row per person.
# int() ignores surrounding whitespace, so the age field needs no explicit strip.
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

# Create the DataFrame; the schema is inferred from the Row fields
schemaPeople = spark.createDataFrame(people)

# Register this DataFrame as the `people` view so that SQL queries on `people`
# read the text-file data rather than a view registered by an earlier cell.
schemaPeople.createOrReplaceTempView("people")

schemaPeople.show()

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+



In [21]:
# Query the registered `people` view for everyone aged 13 to 19
teenagers = spark.sql("SELECT name FROM people WHERE age>=13 and age<=19")

# Format each result row on the executors, then collect() brings all results to the driver
labelled = teenagers.rdd.map(lambda row: "Name : " + row.name)
teenNames = labelled.collect()
for name in teenNames:
  print(name)

Name : Justin


### Schema 특정하기
1. 원래 RDD로부터 tuple이나 list의 RDD를 생성
2. `StructType`으로 표현되는 schema 생성. 이 schema는 step 1에서 생성한 RDD의 tuple/list 구조와 일치해야 함.
3. `createDataFrame`으로 RDD에 schema 적용. 이것은 `SparkSession`에 의해 제공된 것. 

In [22]:
from pyspark.sql.types import StringType, StructType, StructField # 자료형 정의

# Describe the target schema as a space-separated list of column names (name, age)
schemaString = 'name age'
# Build one nullable, string-typed StructField per column name ...
fields = [
    StructField(column_name, StringType(), True)
    for column_name in schemaString.split()
]
# ... and wrap them in a StructType to form the complete schema
schema = StructType(fields)

sc = spark.sparkContext

# Read the text file and split every line on commas
lines = sc.textFile('./people.txt')
parts = lines.map(lambda line: line.split(","))
# Convert each split line into a (name, age) tuple; whitespace around the age text is stripped
people = parts.map(lambda cols: (cols[0], cols[1].strip()))

# Apply the explicit schema to the RDD
schemaPeople = spark.createDataFrame(people, schema)

# Register the result as a temporary view
schemaPeople.createOrReplaceTempView("people")

# Run a SQL query against the view
results = spark.sql("SELECT name FROM people")

results.show()



+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+

