In [1]:
from pyspark.sql import  SQLContext, Row
from pyspark import SparkContext,SparkConf
import pandas as pd
from pyspark.sql.functions import regexp_extract, regexp_replace, when,udf,col,count,sum,avg,round
from pyspark.sql.types import DoubleType,IntegerType,StringType
import pyspark.sql.functions as F

In [2]:
# Create the local SparkContext, or reuse the one that is already running.
# Calling the SparkContext constructor a second time in the same kernel
# raises an error, so getOrCreate keeps this cell safe to re-run.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))
# SQL entry point used by the rest of the notebook to build DataFrames.
sqlCtx = SQLContext(sc)

In [3]:
# Load the airline-passenger CSV: the first row is treated as the header
# and column types are inferred from the data.
df = (
    sqlCtx.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/airline-passengers.csv')
)
# Bring the data to the driver as a pandas frame for rich display.
df.toPandas()


Unnamed: 0,Month,Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121
...,...,...
139,1960-08,606
140,1960-09,508
141,1960-10,461
142,1960-11,390


In [4]:
# Show the inferred column names and types of the freshly loaded frame.
df.printSchema()

root
 |-- Month: string (nullable = true)
 |-- Passengers: integer (nullable = true)



### date 타입(timestamp)으로 변경

In [5]:
# Add a 'parsed' timestamp column built from the 'yyyy-MM' strings in 'Month'.
# NOTE(review): the parsed values presumably depend on the Spark session
# time zone -- confirm; F.to_date may be a better fit for month-level data.
df1 = df.withColumn(
    'parsed',
    F.to_timestamp(F.col('Month'), 'yyyy-MM'),
)
df1.show()

+-------+----------+-------------------+
|  Month|Passengers|             parsed|
+-------+----------+-------------------+
|1949-01|       112|1949-01-01 00:00:00|
|1949-02|       118|1949-02-01 00:00:00|
|1949-03|       132|1949-03-01 00:00:00|
|1949-04|       129|1949-04-01 00:00:00|
|1949-05|       121|1949-05-01 00:00:00|
|1949-06|       135|1949-06-01 00:00:00|
|1949-07|       148|1949-07-01 00:00:00|
|1949-08|       148|1949-08-01 00:00:00|
|1949-09|       136|1949-09-01 00:00:00|
|1949-10|       119|1949-10-01 00:00:00|
|1949-11|       104|1949-11-01 00:00:00|
|1949-12|       118|1949-12-01 00:00:00|
|1950-01|       115|1950-01-01 00:00:00|
|1950-02|       126|1950-02-01 00:00:00|
|1950-03|       141|1950-03-01 00:00:00|
|1950-04|       135|1950-04-01 01:00:00|
|1950-05|       125|1950-05-01 00:00:00|
|1950-06|       149|1950-06-01 00:00:00|
|1950-07|       170|1950-07-01 00:00:00|
|1950-08|       170|1950-08-01 00:00:00|
+-------+----------+-------------------+
only showing top 20 rows

In [6]:
# Confirm that the new 'parsed' column has the timestamp type.
df1.printSchema()

root
 |-- Month: string (nullable = true)
 |-- Passengers: integer (nullable = true)
 |-- parsed: timestamp (nullable = true)



In [7]:
# Keep only rows whose timestamp is not earlier than '1959'; the string is
# compared against the timestamp column, and the first five rows are shown.
df1.filter(F.col('parsed') >= '1959').show(5)

+-------+----------+-------------------+
|  Month|Passengers|             parsed|
+-------+----------+-------------------+
|1959-01|       360|1959-01-01 00:00:00|
|1959-02|       342|1959-02-01 00:00:00|
|1959-03|       406|1959-03-01 00:00:00|
|1959-04|       396|1959-04-01 00:00:00|
|1959-05|       420|1959-05-01 00:00:00|
+-------+----------+-------------------+
only showing top 5 rows



In [8]:
# Extract only the year of each timestamp and store it in a column named 'y'.
df1.withColumn(
    'y',
    F.year(F.col('parsed')),
).show(5)

+-------+----------+-------------------+----+
|  Month|Passengers|             parsed|   y|
+-------+----------+-------------------+----+
|1949-01|       112|1949-01-01 00:00:00|1949|
|1949-02|       118|1949-02-01 00:00:00|1949|
|1949-03|       132|1949-03-01 00:00:00|1949|
|1949-04|       129|1949-04-01 00:00:00|1949|
|1949-05|       121|1949-05-01 00:00:00|1949|
+-------+----------+-------------------+----+
only showing top 5 rows



In [9]:
# Extract only the month of each timestamp and store it in a column named 'm'.
df1.withColumn(
    'm',
    F.month(F.col('parsed')),
).show(5)

+-------+----------+-------------------+---+
|  Month|Passengers|             parsed|  m|
+-------+----------+-------------------+---+
|1949-01|       112|1949-01-01 00:00:00|  1|
|1949-02|       118|1949-02-01 00:00:00|  2|
|1949-03|       132|1949-03-01 00:00:00|  3|
|1949-04|       129|1949-04-01 00:00:00|  4|
|1949-05|       121|1949-05-01 00:00:00|  5|
+-------+----------+-------------------+---+
only showing top 5 rows



In [10]:
# Render each timestamp as a formatted string in a new column 'f'
# (four-digit year and two-digit month, each followed by its Korean suffix).
df1.withColumn(
    'f',
    F.date_format(F.col('parsed'), 'yyyy년 MM월'),
).show(5)

+-------+----------+-------------------+-----------+
|  Month|Passengers|             parsed|          f|
+-------+----------+-------------------+-----------+
|1949-01|       112|1949-01-01 00:00:00|1949년 01월|
|1949-02|       118|1949-02-01 00:00:00|1949년 02월|
|1949-03|       132|1949-03-01 00:00:00|1949년 03월|
|1949-04|       129|1949-04-01 00:00:00|1949년 04월|
|1949-05|       121|1949-05-01 00:00:00|1949년 05월|
+-------+----------+-------------------+-----------+
only showing top 5 rows



### selectExpr 사용하여 year 값 출력

In [11]:
# Project only the year through a SQL expression, then group by it.
# 'year' is the only column left after the projection, so the no-argument
# average is taken over the grouping key itself (column avg(year)).
(
    df1.selectExpr('year(parsed) as year')
    .groupBy('year')
    .avg()
    .sort('year')
    .show(5)
)

+----+---------+
|year|avg(year)|
+----+---------+
|1949|   1949.0|
|1950|   1950.0|
|1951|   1951.0|
|1952|   1952.0|
|1953|   1953.0|
+----+---------+
only showing top 5 rows



In [12]:
# Same projection as before, but keeping 'Passengers' as well, so the
# per-year average now also covers the passenger counts.
(
    df1.selectExpr('year(parsed) as year', 'Passengers')
    .groupBy('year')
    .avg()
    .sort('year')
    .show(5)
)

+----+---------+------------------+
|year|avg(year)|   avg(Passengers)|
+----+---------+------------------+
|1949|   1949.0|126.66666666666667|
|1950|   1950.0|139.66666666666666|
|1951|   1951.0|170.16666666666666|
|1952|   1952.0|             197.0|
|1953|   1953.0|             225.0|
+----+---------+------------------+
only showing top 5 rows



In [13]:
# 연도별 평균값
df1.groupBy( F.year( 'parsed').alias('year') ).mean().\
    orderBy('year').\
withColumn('avg(Passengers)',F.round('avg(Passengers)',2) ).show()

+----+---------------+
|year|avg(Passengers)|
+----+---------------+
|1949|         126.67|
|1950|         139.67|
|1951|         170.17|
|1952|          197.0|
|1953|          225.0|
|1954|         238.92|
|1955|          284.0|
|1956|         328.25|
|1957|         368.42|
|1958|          381.0|
|1959|         428.33|
|1960|         476.17|
+----+---------------+



### 주식 데이터 이용

In [16]:
from pandas_datareader import data

In [18]:
# Fetch the Yahoo price history for ticker 005930.KS from 2017-01-01 onward
# and turn the index into an ordinary column in the same step.
# NOTE(review): no end date is given, so the result presumably grows over
# time and this cell needs network access -- confirm if reproducibility matters.
samsungDF = data.get_data_yahoo('005930.KS', '2017-01-01').reset_index()

In [19]:
# Convert the pandas frame of quotes into a Spark DataFrame.
sDF = sqlCtx.createDataFrame( samsungDF )

In [21]:
# Inspect the column names and types Spark derived from the pandas frame.
sDF.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: double (nullable = true)
 |-- Adj Close: double (nullable = true)



#### window 함수 이용

In [23]:
# F.window lets the aggregation use a time unit as the grouping key,
# e.g. '2 week' to bucket rows into two-week windows; here the mean of
# 'Close' is computed per window and the result is sorted by window.
(
    sDF.groupBy(F.window(F.col('Date'), '2 week'))
    .avg('Close')
    .sort('window')
    .show()
)

+--------------------+------------------+
|              window|        avg(Close)|
+--------------------+------------------+
|[2016-12-29 09:00...|36893.333333333336|
|[2017-01-12 09:00...|           37822.0|
|[2017-01-26 09:00...|           39072.5|
|[2017-02-09 09:00...|           38358.0|
|[2017-02-23 09:00...|39415.555555555555|
|[2017-03-09 09:00...|           41650.0|
|[2017-03-23 09:00...|           41664.0|
|[2017-04-06 09:00...|           41572.0|
|[2017-04-20 09:00...|           43297.5|
|[2017-05-04 09:00...|           46087.5|
|[2017-05-18 09:00...|           45102.0|
|[2017-06-01 09:00...|45586.666666666664|
|[2017-06-15 09:00...|           47556.0|
|[2017-06-29 09:00...|           48336.0|
|[2017-07-13 09:00...|           50548.0|
|[2017-07-27 09:00...|           47652.0|
|[2017-08-10 09:00...| 46511.11111111111|
|[2017-08-24 09:00...|           46612.0|
|[2017-09-07 09:00...|           50842.0|
|[2017-09-21 09:00...|           52292.0|
+--------------------+------------