# 애플 주식 데이터 분석

## PySpark와 Py4J 설치

In [1]:
# Install pinned versions of PySpark and Py4J.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel instead of whatever `pip` a subshell happens to resolve to.
%pip install pyspark==3.0.1 py4j==0.10.9

Collecting pyspark==3.0.1
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 35 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 51.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=89f05fa06480c7070b7cf6d1add5a07ec7098e7ade9790bb7f6995f9aff49e1a
  Stored in directory: /root/.cache/pip/wheels/5e/34/fa/b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


## SparkSession 생성

In [2]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse the one that already exists.
spark = (
    SparkSession.builder
    .appName('Python Spark DataFrame basic example')
    .getOrCreate()
)

## 애플 주식 CSV 파일 로드

In [14]:
import pandas as pd

# Location of the Apple stock price CSV.
APPLE_STOCK_CSV_URL = "https://pyspark-test-sj.s3-us-west-2.amazonaws.com/appl_stock.csv"

# Download the CSV into a pandas DataFrame first ...
apple_pandas_df = pd.read_csv(APPLE_STOCK_CSV_URL)
# ... then hand it to Spark, which derives the Spark schema from the pandas data.
apple_spark_df = spark.createDataFrame(apple_pandas_df)

In [15]:
# List the column names of the Spark DataFrame
apple_spark_df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

In [16]:
# Print the schema (column name, data type, nullability)
apple_spark_df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: long (nullable = true)
 |-- Adj Close: double (nullable = true)



In [17]:
# Print the first 5 records
apple_spark_df.show(5)

+----------+----------+----------+----------+----------+---------+------------------+
|      Date|      Open|      High|       Low|     Close|   Volume|         Adj Close|
+----------+----------+----------+----------+----------+---------+------------------+
|2010-01-04|213.429998|214.499996|212.380001|214.009998|123432400|         27.727039|
|2010-01-05|214.599998|215.589994|213.249994|214.379993|150476200|         27.774976|
|2010-01-06|214.379993|    215.23|210.750004|210.969995|138040000|27.333178000000004|
|2010-01-07|    211.75|212.000006|209.050005|    210.58|119282800|          27.28265|
|2010-01-08|210.299994|212.000006|209.060005|211.980005|111902700|         27.464034|
+----------+----------+----------+----------+----------+---------+------------------+
only showing top 5 rows



In [18]:
# Use describe() to get per-column summary statistics
# (count, mean, stddev, min, max) and print them
apple_spark_df.describe().show()

+-------+----------+------------------+-----------------+------------------+------------------+--------------------+------------------+
|summary|      Date|              Open|             High|               Low|             Close|              Volume|         Adj Close|
+-------+----------+------------------+-----------------+------------------+------------------+--------------------+------------------+
|  count|      1762|              1762|             1762|              1762|              1762|                1762|              1762|
|   mean|      null|313.07631115890996|315.9112880164584|309.82824050794557| 312.9270656379114| 9.422577587968218E7| 75.00174115607273|
| stddev|      null|185.29946803981545|186.8981768648577|183.38391664370988|185.14710361709427|6.0205187765927084E7|28.574929721799037|
|    min|2010-01-04|              90.0|        90.699997|         89.470001|         90.279999|            11475900|         24.881912|
|    max|2016-12-30|        702.409988|       70

In [19]:
from pyspark.sql.functions import mean

# Average of the Close column over the whole DataFrame
close_mean = mean('Close')
apple_spark_df.agg(close_mean).show()

+-----------------+
|       avg(Close)|
+-----------------+
|312.9270656379114|
+-----------------+



In [20]:
from pyspark.sql import functions as F

# Maximum and minimum of the Volume column.
# The Spark aggregates are accessed through the `F` namespace instead of being
# imported by bare name, so Python's built-in min()/max() are not shadowed
# for the rest of the notebook.
apple_spark_df.select(F.max('Volume'), F.min('Volume')).show()

+-----------+-----------+
|max(Volume)|min(Volume)|
+-----------+-----------+
|  470249500|   11475900|
+-----------+-----------+



## hv ratio 라는 이름의 새로운 칼럼을 추가한 데이터프레임 만들기 (hv ratio = High/Volume)

In [21]:
# Derive a new "hv ratio" column as High divided by Volume; the source
# DataFrame is left untouched and a new DataFrame is returned.
hv_ratio = apple_spark_df["High"] / apple_spark_df["Volume"]
apple_spark_df_with_hv = apple_spark_df.withColumn("hv ratio", hv_ratio)

In [23]:
# Preview the first 5 rows, including the newly added "hv ratio" column
apple_spark_df_with_hv.show(5)

+----------+----------+----------+----------+----------+---------+------------------+--------------------+
|      Date|      Open|      High|       Low|     Close|   Volume|         Adj Close|            hv ratio|
+----------+----------+----------+----------+----------+---------+------------------+--------------------+
|2010-01-04|213.429998|214.499996|212.380001|214.009998|123432400|         27.727039|1.737793286041590...|
|2010-01-05|214.599998|215.589994|213.249994|214.379993|150476200|         27.774976|1.432718223878593...|
|2010-01-06|214.379993|    215.23|210.750004|210.969995|138040000|27.333178000000004|1.559185743262822...|
|2010-01-07|    211.75|212.000006|209.050005|    210.58|119282800|          27.28265|1.777288980473295...|
|2010-01-08|210.299994|212.000006|209.060005|211.980005|111902700|         27.464034|1.894503045949740...|
+----------+----------+----------+----------+----------+---------+------------------+--------------------+
only showing top 5 rows



## 월별 Close 칼럼의 평균값

In [25]:
from pyspark.sql.functions import month

# Add a numeric "Month" column (1-12) extracted from the Date column,
# then preview the first 5 rows.
month_of_date = month(apple_spark_df["Date"])
monthdf = apple_spark_df.withColumn("Month", month_of_date)
monthdf.show(5)

+----------+----------+----------+----------+----------+---------+------------------+-----+
|      Date|      Open|      High|       Low|     Close|   Volume|         Adj Close|Month|
+----------+----------+----------+----------+----------+---------+------------------+-----+
|2010-01-04|213.429998|214.499996|212.380001|214.009998|123432400|         27.727039|    1|
|2010-01-05|214.599998|215.589994|213.249994|214.379993|150476200|         27.774976|    1|
|2010-01-06|214.379993|    215.23|210.750004|210.969995|138040000|27.333178000000004|    1|
|2010-01-07|    211.75|212.000006|209.050005|    210.58|119282800|          27.28265|    1|
|2010-01-08|210.299994|212.000006|209.060005|211.980005|111902700|         27.464034|    1|
+----------+----------+----------+----------+----------+---------+------------------+-----+
only showing top 5 rows



In [26]:
# Keep only Month and Close, group by Month, and average within each group.
# avg() is called without column names, so every numeric column is averaged:
# the result holds avg(Close) plus a redundant avg(Month).
month_close = monthdf.select('Month', 'Close')
monthavgdf = month_close.groupBy('Month').avg()

In [27]:
# Print the per-month averages (rows are not sorted by month here)
monthavgdf.show()

+-----+----------+------------------+
|Month|avg(Month)|        avg(Close)|
+-----+----------+------------------+
|   12|      12.0|302.35053626845644|
|    1|       1.0| 322.2097142571429|
|    6|       6.0|288.12546566000003|
|    3|       3.0| 332.9115673137254|
|    5|       5.0|  351.621020857143|
|    9|       9.0| 301.0763195902777|
|    4|       4.0| 340.5104108150685|
|    8|       8.0|300.43858096129026|
|    7|       7.0| 281.7221621148649|
|   10|      10.0|  308.305525631579|
|   11|      11.0|306.27251748951056|
|    2|       2.0| 321.3595563037037|
+-----+----------+------------------+



In [36]:
# Drop the redundant avg(Month) column and print the average Close
# per month in ascending month order.
monthly_close = monthavgdf.select('Month', 'avg(Close)')
monthly_close.sort('Month').show()

+-----+------------------+
|Month|        avg(Close)|
+-----+------------------+
|    1| 322.2097142571429|
|    2| 321.3595563037037|
|    3| 332.9115673137254|
|    4| 340.5104108150685|
|    5|  351.621020857143|
|    6|288.12546566000003|
|    7| 281.7221621148649|
|    8|300.43858096129026|
|    9| 301.0763195902777|
|   10|  308.305525631579|
|   11|306.27251748951056|
|   12|302.35053626845644|
+-----+------------------+

