# 서울시 공공데이터 가져오기

In [37]:
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf().setMaster("local[2]") # run locally with 2 worker threads
sc = SparkContext.getOrCreate(conf=conf)

In [78]:
import pyspark.sql.types as typ

labels = [
    ('기관 명', typ.StringType()),
    ('송신 서버 번호', typ.IntegerType()),
    ('데이터 번호', typ.IntegerType()),
    ('모델명', typ.StringType()),
    ('시리얼', typ.StringType()),
    ('구분', typ.IntegerType()),
    ('초미세먼지(㎍/㎥)', typ.IntegerType()),
    ('미세먼지(㎍/㎥)', typ.IntegerType()),
    ('기온(℃)', typ.FloatType()),
    ('상대습도( %)', typ.IntegerType()),
    ('풍향(°)', typ.FloatType()),
    ('풍속(m/s)', typ.FloatType()),
    ('돌풍 풍향(°)', typ.FloatType()),
    ('돌풍 풍속(m/s)', typ.FloatType()),
    ('조도(lux)', typ.FloatType()),
    ('자외선(UVI)', typ.FloatType()),
    ('소음(dB)', typ.IntegerType()),
    ('진동_x(g)', typ.FloatType()),
    ('진동_y(g)', typ.FloatType()),
    ('진동_z(g)', typ.FloatType()),
    ('진동_x 최대(g)', typ.FloatType()),
    ('진동_y 최대(g)', typ.FloatType()),
    ('진동_z 최대(g)', typ.FloatType()),
    ('흑구 운도(℃)', typ.FloatType()),
    ('초미세먼지 보정(㎍/㎥)', typ.IntegerType()),
    ('미세먼지 보정(㎍/㎥)', typ.IntegerType()),
    ('전송시간', typ.StringType()),
    ('등록일자', typ.StringType())
]

schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])

In [83]:
# load rdd
path ="S-DoT_NATURE_2021.02.08-02.14.csv"

df = spark.read.option("header",True).option("encoding",'cp949').option('schema', schema).csv(path)

In [86]:
df.show()

+-------+--------------+-----------+-------+-----------+----+-----------------+---------------+-------+------------+-------+---------+------------+--------------+---------+-----------+--------+---------+---------+---------+--------------+--------------+--------------+------------+----------------------+--------------------+------------+-------------------+
|기관 명|송신 서버 번호|데이터 번호| 모델명|     시리얼|구분|초미세먼지(㎍/㎥)|미세먼지(㎍/㎥)|기온(℃)|상대습도( %)|풍향(°)|풍속(m/s)|돌풍 풍향(°)|돌풍 풍속(m/s)|조도(lux)|자외선(UVI)|소음(dB)|진동_x(g)|진동_y(g)|진동_z(g)|진동_x 최대(g)|진동_y 최대(g)|진동_z 최대(g)|흑구 운도(℃)|초미세먼지 보정(㎍/㎥)|미세먼지 보정(㎍/㎥)|    전송시간|           등록일자|
+-------+--------------+-----------+-------+-----------+----+-----------------+---------------+-------+------------+-------+---------+------------+--------------+---------+-----------+--------+---------+---------+---------+--------------+--------------+--------------+------------+----------------------+--------------------+------------+-------------------+
| 서울시|            48|         

In [97]:
df.printSchema()

root
 |-- 기관 명: string (nullable = true)
 |-- 송신 서버 번호: string (nullable = true)
 |-- 데이터 번호: string (nullable = true)
 |-- 모델명: string (nullable = true)
 |-- 시리얼: string (nullable = true)
 |-- 구분: string (nullable = true)
 |-- 초미세먼지(㎍/㎥): string (nullable = true)
 |-- 미세먼지(㎍/㎥): string (nullable = true)
 |-- 기온(℃): string (nullable = true)
 |-- 상대습도( %): string (nullable = true)
 |-- 풍향(°): string (nullable = true)
 |-- 풍속(m/s): string (nullable = true)
 |-- 돌풍 풍향(°): string (nullable = true)
 |-- 돌풍 풍속(m/s): string (nullable = true)
 |-- 조도(lux): string (nullable = true)
 |-- 자외선(UVI): string (nullable = true)
 |-- 소음(dB): string (nullable = true)
 |-- 진동_x(g): string (nullable = true)
 |-- 진동_y(g): string (nullable = true)
 |-- 진동_z(g): string (nullable = true)
 |-- 진동_x 최대(g): string (nullable = true)
 |-- 진동_y 최대(g): string (nullable = true)
 |-- 진동_z 최대(g): string (nullable = true)
 |-- 흑구 운도(℃): string (nullable = true)
 |-- 초미세먼지 보정(㎍/㎥): string (nullable = true)
 |-- 미세먼지 보정(㎍

In [96]:
target_node = df.filter(df.시리얼.contains('OC3CL200010'))
target_node.show()

+-------+--------------+-----------+-------+-----------+----+-----------------+---------------+-------+------------+-------+---------+------------+--------------+---------+-----------+--------+---------+---------+---------+--------------+--------------+--------------+------------+----------------------+--------------------+------------+-------------------+
|기관 명|송신 서버 번호|데이터 번호| 모델명|     시리얼|구분|초미세먼지(㎍/㎥)|미세먼지(㎍/㎥)|기온(℃)|상대습도( %)|풍향(°)|풍속(m/s)|돌풍 풍향(°)|돌풍 풍속(m/s)|조도(lux)|자외선(UVI)|소음(dB)|진동_x(g)|진동_y(g)|진동_z(g)|진동_x 최대(g)|진동_y 최대(g)|진동_z 최대(g)|흑구 운도(℃)|초미세먼지 보정(㎍/㎥)|미세먼지 보정(㎍/㎥)|    전송시간|           등록일자|
+-------+--------------+-----------+-------+-----------+----+-----------------+---------------+-------+------------+-------+---------+------------+--------------+---------+-----------+--------+---------+---------+---------+--------------+--------------+--------------+------------+----------------------+--------------------+------------+-------------------+
| 서울시|            48|         

# 상관 계수 확인하기

In [99]:
import  pyspark.mllib.stat as st
import numpy as np

numeric_cols = ['초미세먼지(㎍/㎥)', '미세먼지(㎍/㎥)', '기온(℃)', '상대습도( %)']

numeric_rdd = target_node.select(numeric_cols).rdd.map(lambda row: [e for e in row])

In [103]:
mllib_stats = st.Statistics.colStats(numeric_rdd)

for col, m, v in zip(numeric_cols, millib_stats.mean(), mllib_stats.variance()):
    print('{0}: \t{1:.2f} \t {2:.2f}'.format(col, m, np.sqrt(v)))

초미세먼지(㎍/㎥): 	33.73 	 22.80
미세먼지(㎍/㎥): 	49.68 	 33.11
기온(℃): 	3.27 	 5.16
상대습도( %): 	54.22 	 17.32


In [104]:
corrs = st.Statistics.corr(numeric_rdd)

for i, el in enumerate(corrs > 0.5):
    correlated = [
        (numeric_cols[j], corrs[i][j])
        for j, e in enumerate(el)
        if e == 1.0 and j != i]
    if len(correlated) > 0:
        for e in correlated:
            print('{0}-to-{1}: {2:.2f}'.format(numeric_cols[i], e[0], e[1]))

초미세먼지(㎍/㎥)-to-미세먼지(㎍/㎥): 1.00
초미세먼지(㎍/㎥)-to-기온(℃): 0.67
미세먼지(㎍/㎥)-to-초미세먼지(㎍/㎥): 1.00
미세먼지(㎍/㎥)-to-기온(℃): 0.68
기온(℃)-to-초미세먼지(㎍/㎥): 0.67
기온(℃)-to-미세먼지(㎍/㎥): 0.68
