## 대기 오염 정보 분석

여러가지 대기 오염 변수, 지역 변수, 시간 변수로 초 미세먼지(u_fine_dust)수치를 예측하는 모델을 학습하고 만든다.

In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("air-pollution-degree-analysis")\
                            .config("spark.executor.memory", MAX_MEMORY)\
                            .config("spark.driver.memory", MAX_MEMORY)\
                            .getOrCreate()

22/04/23 20:30:24 WARN Utils: Your hostname, devkhk-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.30.1.27 instead (on interface en0)
22/04/23 20:30:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/23 20:30:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
air_pollution_dir = "/Users/devkhk/Documents/public-data-engineering/data/air_pollution_degree/"
air_pollution_df = spark.read.csv(f"file:///{air_pollution_dir}air-pollution-degree.csv", encoding="euc-kr", header=True, inferSchema=True)\
                            .toDF("city", "city2", "region", "region2", "region_code", "measure_date", "sulfur_diox", "fine_dust", "ozone", "nitrogen_diox", "carbon_monox","u_fine_dust")

                                                                                

In [4]:
air_pollution_df.show()

+----+-----+------+-------+-----------+----------------+-----------+---------+-----+-------------+------------+-----------+
|city|city2|region|region2|region_code|    measure_date|sulfur_diox|fine_dust|ozone|nitrogen_diox|carbon_monox|u_fine_dust|
+----+-----+------+-------+-----------+----------------+-----------+---------+-----+-------------+------------+-----------+
|서울| 서울|  중구|   중구|     111121|2020-01-01 01:00|      0.003|    0.036|0.002|          0.5|          24|         19|
|서울| 서울|  중구|   중구|     111121|2020-01-01 02:00|      0.003|    0.039|0.001|          0.6|          25|         21|
|서울| 서울|  중구|   중구|     111121|2020-01-01 03:00|      0.003|    0.037|0.001|          0.9|          29|         23|
|서울| 서울|  중구|   중구|     111121|2020-01-01 04:00|      0.002|    0.036|0.001|          0.6|          26|         22|
|서울| 서울|  중구|   중구|     111121|2020-01-01 05:00|      0.002|    0.035|0.001|          0.6|          25|         19|
|서울| 서울|  중구|   중구|     111121|2020-01-01 06:00|

In [5]:
air_pollution_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- city2: string (nullable = true)
 |-- region: string (nullable = true)
 |-- region2: string (nullable = true)
 |-- region_code: integer (nullable = true)
 |-- measure_date: string (nullable = true)
 |-- sulfur_diox: double (nullable = true)
 |-- fine_dust: double (nullable = true)
 |-- ozone: double (nullable = true)
 |-- nitrogen_diox: double (nullable = true)
 |-- carbon_monox: integer (nullable = true)
 |-- u_fine_dust: integer (nullable = true)



In [6]:
# air_pollution_df = air_pollution_df.withColumnRenamed("시도", "city")

In [6]:
air_pollution_df.createOrReplaceTempView("origin")

In [14]:
query = """
SELECT
    city,
    region,
    region_code,
    TO_DATE(measure_date) as date,
    HOUR(measure_date) as hour,
    sulfur_diox,
    fine_dust,
    ozone,
    nitrogen_diox,
    carbon_monox,
    u_fine_dust
FROM origin
"""
origin_df = spark.sql(query)

In [15]:
spark.sql(query).show(100)

+----+------+-----------+----------+----+-----------+---------+-----+-------------+------------+-----------+
|city|region|region_code|      date|hour|sulfur_diox|fine_dust|ozone|nitrogen_diox|carbon_monox|u_fine_dust|
+----+------+-----------+----------+----+-----------+---------+-----+-------------+------------+-----------+
|서울|  중구|     111121|2020-01-01|   1|      0.003|    0.036|0.002|          0.5|          24|         19|
|서울|  중구|     111121|2020-01-01|   2|      0.003|    0.039|0.001|          0.6|          25|         21|
|서울|  중구|     111121|2020-01-01|   3|      0.003|    0.037|0.001|          0.9|          29|         23|
|서울|  중구|     111121|2020-01-01|   4|      0.002|    0.036|0.001|          0.6|          26|         22|
|서울|  중구|     111121|2020-01-01|   5|      0.002|    0.035|0.001|          0.6|          25|         19|
|서울|  중구|     111121|2020-01-01|   6|      0.002|    0.037|0.001|          0.5|          23|         19|
|서울|  중구|     111121|2020-01-01|   7|      

In [16]:
origin_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- region: string (nullable = true)
 |-- region_code: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- sulfur_diox: double (nullable = true)
 |-- fine_dust: double (nullable = true)
 |-- ozone: double (nullable = true)
 |-- nitrogen_diox: double (nullable = true)
 |-- carbon_monox: integer (nullable = true)
 |-- u_fine_dust: integer (nullable = true)



In [17]:
origin_df.describe(["sulfur_diox", "ozone", "nitrogen_diox", "carbon_monox"]).show()

                                                                                

+-------+------------------+------------------+------------------+------------------+
|summary|       sulfur_diox|             ozone|     nitrogen_diox|      carbon_monox|
+-------+------------------+------------------+------------------+------------------+
|  count|           1048575|           1048575|           1048575|           1048575|
|   mean| -33.4756512686748|-49.20555195955904|-36.25456252532455|-20.03172019168872|
| stddev|179.79034530646655|216.23970480747434|188.07656606234713|239.99483465556153|
|    min|            -999.0|            -999.0|            -999.0|              -999|
|    max|              0.09|             0.196|               7.1|               383|
+-------+------------------+------------------+------------------+------------------+



In [18]:
origin_df.describe(["fine_dust", "u_fine_dust"]).show()

[Stage 7:>                                                          (0 + 8) / 8]

+-------+-------------------+-------------------+
|summary|          fine_dust|        u_fine_dust|
+-------+-------------------+-------------------+
|  count|            1048575|            1048575|
|   mean|-34.678330849487836|-29.780616551033546|
| stddev|  182.9309217096852| 222.66068734141197|
|    min|             -999.0|               -999|
|    max|              0.142|                158|
+-------+-------------------+-------------------+



                                                                                

In [19]:
origin_df.createOrReplaceTempView("origin_preprocess")

In [53]:
query = """
SELECT
    *
FROM
    origin_preprocess
WHERE
        sulfur_diox > 0
    and ozone > 0
    and nitrogen_diox > 0
    and carbon_monox > 0
    and carbon_monox < 200
    and fine_dust > 0
    and u_fine_dust > 0

"""

preprocessed_df = spark.sql(query)
preprocessed_df.show()

+----+------+-----------+----------+----+-----------+---------+-----+-------------+------------+-----------+
|city|region|region_code|      date|hour|sulfur_diox|fine_dust|ozone|nitrogen_diox|carbon_monox|u_fine_dust|
+----+------+-----------+----------+----+-----------+---------+-----+-------------+------------+-----------+
|서울|  중구|     111121|2020-01-01|   1|      0.003|    0.036|0.002|          0.5|          24|         19|
|서울|  중구|     111121|2020-01-01|   2|      0.003|    0.039|0.001|          0.6|          25|         21|
|서울|  중구|     111121|2020-01-01|   3|      0.003|    0.037|0.001|          0.9|          29|         23|
|서울|  중구|     111121|2020-01-01|   4|      0.002|    0.036|0.001|          0.6|          26|         22|
|서울|  중구|     111121|2020-01-01|   5|      0.002|    0.035|0.001|          0.6|          25|         19|
|서울|  중구|     111121|2020-01-01|   6|      0.002|    0.037|0.001|          0.5|          23|         19|
|서울|  중구|     111121|2020-01-01|   7|      

In [54]:
# null, None 행 제거
preprocessed_df = preprocessed_df.na.drop("any")

In [55]:
preprocessed_df.show(200)

+----+------+-----------+----------+----+-----------+---------+-----+-------------+------------+-----------+
|city|region|region_code|      date|hour|sulfur_diox|fine_dust|ozone|nitrogen_diox|carbon_monox|u_fine_dust|
+----+------+-----------+----------+----+-----------+---------+-----+-------------+------------+-----------+
|서울|  중구|     111121|2020-01-01|   1|      0.003|    0.036|0.002|          0.5|          24|         19|
|서울|  중구|     111121|2020-01-01|   2|      0.003|    0.039|0.001|          0.6|          25|         21|
|서울|  중구|     111121|2020-01-01|   3|      0.003|    0.037|0.001|          0.9|          29|         23|
|서울|  중구|     111121|2020-01-01|   4|      0.002|    0.036|0.001|          0.6|          26|         22|
|서울|  중구|     111121|2020-01-01|   5|      0.002|    0.035|0.001|          0.6|          25|         19|
|서울|  중구|     111121|2020-01-01|   6|      0.002|    0.037|0.001|          0.5|          23|         19|
|서울|  중구|     111121|2020-01-01|   7|      

In [56]:
preprocessed_df.describe(["sulfur_diox", "ozone", "nitrogen_diox", "carbon_monox"]).show()
preprocessed_df.describe(["fine_dust", "u_fine_dust"]).show()

                                                                                

+-------+--------------------+--------------------+-------------------+-----------------+
|summary|         sulfur_diox|               ozone|      nitrogen_diox|     carbon_monox|
+-------+--------------------+--------------------+-------------------+-----------------+
|  count|              896919|              896919|             896919|           896919|
|   mean| 0.00310931577990743|0.026090863277507065|0.48527860375372783|38.59242807878972|
| stddev|0.001229118795065...|0.019442733724994665|0.21274261886750173| 23.0660859274277|
|    min|              1.0E-4|               0.001|                0.1|                1|
|    max|                0.09|               0.196|                7.1|              199|
+-------+--------------------+--------------------+-------------------+-----------------+





+-------+--------------------+------------------+
|summary|           fine_dust|       u_fine_dust|
+-------+--------------------+------------------+
|  count|              896919|            896919|
|   mean|  0.0231632228774271| 21.30682369310941|
| stddev|0.015494158096744386|15.493787803151514|
|    min|              1.0E-4|                 1|
|    max|               0.142|               152|
+-------+--------------------+------------------+



                                                                                

In [57]:
preprocessed_df.count()

                                                                                

896919

In [58]:
preprocessed_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- region: string (nullable = true)
 |-- region_code: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- sulfur_diox: double (nullable = true)
 |-- fine_dust: double (nullable = true)
 |-- ozone: double (nullable = true)
 |-- nitrogen_diox: double (nullable = true)
 |-- carbon_monox: integer (nullable = true)
 |-- u_fine_dust: integer (nullable = true)



In [65]:
train_df, test_df =  preprocessed_df.randomSplit([0.8, 0.2], seed=1)

In [60]:
# pipeline stages 설계
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler, StandardScaler

stages = []

cat_features = [
    "region_code",
    "hour",
]

std_features = [
    "sulfur_diox",
    "fine_dust",
    "ozone",
    "nitrogen_diox",
    "carbon_monox",
]


In [62]:
for c in cat_features:
    indexer = StringIndexer(inputCol=c , outputCol= c + "_idx").setHandleInvalid("skip")
    onehot = OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol= c+ "_one")
    stages += [indexer, onehot]
for s in std_features:
    vassembler = VectorAssembler(inputCols=[s], outputCol=s + "_vc")
    stdscaler = StandardScaler(inputCol=vassembler.getOutputCol(), outputCol=s + "_std")
    stages += [vassembler, stdscaler]

In [63]:
stages

[StringIndexer_fe7c0e7f9137,
 OneHotEncoder_7cb2143c4480,
 StringIndexer_2bd916e740a4,
 OneHotEncoder_4ecd23bca64a,
 VectorAssembler_9c03a3168767,
 StandardScaler_87cea2761ba0,
 VectorAssembler_a443c22af3f5,
 StandardScaler_db0d083b111d,
 VectorAssembler_6247168ae041,
 StandardScaler_d75bde26525d,
 VectorAssembler_aef2f7d7fd66,
 StandardScaler_f99c273a5260,
 VectorAssembler_cc261a04ead3,
 StandardScaler_016cd4fe71b8]

In [78]:
# vector된 데이터들을 하나로 모으는 assembler
assembler_list = [c + "_one" for c in cat_features ] + [s + "_std" for s in std_features]
assembler = VectorAssembler(inputCols=assembler_list, outputCol="features")
stages += [assembler]

In [79]:
stages

[StringIndexer_fe7c0e7f9137,
 OneHotEncoder_7cb2143c4480,
 StringIndexer_2bd916e740a4,
 OneHotEncoder_4ecd23bca64a,
 VectorAssembler_9c03a3168767,
 StandardScaler_87cea2761ba0,
 VectorAssembler_a443c22af3f5,
 StandardScaler_db0d083b111d,
 VectorAssembler_6247168ae041,
 StandardScaler_d75bde26525d,
 VectorAssembler_aef2f7d7fd66,
 StandardScaler_f99c273a5260,
 VectorAssembler_cc261a04ead3,
 StandardScaler_016cd4fe71b8,
 VectorAssembler_c2cfdc86399e]

In [64]:
# pipeline
from pyspark.ml.pipeline import Pipeline

In [80]:
pipeline = Pipeline(stages=stages)

In [81]:
fitted_pipeline = pipeline.fit(train_df)

                                                                                

In [82]:
vtrain_df = fitted_pipeline.transform(train_df)

In [83]:
vtrain_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- region: string (nullable = true)
 |-- region_code: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- sulfur_diox: double (nullable = true)
 |-- fine_dust: double (nullable = true)
 |-- ozone: double (nullable = true)
 |-- nitrogen_diox: double (nullable = true)
 |-- carbon_monox: integer (nullable = true)
 |-- u_fine_dust: integer (nullable = true)
 |-- region_code_idx: double (nullable = false)
 |-- region_code_one: vector (nullable = true)
 |-- hour_idx: double (nullable = false)
 |-- hour_one: vector (nullable = true)
 |-- sulfur_diox_vc: vector (nullable = true)
 |-- sulfur_diox_std: vector (nullable = true)
 |-- fine_dust_vc: vector (nullable = true)
 |-- fine_dust_std: vector (nullable = true)
 |-- ozone_vc: vector (nullable = true)
 |-- ozone_std: vector (nullable = true)
 |-- nitrogen_diox_vc: vector (nullable = true)
 |-- nitrogen_diox_std: vector (nullable = true)
 |-- carbon_mon

In [84]:
vtrain_df.select("features").show()

+--------------------+
|            features|
+--------------------+
|(144,[32,118,139,...|
|(144,[32,120,139,...|
|(144,[32,122,139,...|
|(144,[32,125,139,...|
|(144,[32,129,139,...|
|(144,[32,128,139,...|
|(144,[32,127,139,...|
|(144,[32,132,139,...|
|(144,[32,131,139,...|
|(144,[32,133,139,...|
|(144,[32,136,139,...|
|(144,[32,138,139,...|
|(144,[32,134,139,...|
|(144,[32,123,139,...|
|(144,[32,119,139,...|
|(144,[32,121,139,...|
|(144,[32,124,139,...|
|(144,[32,117,139,...|
|(144,[32,118,139,...|
|(144,[32,120,139,...|
+--------------------+
only showing top 20 rows



[Stage 141:>                                                        (0 + 1) / 1]                                                                                

In [67]:
from pyspark.ml.regression import LinearRegression

In [85]:
lr = LinearRegression(
    maxIter=100,
    labelCol="u_fine_dust",
    solver='normal',
)

In [86]:
model = lr.fit(vtrain_df)

22/04/23 22:06:38 WARN Instrumentation: [a54471fa] regParam is zero, which might cause numerical instability and overfitting.
22/04/23 22:06:42 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/04/23 22:06:42 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/04/23 22:06:43 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [88]:
vtest_df = fitted_pipeline.transform(test_df)

In [91]:
vtest_df.select("features").show()

+--------------------+
|            features|
+--------------------+
|(144,[32,126,139,...|
|(144,[32,139,140,...|
|(144,[32,137,139,...|
|(144,[32,135,139,...|
|(144,[32,130,139,...|
|(144,[32,128,139,...|
|(144,[32,131,139,...|
|(144,[32,133,139,...|
|(144,[32,119,139,...|
|(144,[32,124,139,...|
|(144,[32,129,139,...|
|(144,[32,132,139,...|
|(144,[32,138,139,...|
|(144,[32,119,139,...|
|(144,[32,129,139,...|
|(144,[32,128,139,...|
|(144,[32,137,139,...|
|(144,[32,135,139,...|
|(144,[32,119,139,...|
|(144,[32,118,139,...|
+--------------------+
only showing top 20 rows



In [92]:
predictions = model.transform(vtest_df)

In [93]:
predictions.select("u_fine_dust", "prediction").show()

+-----------+------------------+
|u_fine_dust|        prediction|
+-----------+------------------+
|         12| 22.57207358401653|
|         26|22.598589344647145|
|         22| 22.41310062049842|
|         25| 29.02886975695082|
|         28|  30.6842900253734|
|         29| 38.30589276729137|
|         29| 32.52974150919139|
|         29| 32.65598345328481|
|         37|38.796438261911504|
|         40|39.414462035538584|
|         42| 47.84527424860319|
|         43| 48.98527168275435|
|         36| 37.41092169787684|
|         26|23.935151523061236|
|         30| 42.13738163494986|
|         31|41.022856143021414|
|         26|30.147515618883567|
|         27|29.066016600966577|
|         24| 24.84147252203551|
|         29| 35.90992655209013|
+-----------+------------------+
only showing top 20 rows



[Stage 147:>                                                        (0 + 1) / 1]                                                                                

In [94]:
model.summary.rootMeanSquaredError

7.6721414972447555

In [95]:
model.summary.r2

0.7548288476458045

## 요약

다양한 환경 변수들과 시간, 지역 코드를 이용해 초 미세먼지가 어느정도 나오는지 예측하는 모델을 만들었다.

R2값은 0.75로 괜찮은 정도라고 생각되고, RMSE는 다소 높은데 아마 carbon_monoxide(일산화탄소)가 높게 튀는 부분들이 있어서 일 것 같다.

24시의 값이 null 값으로 된 점이 조금 아쉽다.

hyper 파라미터값을 측정해보고 airflow로 모델을 저장하는 것 까지 프로젝트를 마무리 할 예정
