# **공공데이터를 활용한 미세먼지 농도 예측**
---
## Step 3. 머신러닝 모델링

## Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from statsmodels.graphics.mosaicplot import mosaic
from scipy import stats as spst
import statsmodels.api as sm
import joblib

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Choose a Korean-capable font for the current OS instead of hand-editing
# this cell: 'Malgun Gothic' on Windows, 'AppleGothic' everywhere else
# (the notebook's original macOS default is kept for non-Windows systems).
korean_font = 'Malgun Gothic' if os.name == 'nt' else 'AppleGothic'

plt.rc('font', family=korean_font)
sns.set(font=korean_font,
        rc={"axes.unicode_minus": False},  # stop the minus sign rendering as a broken glyph
        style='darkgrid')

## Data Load

In [3]:
# Load the four train/test feature and target tables from the working directory.
train_x, train_y, test_x, test_y = [
    pd.read_csv(csv_path)
    for csv_path in ('./train_x.csv', './train_y.csv', './test_x.csv', './test_y.csv')
]

### Data Scale

In [4]:
# Fit each scaler on the training features only, then apply the fitted
# scaler to both splits so the test data never influences the statistics.
mm_scaler = MinMaxScaler().fit(train_x)
s_scaler = StandardScaler().fit(train_x)

train_x_mm, test_x_mm = mm_scaler.transform(train_x), mm_scaler.transform(test_x)
train_x_s, test_x_s = s_scaler.transform(train_x), s_scaler.transform(test_x)

### PCA

In [5]:
# Number of principal components to keep; the output column names are
# derived from it so the two can no longer drift apart.
n_pca_components = 5
pca_columns = [f'pca_{i}' for i in range(1, n_pca_components + 1)]

pca = PCA(n_components=n_pca_components)

# Fit the projection on the training features only and reuse it for the test split.
# NOTE(review): PCA is fitted on train_x as loaded, not on the scaled copies —
# confirm this is intended, since PCA is driven by feature variance.
train_x_pca = pd.DataFrame(data=pca.fit_transform(train_x), columns=pca_columns)
test_x_pca = pd.DataFrame(data=pca.transform(test_x), columns=pca_columns)

## Modeling

* Linear Regression

In [6]:
def lr_trainer(X_train, y_train, X_test, y_test):
    """Fit a LinearRegression model and score it on the test split.

    Returns:
        Tuple ``(model, [mse, rmse, mae, r2])`` where the metrics are
        computed from the model's predictions on ``X_test`` against ``y_test``.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)

    squared_error = mean_squared_error(y_test, predictions)
    scores = [
        squared_error,
        np.sqrt(squared_error),  # RMSE is derived from the MSE above
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    ]
    return regressor, scores

In [7]:
# Train one linear model per feature representation:
# raw, min-max scaled, standardised, and PCA-projected.
org_lr_model, org_result = lr_trainer(
    X_train=train_x, y_train=train_y, X_test=test_x, y_test=test_y)
mm_lr_model, mm_result = lr_trainer(
    X_train=train_x_mm, y_train=train_y, X_test=test_x_mm, y_test=test_y)
s_lr_model, s_result = lr_trainer(
    X_train=train_x_s, y_train=train_y, X_test=test_x_s, y_test=test_y)
pca_lr_model, pca_result = lr_trainer(
    X_train=train_x_pca, y_train=train_y, X_test=test_x_pca, y_test=test_y)

# One metrics row per representation, in the order of the header.
print("MSE | RMSE | MAE | R2")
for metrics_row in (org_result, mm_result, s_result, pca_result):
    print(metrics_row)

MSE | RMSE | MAE | R2
[102.31460068904832, 10.115068002195947, 5.994888610543269, 0.8973186151384074]
[102.31460068906821, 10.115068002196931, 5.994888610543184, 0.8973186151383874]
[102.31460068906831, 10.115068002196937, 5.99488861054319, 0.8973186151383873]
[218.9634034279007, 14.797412051703525, 9.251165867361273, 0.780251642027949]


In [None]:
# joblib.dump(org_lr_model, 'model_lr_org.pkl')
# joblib.dump(mm_lr_model, 'model_lr_mm.pkl')
# joblib.dump(s_lr_model, 'model_lr_s.pkl')

* Random Forest

In [None]:
def rf_trainer(X_train, y_train, X_test, y_test, *, max_depth=7, random_state=None):
    """Fit a random-forest regressor and score it on the test split.

    Args:
        X_train: Training features.
        y_train: Training target. Must expose ``.values`` (e.g. a pandas
            DataFrame or Series); it is flattened to 1-D before fitting.
        X_test: Test features.
        y_test: Test target.
        max_depth: Maximum tree depth passed to the forest. Defaults to 7,
            the value that used to be hard-coded.
        random_state: Seed forwarded to the forest. ``None`` (default)
            keeps the previous unseeded behaviour; pass an int to get the
            same model and scores on every run.

    Returns:
        Tuple ``(rf_model, [mse, rmse, mae, r2])``.
    """
    rf_model = RFR(max_depth=max_depth, random_state=random_state)
    # Flatten the single-column target to the 1-D shape the forest expects.
    rf_model.fit(X_train, y_train.values.ravel())
    y_pred = rf_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    result = [mse, rmse, mae, r2]
    return rf_model, result

In [None]:
# Train one random forest per feature representation:
# raw, min-max scaled, and standardised.
org_rf_model, org_result = rf_trainer(
    X_train=train_x, y_train=train_y, X_test=test_x, y_test=test_y)
mm_rf_model, mm_result = rf_trainer(
    X_train=train_x_mm, y_train=train_y, X_test=test_x_mm, y_test=test_y)
s_rf_model, s_result = rf_trainer(
    X_train=train_x_s, y_train=train_y, X_test=test_x_s, y_test=test_y)

# One metrics row per representation, in the order of the header.
print("MSE | RMSE | MAE | R2")
for metrics_row in (org_result, mm_result, s_result):
    print(metrics_row)

In [None]:
# joblib.dump(org_rf_model, 'model_rf_org.pkl')
# joblib.dump(mm_rf_model, 'model_rf_mm.pkl')
# joblib.dump(s_rf_model, 'model_rf_s.pkl')

* Gradient Boosting

In [None]:
def gbr_trainer(X_train, y_train, X_test, y_test):
    """Fit a default GradientBoostingRegressor and score it on the test split.

    ``y_train`` must expose ``.values``; it is flattened to 1-D before fitting.

    Returns:
        Tuple ``(model, [mse, rmse, mae, r2])``.
    """
    target_1d = y_train.values.ravel()

    booster = GBR()
    booster.fit(X_train, target_1d)
    predictions = booster.predict(X_test)

    squared_error = mean_squared_error(y_test, predictions)
    scores = [
        squared_error,
        np.sqrt(squared_error),  # RMSE is derived from the MSE above
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    ]
    return booster, scores

In [None]:
# Train one gradient-boosting model per feature representation:
# raw, min-max scaled, and standardised.
org_gbr_model, org_result = gbr_trainer(
    X_train=train_x, y_train=train_y, X_test=test_x, y_test=test_y)
mm_gbr_model, mm_result = gbr_trainer(
    X_train=train_x_mm, y_train=train_y, X_test=test_x_mm, y_test=test_y)
s_gbr_model, s_result = gbr_trainer(
    X_train=train_x_s, y_train=train_y, X_test=test_x_s, y_test=test_y)

# One metrics row per representation, in the order of the header.
print("MSE | RMSE | MAE | R2")
for metrics_row in (org_result, mm_result, s_result):
    print(metrics_row)

In [None]:
# joblib.dump(org_gbr_model, 'model_gbr_org.pkl')
# joblib.dump(mm_gbr_model, 'model_gbr_mm.pkl')
# joblib.dump(s_gbr_model, 'model_gbr_s.pkl')

In [None]:
# Horizontal bar chart of the feature importances from the gradient-boosting
# model fitted on the unscaled features, labelled with train_x's column names.
feature_names = list(train_x)
importances = org_gbr_model.feature_importances_

plt.figure(figsize=(12, 8))
plt.barh(y=feature_names, width=importances)
plt.show()

* XGBoost

In [None]:
def xgb_trainer(X_train, y_train, X_test, y_test):
    """Fit an XGBRegressor with a squared-error objective and score it on the test split.

    Returns:
        Tuple ``(model, [mse, rmse, mae, r2])``.
    """
    booster = XGBRegressor(objective='reg:squarederror')
    booster.fit(X_train, y_train)
    predictions = booster.predict(X_test)

    squared_error = mean_squared_error(y_test, predictions)
    scores = [
        squared_error,
        np.sqrt(squared_error),  # RMSE is derived from the MSE above
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    ]
    return booster, scores

In [None]:
# Train one XGBoost model per feature representation:
# raw, min-max scaled, and standardised.
org_xgb_model, org_result = xgb_trainer(
    X_train=train_x, y_train=train_y, X_test=test_x, y_test=test_y)
mm_xgb_model, mm_result = xgb_trainer(
    X_train=train_x_mm, y_train=train_y, X_test=test_x_mm, y_test=test_y)
s_xgb_model, s_result = xgb_trainer(
    X_train=train_x_s, y_train=train_y, X_test=test_x_s, y_test=test_y)

# One metrics row per representation, in the order of the header.
print("MSE | RMSE | MAE | R2")
for metrics_row in (org_result, mm_result, s_result):
    print(metrics_row)

In [None]:
def lgbm_trainer(X_train, y_train, X_test, y_test):
    """Fit a default LGBMRegressor and score it on the test split.

    ``y_train`` must expose ``.values``; it is flattened to 1-D before fitting.

    Returns:
        Tuple ``(model, [mse, rmse, mae, r2])``.
    """
    target_1d = y_train.values.ravel()

    booster = LGBMRegressor()
    booster.fit(X_train, target_1d)
    predictions = booster.predict(X_test)

    squared_error = mean_squared_error(y_test, predictions)
    scores = [
        squared_error,
        np.sqrt(squared_error),  # RMSE is derived from the MSE above
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    ]
    return booster, scores

In [None]:
# Train one LightGBM model per feature representation:
# raw, min-max scaled, and standardised.
org_lgbm_model, org_result = lgbm_trainer(
    X_train=train_x, y_train=train_y, X_test=test_x, y_test=test_y)
mm_lgbm_model, mm_result = lgbm_trainer(
    X_train=train_x_mm, y_train=train_y, X_test=test_x_mm, y_test=test_y)
s_lgbm_model, s_result = lgbm_trainer(
    X_train=train_x_s, y_train=train_y, X_test=test_x_s, y_test=test_y)

# One metrics row per representation, in the order of the header.
print("MSE | RMSE | MAE | R2")
for metrics_row in (org_result, mm_result, s_result):
    print(metrics_row)