In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

import os
import chardet

In [None]:
pd.set_option('display.float_format', '{:.4f}'.format) # 소수 4째 자리까지

# **1. 데이터 병합**

In [None]:
### 파일 병합을 위한 함수

def file_concat(path, file_list):
    """Read every CSV named in ``file_list`` under ``path`` and stack them vertically.

    Entries whose name does not end in ``.csv`` are skipped. The encoding of
    each CSV is guessed with ``chardet`` from the file's raw bytes and that
    guess is passed to ``pd.read_csv``.

    Args:
        path: Directory that contains the files.
        file_list: File names relative to ``path`` (e.g. from ``os.listdir``).

    Returns:
        One DataFrame holding the rows of all CSV files, in ``file_list``
        order, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``file_list`` contains no ``.csv`` entry.
    """
    csv_names = [name for name in file_list if name.endswith(".csv")]

    # Fail early with an actionable message instead of pandas' generic
    # "No objects to concatenate" (the exception type is the same).
    if not csv_names:
        raise ValueError(f"No .csv files to concatenate in {path!r}")

    frames = []
    for name in csv_names:
        file_path = os.path.join(path, name)

        # Sniff the encoding from the raw bytes; the source files are not
        # guaranteed to share a single encoding.
        with open(file_path, 'rb') as f:
            raw_bytes = f.read()
        enc = chardet.detect(raw_bytes)['encoding']

        frames.append(pd.read_csv(file_path, encoding=enc))

    # ignore_index=True rebuilds the index so rows from different files do
    # not carry duplicate labels.
    return pd.concat(frames, axis=0, ignore_index=True)

## **1-1. 수문 자료**
- 각 댐의 운영 정보를 포함하고 있는 데이터    
- 활용 데이터 목록
  - [다목적댐 운영 정보(일자료)](https://www.bigdata-environment.kr/user/data_market/detail.do?id=b77fd770-38bb-11ea-be28-4fa0eb812a46)
  - [MyWater 물정보포털수문자료](https://www.water.or.kr/kor/realtime/sumun/index.do?mode=sumun&menuId=13_91_93_94)

### **📌 Data Description**
1. dam_nm: 댐이름
2. obsr_de: 일자
3. rain_qy: 강우량
4. inflow_qy: 유입량
5. tot_dcwtr_qy: 방류량
6. rsvwt_qy: (현재) 저수량
7. dam_rsvwt_rt: 저수율
  - 총저수량에 대한 현재 저수량의 비율

### **a) 데이터 불러오기**

In [None]:
water_path = '/content/drive/MyDrive/2023년 K-water 대국민 물 빅데이터 분석과제 공모전/final/원본 데이터/다목적댐운영정보' # path 변수에 경로 지정

In [None]:
water_list = os.listdir(water_path)
print(water_list)
print(len(water_list))

['202109.csv', '202110.csv', '202111.csv', '202112.csv', '202201.csv', '202204.csv', '202205.csv', '202206.csv', '202207.csv', '202209.csv', '202202.csv', '202203.csv', '202208.csv', '202211.csv', '202212.csv', '202210.csv', '202010.csv', '202009.csv', '201911.csv', '202007.csv', '202011.csv', '202012.csv', '202101.csv', '202102.csv', '202103.csv', '202104.csv', '202105.csv', '202106.csv', '202107.csv', '202108.csv', '201901.csv', '201902.csv', '201903.csv', '201904.csv', '201905.csv', '201906.csv', '201907.csv', '201908.csv', '201909.csv', '201910.csv', '201912.csv', '202001.csv', '202002.csv', '202003.csv', '202004.csv', '202005.csv', '202006.csv', '202008.csv']
48


- 4개년치에 해당하는 48개 달에 대한 데이터가 정상적으로 load됨

In [None]:
df = file_concat(water_path, water_list)

In [None]:
df

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt
0,군위,20210901,61.1076,33.5930,27.7090,38.4440,78.9000
1,군위,20210902,0.2366,16.1420,49.0420,35.6010,73.1000
2,군위,20210903,0.0000,2.4020,30.3300,33.1880,68.2000
3,군위,20210904,0.0000,2.6560,1.4070,33.2960,68.4000
4,군위,20210905,0.0000,1.4920,1.4920,33.2960,68.4000
...,...,...,...,...,...,...,...
30676,횡성,20200827,7.2651,6.4480,15.7910,64.6700,74.4000
30677,횡성,20200828,59.3119,8.4850,4.4070,65.0220,74.8000
30678,횡성,20200829,24.2231,44.2680,17.7100,67.3170,77.5000
30679,횡성,20200830,39.7127,45.8840,29.0690,68.7700,79.1000


- 365(일) * 4(년) * 21(댐 개수) = 30660
- 2020년의 경우 2월 29일까지 존재 -> 21개 추가

In [None]:
### 데이터 정보 확인

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30681 entries, 0 to 30680
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   dam_nm        30681 non-null  object 
 1   obsr_de       30681 non-null  int64  
 2   rain_qy       30681 non-null  object 
 3   inflow_qy     30681 non-null  float64
 4   tot_dcwtr_qy  30681 non-null  float64
 5   rsvwt_qy      30681 non-null  float64
 6   dam_rsvwt_rt  30681 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 1.6+ MB


- obsr_de(관측일)과 rain_qy(강우량)의 데이터형(dtype)이 잘못 설정되어 있는 것 같음
  - obsr_de: int64 -> datetime
  - rain_qy: object -> float64
- 저수율의 표시 형식을 소수로 변경하기
  - 60(%) -> 0.6

In [None]:
# Convert dtypes; values that cannot be parsed are coerced to NaT / NaN
# instead of raising, and are dealt with in the missing-value step.

# Vectorised conversions (no per-element apply). The observation date is
# stored as an integer such as 20210901, so it is rendered as text before
# being parsed with the %Y%m%d format.
df['obsr_de'] = pd.to_datetime(df['obsr_de'].astype(str), format='%Y%m%d', errors='coerce')
df['rain_qy'] = pd.to_numeric(df['rain_qy'], errors='coerce')

# Express the storage rate as a fraction (60 % -> 0.6).
df['dam_rsvwt_rt'] = df['dam_rsvwt_rt'] * 0.01

In [None]:
df.head()

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt
0,군위,2021-09-01,61.1076,33.593,27.709,38.444,0.789
1,군위,2021-09-02,0.2366,16.142,49.042,35.601,0.731
2,군위,2021-09-03,0.0,2.402,30.33,33.188,0.682
3,군위,2021-09-04,0.0,2.656,1.407,33.296,0.684
4,군위,2021-09-05,0.0,1.492,1.492,33.296,0.684


In [None]:
df.describe()

Unnamed: 0,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt
count,30680.0,30681.0,30681.0,30681.0,30681.0
mean,3.5431,24.2884,25.1708,362.7624,0.573
std,12.5175,130.0254,105.7154,493.719,0.1677
min,0.0,0.0,0.0,0.003,0.0
25%,0.0,0.469,1.596,39.345,0.464
50%,0.0,2.877,6.812,135.101,0.584
75%,0.4914,12.856,18.683,466.956,0.692
max,334.6305,5457.957,5027.616,2416.721,0.989


In [None]:
# 저수율이 0인 데이터 파악하기

df.loc[df['dam_rsvwt_rt'] == 0,:]

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt
21433,영주,2019-04-11,0.0963,1.352,1.352,0.061,0.0
21434,영주,2019-04-12,0.0,1.344,1.344,0.061,0.0
21435,영주,2019-04-13,0.0,1.3,1.3,0.061,0.0
21436,영주,2019-04-14,0.0,1.3,1.3,0.061,0.0
22078,영주,2019-05-14,0.0,0.02,0.02,0.003,0.0
22079,영주,2019-05-15,0.0,0.02,0.02,0.003,0.0
22080,영주,2019-05-16,0.0,0.02,0.02,0.003,0.0
22081,영주,2019-05-17,0.0,0.057,0.025,0.006,0.0
22082,영주,2019-05-18,9.5778,0.04,0.04,0.006,0.0
22083,영주,2019-05-19,4.5433,0.04,0.04,0.006,0.0


- 저수량(rsvwt_qy)이 너무 작아 저수율이 0에 가깝게 나왔음을 짐작할 수 있다.
- 다른 변수들은 모두 데이터 상의 문제는 없어 보인다.

In [None]:
df['dam_nm'].unique()

array(['군위', '김천부항', '남강', '대청', '밀양', '보령', '보현산', '부안', '섬진강', '성덕',
       '소양강', '안동', '영주', '용담', '임하', '장흥', '주암(본)', '주암(조)', '충주', '합천',
       '횡성'], dtype=object)

- 21개 댐에 대한 정보를 포함하고 있음

### **b) 결측치 처리**

In [None]:
# 현재 데이터 내의 결측치는 공백으로 처리되어 있음
# 데이터프레임 내의 공백을 NaN으로 변경

df.replace('', np.nan, inplace = True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30681 entries, 0 to 30680
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   dam_nm        30681 non-null  object        
 1   obsr_de       30681 non-null  datetime64[ns]
 2   rain_qy       30680 non-null  float64       
 3   inflow_qy     30681 non-null  float64       
 4   tot_dcwtr_qy  30681 non-null  float64       
 5   rsvwt_qy      30681 non-null  float64       
 6   dam_rsvwt_rt  30681 non-null  float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 1.6+ MB


- rain_qy(강수량)에 결측치가 존재

In [None]:
df.loc[df['rain_qy'].isna(),:]

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt
19852,김천부항,2019-02-01,,0.0267,1.6169,37.143,0.684


In [None]:
# 해당 댐의 해당 연도 + 달의 최빈값으로 대체

df['rain_qy'] = df.groupby(['dam_nm', df['obsr_de'].dt.year ,df['obsr_de'].dt.month])['rain_qy'].transform(lambda x: x.fillna(x.mode().iloc[0]))

In [None]:
df.loc[df['rain_qy'].isna(),:]

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt


- 결측치가 제대로 채워짐

In [None]:
df.head()

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt
0,군위,2021-09-01,61.1076,33.593,27.709,38.444,0.789
1,군위,2021-09-02,0.2366,16.142,49.042,35.601,0.731
2,군위,2021-09-03,0.0,2.402,30.33,33.188,0.682
3,군위,2021-09-04,0.0,2.656,1.407,33.296,0.684
4,군위,2021-09-05,0.0,1.492,1.492,33.296,0.684


## **1-2. 제원정보**
- 21개의 댐에 대한 제원 정보를 포함하는 데이터
- 해당 데이터에서 필요한 값들을 선택적으로 수집 후 데이터 가공
  - 댐이름, 총저수량, 유효저수량, 홍수조절용량, 비활용용량
- 활용 데이터 목록
  - [댐 관리현황(2022)](https://www.kwater.or.kr/gov3/sub03/annoView.do?seq=4240&s_mid=1664&x=0&y=0)


### **📌Data Description**
1. dam_nm: 댐이름
2. tot_qy: 총저수량
3. valid_qy: 유효저수용량
4. flood_qy: 홍수조절용량
5. unused_qy: 비활용용량

In [None]:
# 데이터 불러오기

df_capa = pd.read_csv('/content/drive/MyDrive/2023년 K-water 대국민 물 빅데이터 분석과제 공모전/final/원본 데이터/다목적댐 제원 정보.csv', sep = ',')
df_capa

Unnamed: 0,dam_nm,tot_qy,valid_qy,flood_qy,unused_qy
0,군위,48.7,40.1,3.1,5.5
1,김천부항,54.3,42.6,12.3,3.7
2,남강,309.2,299.7,269.8,9.5
3,대청,1490.0,790.0,250.0,450.0
4,밀양,73.6,69.8,6.0,3.8
5,보령,116.9,108.7,10.0,8.2
6,보현산,22.11,17.88,3.49,0.8
7,부안,50.3,35.6,9.3,5.9
8,섬진강,466.0,429.0,30.3,6.7
9,성덕,27.9,24.8,4.2,2.2


In [None]:
df_capa['dam_nm'].unique()

array(['군위', '김천부항', '남강', '대청', '밀양', '보령', '보현산', '부안', '섬진강', '성덕',
       '소양강', '안동', '영주', '용담', '임하', '장흥', '주암(본)', '주암(조)', '충주', '합천',
       '횡성'], dtype=object)

- 정상적으로 **21개**의 댐의 제원 정보가 포함되어 있음

In [None]:
# 두 데이터가 가진 댐이 서로 동일한지 확인

sum(~df['dam_nm'].isin(df_capa['dam_nm']))

0

- 두 개의 데이터에서 동일한 댐에 대한 정보들을 가지고 있음을 확인할 수 있다.
  - 댐 이름에서 차이가 x

In [None]:
# 기존 df 와 df_capa 병합하기(merge)

df = pd.merge(left = df, right = df_capa,
              how = "inner", on = "dam_nm") # dam_nm 기준으로 결합

In [None]:
df

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt,tot_qy,valid_qy,flood_qy,unused_qy
0,군위,2021-09-01,61.1076,33.5930,27.7090,38.4440,0.7890,48.7000,40.1000,3.1000,5.5000
1,군위,2021-09-02,0.2366,16.1420,49.0420,35.6010,0.7310,48.7000,40.1000,3.1000,5.5000
2,군위,2021-09-03,0.0000,2.4020,30.3300,33.1880,0.6820,48.7000,40.1000,3.1000,5.5000
3,군위,2021-09-04,0.0000,2.6560,1.4070,33.2960,0.6840,48.7000,40.1000,3.1000,5.5000
4,군위,2021-09-05,0.0000,1.4920,1.4920,33.2960,0.6840,48.7000,40.1000,3.1000,5.5000
...,...,...,...,...,...,...,...,...,...,...,...
30676,횡성,2020-08-27,7.2651,6.4480,15.7910,64.6700,0.7440,86.9000,73.4000,9.5000,13.5000
30677,횡성,2020-08-28,59.3119,8.4850,4.4070,65.0220,0.7480,86.9000,73.4000,9.5000,13.5000
30678,횡성,2020-08-29,24.2231,44.2680,17.7100,67.3170,0.7750,86.9000,73.4000,9.5000,13.5000
30679,횡성,2020-08-30,39.7127,45.8840,29.0690,68.7700,0.7910,86.9000,73.4000,9.5000,13.5000


- 정확히 결합된 것을 확인할 수 있음

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30681 entries, 0 to 30680
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   dam_nm        30681 non-null  object        
 1   obsr_de       30681 non-null  datetime64[ns]
 2   rain_qy       30681 non-null  float64       
 3   inflow_qy     30681 non-null  float64       
 4   tot_dcwtr_qy  30681 non-null  float64       
 5   rsvwt_qy      30681 non-null  float64       
 6   dam_rsvwt_rt  30681 non-null  float64       
 7   tot_qy        30681 non-null  float64       
 8   valid_qy      30681 non-null  float64       
 9   flood_qy      30681 non-null  float64       
 10  unused_qy     30681 non-null  float64       
dtypes: datetime64[ns](1), float64(9), object(1)
memory usage: 2.8+ MB


- 결측치는 존재하지 않는다.

In [None]:
df.describe()

Unnamed: 0,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt,tot_qy,valid_qy,flood_qy,unused_qy
count,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0
mean,3.543,24.2884,25.1708,362.7624,0.573,615.3814,436.6657,109.2376,115.8857
std,12.5173,130.0254,105.7154,493.719,0.1677,820.2634,530.0535,164.6,195.8659
min,0.0,0.0,0.0,0.003,0.0,22.11,17.88,3.1,0.8
25%,0.0,0.469,1.596,39.345,0.464,73.6,69.8,9.3,5.9
50%,0.0,2.877,6.812,135.101,0.584,250.0,210.0,30.3,12.0
75%,0.4914,12.856,18.683,466.956,0.692,790.0,560.0,110.0,124.0
max,334.6305,5457.957,5027.616,2416.721,0.989,2900.0,1900.0,616.0,650.0


- 잘못된 데이터는 없음을 짐작할 수 있다.

### **(최대) 이수용량 계산하기**
- 비홍수기(대부분의 날짜)
  - 유효저수량과 동일
- 홍수기(6/21 ~ 9/20)
  - 활용용량 - 홍수조절용량 = (총저수용량 - 비활용용량) - 홍수조절용량

In [None]:
from datetime import datetime

# 홍수기와 비홍수기를 구분하여 maximum_use_qy를 계산
def calculate_maximum_use_qy(row, flood_start=(6, 21), flood_end=(9, 20)):
    """Return the maximum usable storage for one dam/day row.

    During the flood season the usable storage is
    ``tot_qy - unused_qy - flood_qy``; on every other day it equals
    ``valid_qy``.

    The season is decided from the month and day only, so it applies to every
    year in the data (the previous version compared against fixed 2021 dates
    and therefore never matched rows from other years).

    Args:
        row: Mapping/Series with ``obsr_de`` (a date-like object exposing
            ``month`` and ``day``), ``tot_qy``, ``unused_qy``, ``flood_qy``
            and ``valid_qy``.
        flood_start: Inclusive ``(month, day)`` on which the flood season begins.
        flood_end: Inclusive ``(month, day)`` on which the flood season ends.

    Returns:
        The maximum usable storage for the row.
    """
    observed = row['obsr_de']

    # Tuples compare lexicographically, which gives a year-independent
    # "between 21 June and 20 September" test.
    if flood_start <= (observed.month, observed.day) <= flood_end:
        return row['tot_qy'] - row['unused_qy'] - row['flood_qy']
    return row['valid_qy']

In [None]:
df['maximum_use_qy'] = df.apply(calculate_maximum_use_qy, axis=1)

In [None]:
df.head(3)

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt,tot_qy,valid_qy,flood_qy,unused_qy,maximum_use_qy
0,군위,2021-09-01,61.1076,33.593,27.709,38.444,0.789,48.7,40.1,3.1,5.5,40.1
1,군위,2021-09-02,0.2366,16.142,49.042,35.601,0.731,48.7,40.1,3.1,5.5,40.1
2,군위,2021-09-03,0.0,2.402,30.33,33.188,0.682,48.7,40.1,3.1,5.5,40.1


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30681 entries, 0 to 30680
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   dam_nm          30681 non-null  object        
 1   obsr_de         30681 non-null  datetime64[ns]
 2   rain_qy         30681 non-null  float64       
 3   inflow_qy       30681 non-null  float64       
 4   tot_dcwtr_qy    30681 non-null  float64       
 5   rsvwt_qy        30681 non-null  float64       
 6   dam_rsvwt_rt    30681 non-null  float64       
 7   tot_qy          30681 non-null  float64       
 8   valid_qy        30681 non-null  float64       
 9   flood_qy        30681 non-null  float64       
 10  unused_qy       30681 non-null  float64       
 11  maximum_use_qy  30681 non-null  float64       
dtypes: datetime64[ns](1), float64(10), object(1)
memory usage: 3.0+ MB


In [None]:
df.describe()

Unnamed: 0,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt,tot_qy,valid_qy,flood_qy,unused_qy,maximum_use_qy
count,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0
mean,3.543,24.2884,25.1708,362.7624,0.573,615.3814,436.6657,109.2376,115.8857,433.7434
std,12.5173,130.0254,105.7154,493.719,0.1677,820.2634,530.0535,164.6,195.8659,527.3284
min,0.0,0.0,0.0,0.003,0.0,22.11,17.88,3.1,0.8,17.82
25%,0.0,0.469,1.596,39.345,0.464,73.6,69.8,9.3,5.9,69.8
50%,0.0,2.877,6.812,135.101,0.584,250.0,210.0,30.3,12.0,210.0
75%,0.4914,12.856,18.683,466.956,0.692,790.0,560.0,110.0,124.0,560.0
max,334.6305,5457.957,5027.616,2416.721,0.989,2900.0,1900.0,616.0,650.0,1900.0


In [None]:
# 필요없는 변수 삭제

df.drop(['flood_qy'], axis = 1, inplace = True)

## **1-3. 기상자료**
- 각 댐이 위치하는 지역의 기상정보에 대한 데이터
  - 습도, 기온, 풍속, 일사량
- 여러 데이터에서 필요한 정보들을 수집 후 최종 데이터 형태로 가공함
- 활용 데이터 목록
  - [환경 빅데이터 플랫폼_관측소별기상관측정보](https://www.bigdata-environment.kr/user/data_market/detail.do?id=b1de30f0-38bb-11ea-be28-4fa0eb812a46)
  - [국가수자원관리종합정보시스템_실시간 기상자료](http://www.wamis.go.kr/wkw/we_dwtwtobs.do)
  - [농촌진흥청_기상통계](https://fruit.nihhs.go.kr/main/aws/awsStat.do)
  - [국립농업과학원_농업기상정보_주산지기상분석](http://weather.rda.go.kr/w/farmProduce/frcPlpdAvg.do)
  - [농업관측통계시스템](https://oasis.krei.re.kr/basicInfo/weather/observe.do)

### **📌 Data Description**
1. dam_nm: 댐이름
2. anc_dt: 관측일자
3. avg_hmd: 평균습도
4. avg_tmr: 평균기온
5. avg_wv: 평균풍속
6. snsn_sum: 일사량

### **a) 데이터 불러오기**

In [None]:
weather_path = '/content/drive/MyDrive/2023년 K-water 대국민 물 빅데이터 분석과제 공모전/final/원본 데이터/관측소별기상관측정보' # path 변수에 경로 지정

In [None]:
weather_list = os.listdir(weather_path)
print(weather_list)
print(len(weather_list))

['일사량 결측치 핸들링.pdf', '202002.csv', '202005.csv', '202006.csv', '202008.csv', '202009.csv', '202011.csv', '202101.csv', '202104.csv', '202105.csv', '202106.csv', '202107.csv', '202112.csv', '202202.csv', '202203.csv', '202204.csv', '202205.csv', '202206.csv', '202108.csv', '202109.csv', '201907_201912.csv', '202110.csv', '202111.csv', '202208.csv', '202211.csv', '202212.csv', '202102.csv', '202007.csv', '202003.csv', '202001.csv', '202004.csv', '202210.csv', '202209.csv', '202207.csv', '202201.csv', '202103.csv', '201901_201906.csv', '202010.csv', '202012.csv']
39


In [None]:
df2 = file_concat(weather_path, weather_list)

In [None]:
df2

Unnamed: 0,dam_nm,anc_dt,avg_hmd,avg_tmr,avg_wv,snsn_sum
0,군위,20200201,67.6000,2.9000,2.0000,7.1000
1,군위,20200202,61.5000,3.6000,1.2000,8.3000
2,군위,20200203,45.6000,3.1000,2.9000,7.2000
3,군위,20200204,42.5000,1.9000,1.8000,9.8000
4,군위,20200205,30.8000,-1.5000,4.5000,10.1000
...,...,...,...,...,...,...
30676,횡성,20201227,74.9000,1.4000,0.5000,5.8200
30677,횡성,20201228,77.8000,3.2000,0.5000,6.8000
30678,횡성,20201229,84.6000,-0.6000,0.9000,3.8300
30679,횡성,20201230,46.4000,-9.2000,2.7000,7.1500


In [None]:
### 데이터 정보 확인

df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30681 entries, 0 to 30680
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   dam_nm    30681 non-null  object 
 1   anc_dt    30681 non-null  int64  
 2   avg_hmd   30615 non-null  float64
 3   avg_tmr   30671 non-null  float64
 4   avg_wv    30672 non-null  float64
 5   snsn_sum  30651 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 1.4+ MB


- 변수명 통일
  - 관측일자의 경우 다른 데이터와의 병합을 위해 ```anc_dt```에서 ```obsr_de```로 변경
- 데이터형 변경
  - obsr_de: int64 -> datetime
- 결측치 처리 필요

In [None]:
# 변수명 변경

df2.rename(columns={'anc_dt':'obsr_de'}, inplace = True)

In [None]:
df2

Unnamed: 0,dam_nm,obsr_de,avg_hmd,avg_tmr,avg_wv,snsn_sum
0,군위,20200201,67.6000,2.9000,2.0000,7.1000
1,군위,20200202,61.5000,3.6000,1.2000,8.3000
2,군위,20200203,45.6000,3.1000,2.9000,7.2000
3,군위,20200204,42.5000,1.9000,1.8000,9.8000
4,군위,20200205,30.8000,-1.5000,4.5000,10.1000
...,...,...,...,...,...,...
30676,횡성,20201227,74.9000,1.4000,0.5000,5.8200
30677,횡성,20201228,77.8000,3.2000,0.5000,6.8000
30678,횡성,20201229,84.6000,-0.6000,0.9000,3.8300
30679,횡성,20201230,46.4000,-9.2000,2.7000,7.1500


In [None]:
# Convert the observation date; values that cannot be parsed are coerced to
# NaT instead of raising.

# Vectorised conversion (no per-element apply). The date is stored as an
# integer such as 20200201, so it is rendered as text before parsing.
df2['obsr_de'] = pd.to_datetime(df2['obsr_de'].astype(str), format='%Y%m%d', errors='coerce')

In [None]:
# 습도의 경우 %를 소수로 변경(60% -> 0.6)

df2['avg_hmd'] = df2['avg_hmd'] * 0.01

In [None]:
df2.head()

Unnamed: 0,dam_nm,obsr_de,avg_hmd,avg_tmr,avg_wv,snsn_sum
0,군위,2020-02-01,0.676,2.9,2.0,7.1
1,군위,2020-02-02,0.615,3.6,1.2,8.3
2,군위,2020-02-03,0.456,3.1,2.9,7.2
3,군위,2020-02-04,0.425,1.9,1.8,9.8
4,군위,2020-02-05,0.308,-1.5,4.5,10.1


### **b) 결측치 처리**

In [None]:
# 현재 데이터 내의 결측치는 공백으로 처리되어 있음
# 데이터프레임 내의 공백을 NaN으로 변경

df2.replace('', np.nan, inplace = True)

In [None]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30681 entries, 0 to 30680
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   dam_nm    30681 non-null  object        
 1   obsr_de   30681 non-null  datetime64[ns]
 2   avg_hmd   30615 non-null  float64       
 3   avg_tmr   30671 non-null  float64       
 4   avg_wv    30672 non-null  float64       
 5   snsn_sum  30651 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 1.4+ MB


- 여러 변수들에 걸쳐 결측치가 있는 것으로 보인다.
- 결측치 처리
  - 습도 -> 최빈값
  - 온도 -> 평균
  - 풍속 -> 봄,가을의 경우 평균으로 / 여름, 겨울의 경우 최빈값으로
  - 일사량 -> 최빈값

In [None]:
df2.describe()

Unnamed: 0,avg_hmd,avg_tmr,avg_wv,snsn_sum
count,30615.0,30671.0,30672.0,30651.0
mean,0.6875,13.0387,1.5672,8.72
std,0.151,9.6457,0.9792,5.7
min,0.008,-16.0,0.02,0.08
25%,0.584,5.1,0.9,5.0
50%,0.706,13.2,1.3,8.2
75%,0.8,21.6,1.9,11.1
max,1.0,31.6,10.0,40.0


- 잘못 기록된 값은 없는 것으로 짐작할 수 있다.

In [None]:
## 습도
len(df2.loc[df2['avg_hmd'].isna(),:])

66

In [None]:
# 해당 댐의 해당 연도 + 달의 습도 최빈값으로 대체

df2['avg_hmd'] = df2.groupby(['dam_nm',df2['obsr_de'].dt.year, df2['obsr_de'].dt.month])['avg_hmd'].transform(lambda x: x.fillna(x.mode().iloc[0]))

In [None]:
df2.loc[df2['avg_hmd'].isna(),:]

Unnamed: 0,dam_nm,obsr_de,avg_hmd,avg_tmr,avg_wv,snsn_sum


In [None]:
## 온도

len(df2.loc[df2['avg_tmr'].isna(),:])

10

In [None]:
# 해당 댐의 해당 연도 + 달의 온도 평균값으로 대체

df2['avg_tmr'] = df2.groupby(['dam_nm',df2['obsr_de'].dt.year, df2['obsr_de'].dt.month])['avg_tmr'].transform(lambda x: x.fillna(x.mean()))

In [None]:
df2.loc[df2['avg_tmr'].isna(),:]

Unnamed: 0,dam_nm,obsr_de,avg_hmd,avg_tmr,avg_wv,snsn_sum


In [None]:
## 풍속

len(df2.loc[df2['avg_wv'].isna(),:])

9

In [None]:
# 월별 계절 정보를 가지고 있는 함수를 정의합니다.
def get_season(month):
    """Map a month number to its season label.

    Months 3-5 are spring ('봄'), 6-8 summer ('여름'), 9-11 autumn ('가을');
    any other value (12, 1, 2 included) is labelled winter ('겨울').
    """
    season_months = (
        ('봄', (3, 4, 5)),
        ('여름', (6, 7, 8)),
        ('가을', (9, 10, 11)),
    )
    for label, months in season_months:
        if month in months:
            return label

    # Everything not matched above falls through to winter.
    return '겨울'

In [None]:
# Add the season label of each observation.
df2['season'] = df2['obsr_de'].dt.month.apply(get_season)

# Fill missing wind speed per dam / year / month: summer and winter gaps get
# the group's mode, spring and autumn gaps get the group's mean.
# Inside transform, x.name is the group key tuple
# (dam_nm, year, month, season), so the season label is its LAST element;
# index 2 is the month number and never equals a season label.
df2['avg_wv'] = (
    df2.groupby(['dam_nm', df2['obsr_de'].dt.year, df2['obsr_de'].dt.month, 'season'])['avg_wv']
    .transform(
        lambda x: x.fillna(x.mode().iloc[0])
        if x.name[-1] in ('여름', '겨울')
        else x.fillna(x.mean())
    )
)

In [None]:
df2.loc[df2['avg_wv'].isna(),:]

Unnamed: 0,dam_nm,obsr_de,avg_hmd,avg_tmr,avg_wv,snsn_sum,season


In [None]:
# 계절 정보는 더이상 필요 없으므로 삭제

df2.drop('season', axis = 1, inplace = True)

In [None]:
## 일사량

len(df2.loc[df2['snsn_sum'].isna(),:])

30

In [None]:
# 해당 댐의 해당 연도 + 달의 일사량 최빈값으로 대체

df2['snsn_sum'] = df2.groupby(['dam_nm',df2['obsr_de'].dt.year, df2['obsr_de'].dt.month])['snsn_sum'].transform(lambda x: x.fillna(x.mode().iloc[0]))

In [None]:
df2.loc[df2['snsn_sum'].isna(),:]

Unnamed: 0,dam_nm,obsr_de,avg_hmd,avg_tmr,avg_wv,snsn_sum


In [None]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30681 entries, 0 to 30680
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   dam_nm    30681 non-null  object        
 1   obsr_de   30681 non-null  datetime64[ns]
 2   avg_hmd   30681 non-null  float64       
 3   avg_tmr   30681 non-null  float64       
 4   avg_wv    30681 non-null  float64       
 5   snsn_sum  30681 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 1.4+ MB


- 결측치가 제대로 채워졌다.

In [None]:
sum(~df['dam_nm'].isin(df2['dam_nm']))

0

- 댐 이름은 모두 동일

In [None]:
sum(~df['obsr_de'].isin(df2['obsr_de']))

0

- 관측일자도 모두 동일

In [None]:
duplicates2 = df2[df2.duplicated(subset = ['dam_nm', 'obsr_de'], keep=False)]
print(duplicates2)

Empty DataFrame
Columns: [dam_nm, obsr_de, avg_hmd, avg_tmr, avg_wv, snsn_sum]
Index: []


- 중복된 데이터는 없음을 확인할 수 있다.

In [None]:
### 데이터 병합

df = pd.merge(left = df, right = df2, how = "inner", on = ['dam_nm','obsr_de'] ) # dam_nm, 관측일 기준으로 조인

In [None]:
df

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt,tot_qy,valid_qy,unused_qy,maximum_use_qy,avg_hmd,avg_tmr,avg_wv,snsn_sum
0,군위,2021-09-01,61.1076,33.5930,27.7090,38.4440,0.7890,48.7000,40.1000,5.5000,40.1000,0.9080,21.9000,2.4000,0.2000
1,군위,2021-09-02,0.2366,16.1420,49.0420,35.6010,0.7310,48.7000,40.1000,5.5000,40.1000,0.7710,21.8000,2.5000,0.9000
2,군위,2021-09-03,0.0000,2.4020,30.3300,33.1880,0.6820,48.7000,40.1000,5.5000,40.1000,0.8650,20.5000,1.4000,0.7000
3,군위,2021-09-04,0.0000,2.6560,1.4070,33.2960,0.6840,48.7000,40.1000,5.5000,40.1000,0.6640,22.4000,2.0000,7.7000
4,군위,2021-09-05,0.0000,1.4920,1.4920,33.2960,0.6840,48.7000,40.1000,5.5000,40.1000,0.6850,21.8000,1.7000,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30676,횡성,2020-08-27,7.2651,6.4480,15.7910,64.6700,0.7440,86.9000,73.4000,13.5000,73.4000,0.8080,27.4000,2.0000,3.6000
30677,횡성,2020-08-28,59.3119,8.4850,4.4070,65.0220,0.7480,86.9000,73.4000,13.5000,73.4000,0.8590,27.0000,1.1000,3.7000
30678,횡성,2020-08-29,24.2231,44.2680,17.7100,67.3170,0.7750,86.9000,73.4000,13.5000,73.4000,0.8840,25.8000,0.6000,1.3000
30679,횡성,2020-08-30,39.7127,45.8840,29.0690,68.7700,0.7910,86.9000,73.4000,13.5000,73.4000,0.9030,25.1000,0.8000,1.7000


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30681 entries, 0 to 30680
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   dam_nm          30681 non-null  object        
 1   obsr_de         30681 non-null  datetime64[ns]
 2   rain_qy         30681 non-null  float64       
 3   inflow_qy       30681 non-null  float64       
 4   tot_dcwtr_qy    30681 non-null  float64       
 5   rsvwt_qy        30681 non-null  float64       
 6   dam_rsvwt_rt    30681 non-null  float64       
 7   tot_qy          30681 non-null  float64       
 8   valid_qy        30681 non-null  float64       
 9   unused_qy       30681 non-null  float64       
 10  maximum_use_qy  30681 non-null  float64       
 11  avg_hmd         30681 non-null  float64       
 12  avg_tmr         30681 non-null  float64       
 13  avg_wv          30681 non-null  float64       
 14  snsn_sum        30681 non-null  float64       
dtypes:

- 결측치 없이 잘 처리되었다.

In [None]:
df.describe()

Unnamed: 0,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt,tot_qy,valid_qy,unused_qy,maximum_use_qy,avg_hmd,avg_tmr,avg_wv,snsn_sum
count,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0,30681.0
mean,3.543,24.2884,25.1708,362.7624,0.573,615.3814,436.6657,115.8857,433.7434,0.6874,13.0393,1.5672,8.7174
std,12.5173,130.0254,105.7154,493.719,0.1677,820.2634,530.0535,195.8659,527.3284,0.1512,9.6452,0.9791,5.6991
min,0.0,0.0,0.0,0.003,0.0,22.11,17.88,0.8,17.82,0.008,-16.0,0.02,0.08
25%,0.0,0.469,1.596,39.345,0.464,73.6,69.8,5.9,69.8,0.584,5.1,0.9,5.0
50%,0.0,2.877,6.812,135.101,0.584,250.0,210.0,12.0,210.0,0.706,13.2,1.3,8.2
75%,0.4914,12.856,18.683,466.956,0.692,790.0,560.0,124.0,560.0,0.8,21.6,1.9,11.1
max,334.6305,5457.957,5027.616,2416.721,0.989,2900.0,1900.0,650.0,1900.0,1.0,31.6,10.0,40.0


In [None]:
df.loc[df['rsvwt_qy'] > df['tot_qy'],:]

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,tot_dcwtr_qy,rsvwt_qy,dam_rsvwt_rt,tot_qy,valid_qy,unused_qy,maximum_use_qy,avg_hmd,avg_tmr,avg_wv,snsn_sum


- 잘못된 값은 없음을 확인할 수 있다.

## **⏺ 변수명 변경**

In [None]:
# Rename by explicit mapping rather than by position, so a change in column
# order can never silently attach a name to the wrong data. Columns that are
# not listed keep their current name.
df.rename(
    columns={
        'tot_dcwtr_qy': 'outflow_qy',
        'rsvwt_qy': 'reserve_qy',
        'dam_rsvwt_rt': 'reserve_rt',
        'snsn_sum': 'sun_sum',
    },
    inplace=True,
)

In [None]:
df.head()

Unnamed: 0,dam_nm,obsr_de,rain_qy,inflow_qy,outflow_qy,reserve_qy,reserve_rt,tot_qy,valid_qy,unused_qy,maximum_use_qy,avg_hmd,avg_tmr,avg_wv,sun_sum
0,군위,2021-09-01,61.1076,33.593,27.709,38.444,0.789,48.7,40.1,5.5,40.1,0.908,21.9,2.4,0.2
1,군위,2021-09-02,0.2366,16.142,49.042,35.601,0.731,48.7,40.1,5.5,40.1,0.771,21.8,2.5,0.9
2,군위,2021-09-03,0.0,2.402,30.33,33.188,0.682,48.7,40.1,5.5,40.1,0.865,20.5,1.4,0.7
3,군위,2021-09-04,0.0,2.656,1.407,33.296,0.684,48.7,40.1,5.5,40.1,0.664,22.4,2.0,7.7
4,군위,2021-09-05,0.0,1.492,1.492,33.296,0.684,48.7,40.1,5.5,40.1,0.685,21.8,1.7,1.0


In [None]:
# 병합된 데이터프레임 저장

df.to_csv("/content/drive/MyDrive/2023년 K-water 대국민 물 빅데이터 분석과제 공모전/final/final.csv", index = False)

# **2. 수질 데이터**
- 각 댐의 수질 측정 결과 데이터
  - 다른 데이터들과 다르게 **월별** 데이터임
- 수질 등급의 기준이 되는 10가지 검사항목을 포함하고 있음
- 여러 데이터에서 필요한 정보를 수집 후 가공
  - 하나의 댐에 대해 측정소가 여러 곳인 경우 **평균값** 채택
- 활용 데이터 목록
  - [물환경정보시스템_수질측정망](https://water.nier.go.kr/web/waterMeasure?pMENU_NO=571)
  - [MyWater_다목적댐 수질정보](https://www.water.or.kr/kor/realtime/sujil/index.do?mode=vers&menuId=13_91_103_104)
  - [통계청_호소수 수질현황](https://kosis.kr/statHtml/statHtml.do?orgId=106&tblId=DT_106N_01_0100070&conn_path=I2)

### **📌 Data Description**
1. dam_nm: 댐이름   
2. year: 측정일자(연도)      
3. month: 측정일자(월)
4. pH: 수소이온농도
5. TOC: 총유기탄소량(mg/L)
6. SS: 부유물질량(mg/L)
7. DO: 용존산소량(mg/L)
8. TP: 총인(mg/L)
9. TN: 총질소(mg/L)        
10. Chl-a: 클로로필-a(mg/m^3)
11. tot_coli: 총대장균군수(군수/100mL)
12. fec_coli: 분원성대장균군수(군수/100mL)

### **a) 데이터 불러오기**

In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/2023년 K-water 대국민 물 빅데이터 분석과제 공모전/final/원본 데이터/수질정보/2019_2020.csv')
df2 = pd.read_csv('/content/drive/MyDrive/2023년 K-water 대국민 물 빅데이터 분석과제 공모전/final/원본 데이터/수질정보/2021_2022.csv')
quality = pd.concat([df1,df2], ignore_index = True)

In [None]:
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   dam_nm    1008 non-null   object 
 1   year      1008 non-null   int64  
 2   month     1008 non-null   int64  
 3   pH        975 non-null    float64
 4   TOC       975 non-null    float64
 5   SS        975 non-null    float64
 6   DO        974 non-null    float64
 7   TP        975 non-null    float64
 8   TN        975 non-null    float64
 9   Chl-a     975 non-null    float64
 10  tot_coli  969 non-null    float64
 11  fec_coli  961 non-null    float64
dtypes: float64(9), int64(2), object(1)
memory usage: 94.6+ KB


### **b) 결측치 처리하기**

In [None]:
# 현재 데이터 내의 결측치는 공백으로 처리되어 있음
# 데이터프레임 내의 공백을 NaN으로 변경

quality.replace('', np.nan, inplace = True)

In [None]:
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   dam_nm    1008 non-null   object 
 1   year      1008 non-null   int64  
 2   month     1008 non-null   int64  
 3   pH        975 non-null    float64
 4   TOC       975 non-null    float64
 5   SS        975 non-null    float64
 6   DO        974 non-null    float64
 7   TP        975 non-null    float64
 8   TN        975 non-null    float64
 9   Chl-a     975 non-null    float64
 10  tot_coli  969 non-null    float64
 11  fec_coli  961 non-null    float64
dtypes: float64(9), int64(2), object(1)
memory usage: 94.6+ KB


### **⏺ 수질등급 산정**
- 호소의 생활환경기준에 따라 **7개** 등급으로 구분
- [생활환경기준](https://water.nier.go.kr/web/contents/contentView/?pMENU_NO=37)
- 결측치가 존재하여 등급 산정이 불가능한 경우 TOC로 대체

In [None]:
## 총인, 총질소의 경우 총인에 대한 총질소의 농도비율이 7 미만일 경우에는 총인의 기준을 적용하지 않으며, 그 비율이 16 이상일 경우에는 총질소의 기준을 적용하지 않음
# 해당 기준에 따라 등급 조정

quality['TN_TP_ratio'] = quality['TN'] / quality['TP']

In [None]:
# 1) 총인에 대한 총질소의 농도비율이 7 미만인 경우

quality.loc[quality['TN_TP_ratio'] < 7, :]

Unnamed: 0,dam_nm,year,month,pH,TOC,SS,DO,TP,TN,Chl-a,tot_coli,fec_coli,TN_TP_ratio


In [None]:
# 2) 총인에 대한 총질소의 농도비율이 16 이상인 경우

quality.loc[quality['TN_TP_ratio'] >= 16, :]

Unnamed: 0,dam_nm,year,month,pH,TOC,SS,DO,TP,TN,Chl-a,tot_coli,fec_coli,TN_TP_ratio
0,군위,2019,1,6.8000,4.1000,1.1000,10.8000,0.0120,1.4785,2.1000,5.0000,0.0000,123.2083
1,군위,2019,2,7.6000,4.0000,1.0000,12.1000,0.0105,1.0905,3.4000,1.0000,0.0000,103.8571
2,군위,2019,3,7.3000,3.9000,1.7000,13.5000,0.0075,1.2120,1.5000,2.0000,1.0000,161.6000
3,군위,2019,4,7.0000,3.7000,2.3000,12.1000,0.0125,1.5590,4.6000,88.0000,2.0000,124.7200
4,군위,2019,5,7.1000,3.5000,0.8000,9.0000,0.0090,1.2835,3.4000,5.0000,1.0000,142.6111
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,횡성,2022,8,7.4000,2.6000,1.3000,6.8000,0.0137,2.8349,10.5000,3408.0000,1.0000,206.9270
1004,횡성,2022,9,7.6000,1.4000,3.8000,7.2000,0.0109,2.2101,4.4000,1177.0000,13.0000,202.7615
1005,횡성,2022,10,7.7000,2.0000,2.7000,7.0000,0.0106,2.1933,9.8000,807.0000,1.0000,206.9151
1006,횡성,2022,11,7.4000,1.7000,1.3000,7.3000,0.0117,2.3474,2.7000,67.0000,1.0000,200.6325


- 대부분의 경우 총인에 대한 총질소의 농도 비율이 16 이상이기에, 총질소 기준을 적용 x

In [None]:
# 필요없는 변수 삭제

quality.drop(['TN', 'TN_TP_ratio'], axis = 1, inplace = True)

In [None]:
def get_water_grade(data):
    """Assign a lake water-quality grade to one monthly measurement row.

    The row is checked against the grades from best to worst and the first
    grade whose limits are all satisfied is returned.

    Args:
        data: A pandas Series with the keys ``pH``, ``TOC``, ``SS``, ``DO``,
            ``TP``, ``Chl-a``, ``tot_coli`` and ``fec_coli``.

    Returns:
        One of ``'Ia'``, ``'Ib'``, ``'II'``, ``'III'``, ``'IV'``, ``'V'``,
        ``'VI'``; or ``np.nan`` when any value in the row is missing or when
        the row matches none of the grades (e.g. pH above 8.5).
    """
    pH = data['pH']
    TOC = data['TOC']
    SS = data['SS']
    DO = data['DO']
    TP = data['TP']
    Chl_a = data['Chl-a']
    tot_coli = data['tot_coli']
    fec_coli = data['fec_coli']

    # No grade can be assigned if even one value in the row is missing.
    if data.isnull().any():
        return np.nan

    # "No numeric limit for this item at this grade".
    no_limit = float('inf')

    # (grade, pH min, TOC max, SS max, DO min, TP max, Chl-a max,
    #  total coliform max, fecal coliform max); the pH upper bound is 8.5
    # for every grade. Grade V has no numeric SS limit (it is judged by the
    # presence of floating litter), and IV / V have no coliform limits.
    grade_limits = (
        ('Ia', 6.5, 2, 1, 7.5, 0.01, 5, 50, 10),         # very good
        ('Ib', 6.5, 3, 5, 5.0, 0.02, 9, 500, 100),       # good
        ('II', 6.5, 4, 5, 5.0, 0.03, 14, 1000, 200),     # fairly good
        ('III', 6.5, 5, 15, 5.0, 0.05, 20, 5000, 1000),  # normal
        ('IV', 6.0, 6, 15, 2.0, 0.10, 35, no_limit, no_limit),       # fairly bad
        ('V', 6.0, 8, no_limit, 2.0, 0.15, 70, no_limit, no_limit),  # bad
    )

    for (grade, ph_min, toc_max, ss_max, do_min, tp_max,
         chl_max, tot_coli_max, fec_coli_max) in grade_limits:
        if (
            ph_min <= pH <= 8.5
            and TOC <= toc_max
            and SS <= ss_max
            and DO >= do_min
            and TP <= tp_max
            and Chl_a <= chl_max
            and tot_coli <= tot_coli_max
            and fec_coli <= fec_coli_max
        ):
            return grade

    # Very bad (VI): every listed item is beyond the grade V limit.
    if (TOC > 8) and (DO < 2.0) and (TP > 0.15) and (Chl_a > 70):
        return 'VI'

    # The row fits no grade; the caller re-grades these rows separately.
    return np.nan

In [None]:
quality['grade'] = quality.apply(get_water_grade, axis = 1)

In [None]:
# 등급이 산정되지 않은 데이터의 개수

len(quality.loc[quality['grade'].isnull(), :])

76

In [None]:
quality.loc[quality['grade'].isnull(), :]

Unnamed: 0,dam_nm,year,month,pH,TOC,SS,DO,TP,Chl-a,tot_coli,fec_coli,grade
24,김천부항,2019,1,,,,,,,,,
25,김천부항,2019,2,,,,,,,,,
26,김천부항,2019,3,,,,,,,,,
27,김천부항,2019,4,,,,,,,,,
28,김천부항,2019,5,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
985,횡성,2021,2,,,,,,,,,
987,횡성,2021,4,8.8000,2.1000,1.3000,11.6000,0.0093,4.0000,7.0000,0.0000,
988,횡성,2021,5,8.6000,1.8000,1.5000,10.2000,0.0114,2.5000,439.0000,0.0000,
996,횡성,2022,1,,,,,,,,,


**✔ 등급이 제대로 산정되지 않는 경우**
1. 데이터에 결측치가 있는 경우
  - 전체 결측치 => 해당 댐의 수질등급 최빈값
  - 일부만 결측치 => TOC로 산정
2. pH가 8.5 초과 => TOC로 산정

In [None]:
# TOC가 결측치인 경우

quality[quality['TOC'].isnull()]

Unnamed: 0,dam_nm,year,month,pH,TOC,SS,DO,TP,Chl-a,tot_coli,fec_coli,grade
24,김천부항,2019,1,,,,,,,,,
25,김천부항,2019,2,,,,,,,,,
26,김천부항,2019,3,,,,,,,,,
27,김천부항,2019,4,,,,,,,,,
28,김천부항,2019,5,,,,,,,,,
29,김천부항,2019,6,,,,,,,,,
144,보현산,2019,1,,,,,,,,,
145,보현산,2019,2,,,,,,,,,
146,보현산,2019,3,,,,,,,,,
147,보현산,2019,4,,,,,,,,,


- TOC가 결측치인 경우 데이터 **전체**가 결측치이다.


In [None]:
### 등급 재산정

def update_grade(row):
    """Re-grade a row that ``get_water_grade`` could not grade.

    * TOC missing: return the most frequent already-assigned grade of the
      same dam, looked up in the module-level ``quality`` frame. The notebook
      observes that in this data a missing TOC means the whole measurement
      row is missing.
    * Otherwise: grade on TOC alone, using the same TOC limits and the same
      labels as ``get_water_grade``.

    Args:
        row: A pandas Series with at least ``dam_nm`` and ``TOC``.

    Returns:
        One of ``'Ia'``, ``'Ib'``, ``'II'``, ``'III'``, ``'IV'``, ``'V'``,
        ``'VI'``; or ``np.nan`` when TOC is missing and the dam has no graded
        row to borrow from.
    """
    toc = row['TOC']

    # Test the measurement itself: dam_nm / year / month are always present,
    # so a whole-row "all null" test can never be true.
    if pd.isna(toc):
        dam_grades = quality.loc[quality['dam_nm'] == row['dam_nm'], 'grade'].dropna()
        grade_mode = dam_grades.mode()
        return grade_mode.iloc[0] if not grade_mode.empty else np.nan

    # Labels use a capital "I" so they match the ones get_water_grade emits.
    toc_limits = (
        (2, 'Ia'),
        (3, 'Ib'),
        (4, 'II'),
        (5, 'III'),
        (6, 'IV'),
        (8, 'V'),
    )
    for toc_max, grade in toc_limits:
        if toc <= toc_max:
            return grade
    return 'VI'

In [None]:
# 아직 등급이 산정되지 않은 데이터에 대해 등급 산정

quality.loc[quality['grade'].isnull(), 'grade'] = quality.loc[quality['grade'].isnull(), :].apply(update_grade, axis = 1)

In [None]:
quality.loc[quality['grade'].isnull(), :]

Unnamed: 0,dam_nm,year,month,pH,TOC,SS,DO,TP,Chl-a,tot_coli,fec_coli,grade


In [None]:
quality.groupby('dam_nm')['grade'].describe()

Unnamed: 0_level_0,count,unique,top,freq
dam_nm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
군위,48,5,II,30
김천부항,48,7,Ib,19
남강,48,6,Ib,23
대청,48,5,Ib,31
밀양,48,6,Ib,31
보령,48,5,Ib,28
보현산,48,7,III,21
부안,48,3,Ib,36
섬진강,48,4,Ib,37
성덕,48,8,Ib,16


- 대부분의 댐에서 수질이 일정한 범위에서 유지되고 있음을 확인할 수 있다.

In [None]:
# 수질 데이터는 모델링에 사용되지는 않을 예정이므로 따로 저장

quality.to_csv('/content/drive/MyDrive/2023년 K-water 대국민 물 빅데이터 분석과제 공모전/final/quality.csv', index = False)