In [1]:
#!pip install numpy
import numpy as np
import pandas as pd

## 누락된 데이터 처리

In [2]:
# Build a 7x3 frame of standard-normal values to use as example data.
# A seeded generator is used so that a fresh "Restart & Run All" reproduces
# exactly the same numbers every time.
rng = np.random.default_rng(42)

df = pd.DataFrame(rng.standard_normal((7, 3)))
df

Unnamed: 0,0,1,2
0,-0.224249,-0.307291,-1.541354
1,-0.580189,-0.340944,0.760566
2,0.733609,-0.447845,0.098904
3,0.555205,-0.231496,-0.264863
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


In [3]:
# Knock out some values so there is missing data to work with:
# column 1 loses rows 0-3, column 2 loses rows 0-1, column 0 loses row 1
# (which leaves row 1 entirely NaN).
df.iloc[0:4, 1] = np.nan
df.iloc[0:2, 2] = np.nan
df.iat[1, 0] = np.nan

df

Unnamed: 0,0,1,2
0,-0.224249,,
1,,,
2,0.733609,,0.098904
3,0.555205,,-0.264863
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


In [4]:
# Removing missing values: dropna()
# By default dropna() discards every row that contains at least one NaN.

df.dropna(axis='index', how='any')

Unnamed: 0,0,1,2
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


In [5]:
# To drop a row only when every one of its elements is NaN, use how='all'.

df1 = df.dropna(axis=0, how='all')
df1

Unnamed: 0,0,1,2
0,-0.224249,,
2,0.733609,,0.098904
3,0.555205,,-0.264863
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


In [6]:
# Dropping columns works the same way; pass axis=1 ('columns') so that any
# column containing a NaN is removed instead of any row.

df1.dropna(axis='columns')

Unnamed: 0,0
0,-0.224249
2,0.733609
3,0.555205
4,-1.806941
5,0.164165
6,-0.500668


In [7]:
# Display df again: the dropna() calls above returned new frames, so df itself
# still contains all of its rows, including the all-NaN row 1.
df

Unnamed: 0,0,1,2
0,-0.224249,,
1,,,
2,0.733609,,0.098904
3,0.555205,,-0.264863
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


In [8]:
# dropna() can also be controlled with a threshold.
# thresh=n keeps a row/column only if it has at least n non-NaN values;
# note that n is NOT the number of NaNs.

df.dropna(axis=0, thresh=1)

Unnamed: 0,0,1,2
0,-0.224249,,
2,0.733609,,0.098904
3,0.555205,,-0.264863
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


In [9]:
# Display df once more: dropna(thresh=1) also returned a new frame, so df is unchanged.
df

Unnamed: 0,0,1,2
0,-0.224249,,
1,,,
2,0.733609,,0.098904
3,0.555205,,-0.264863
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


In [10]:
# Often missing values should be replaced rather than dropped.
# fillna() takes the replacement value; here every NaN becomes 0.

df.fillna(value=0)

Unnamed: 0,0,1,2
0,-0.224249,0.0,0.0
1,0.0,0.0,0.0
2,0.733609,0.0,0.098904
3,0.555205,0.0,-0.264863
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


In [11]:
# A dict maps each column label to its own fill value,
# so different columns can be filled with different values.

df.fillna(value={
    0: 10,
    1: 20,
    2: 30,
})

Unnamed: 0,0,1,2
0,-0.224249,20.0,30.0
1,10.0,20.0,30.0
2,0.733609,20.0,0.098904
3,0.555205,20.0,-0.264863
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


In [12]:
# The fill values can also be computed from the data itself:
# column 0 gets its mean, column 1 its maximum, column 2 its minimum.

imputed = df.fillna(value={
    0: df[0].mean(),
    1: df[1].max(),
    2: df[2].min(),
})
imputed

Unnamed: 0,0,1,2
0,-0.224249,0.601759,-0.656827
1,-0.179813,0.601759,-0.656827
2,0.733609,0.601759,0.098904
3,0.555205,0.601759,-0.264863
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


## 중복 데이터의 제거

In [14]:
# Duplicate rows frequently have to be removed.
# Start by checking which rows are duplicates: a row is flagged True when
# all of its values repeat those of an earlier row.

imputed.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [15]:
# Restrict the duplicate check to specific columns: a row is dropped when its
# combination of values in columns 1 and 2 already appeared in an earlier row.

imputed.drop_duplicates(subset=[1, 2], keep='first')

Unnamed: 0,0,1,2
0,-0.224249,0.601759,-0.656827
2,0.733609,0.601759,0.098904
3,0.555205,0.601759,-0.264863
4,-1.806941,0.438018,-0.448137
6,-0.500668,0.52178,1.188909


In [16]:
# By default drop_duplicates() keeps the first occurrence and removes the later ones.
# Pass keep='last' to keep the last occurrence instead.

imputed.drop_duplicates(subset=[1, 2], keep='last')

Unnamed: 0,0,1,2
2,0.733609,0.601759,0.098904
3,0.555205,0.601759,-0.264863
4,-1.806941,0.438018,-0.448137
5,0.164165,0.601759,-0.656827
6,-0.500668,0.52178,1.188909


## 특정한 조건으로 값을 걸러내기

In [17]:
import pandas as pd

# Load the weather CSV. Note that this rebinds `df`: from here on it refers to
# the weather table, not the random-number frame used in the sections above.
df = pd.read_csv(
    './weather_tmp.csv',
    encoding='utf-8',
)
df.head(5)

Unnamed: 0,지점,일시,평균기온,최저기온,최저기온시각,최고기온,최고기온시각,1시간최다강수량,일강수량,평균풍속,평균기압,합계일조시간,평균지면온도
0,108,2021-01-01,-4.2,-9.8,511,1.6,1447,,,2.0,1014.9,6.5,-3.4
1,108,2021-01-02,-5.0,-8.4,805,-1.4,1346,,,2.6,1018.5,9.0,-3.9
2,108,2021-01-03,-5.6,-9.1,536,-2.0,1238,,,2.0,1023.0,5.5,-4.9
3,108,2021-01-04,-3.5,-8.4,656,0.3,1535,,0.0,1.7,1020.3,4.6,-3.3
4,108,2021-01-05,-5.5,-9.9,2356,-2.1,1,,0.0,2.9,1019.2,8.6,-3.2


In [18]:
# The NaNs in the hourly-max and daily precipitation columns most likely stand
# for "0" rather than for a measurement that could not be taken.
# Replace every NaN with 0 (this applies to all columns of the frame, in place).

df.fillna(value=0, inplace=True)
df.head(5)

Unnamed: 0,지점,일시,평균기온,최저기온,최저기온시각,최고기온,최고기온시각,1시간최다강수량,일강수량,평균풍속,평균기압,합계일조시간,평균지면온도
0,108,2021-01-01,-4.2,-9.8,511,1.6,1447,0.0,0.0,2.0,1014.9,6.5,-3.4
1,108,2021-01-02,-5.0,-8.4,805,-1.4,1346,0.0,0.0,2.6,1018.5,9.0,-3.9
2,108,2021-01-03,-5.6,-9.1,536,-2.0,1238,0.0,0.0,2.0,1023.0,5.5,-4.9
3,108,2021-01-04,-3.5,-8.4,656,0.3,1535,0.0,0.0,1.7,1020.3,4.6,-3.3
4,108,2021-01-05,-5.5,-9.9,2356,-2.1,1,0.0,0.0,2.9,1019.2,8.6,-3.2


In [19]:
# Selecting only the rows that satisfy a condition is written as df[condition].
#
# First look at the condition on its own: a boolean Series that is True for
# days whose mean temperature is below -10 (strictly less than, so exactly
# -10 is not included).

df['평균기온'].lt(-10)

0      False
1      False
2      False
3      False
4      False
       ...  
360    False
361    False
362    False
363    False
364    False
Name: 평균기온, Length: 365, dtype: bool

In [20]:
# Use the boolean mask to keep only the days with a mean temperature below -10.
day = df.loc[df['평균기온'] < -10]
day

Unnamed: 0,지점,일시,평균기온,최저기온,최저기온시각,최고기온,최고기온시각,1시간최다강수량,일강수량,평균풍속,평균기압,합계일조시간,평균지면온도
6,108,2021-01-07,-14.5,-16.5,802,-8.4,1,0.0,0.0,4.1,1015.3,0.0,-6.1
7,108,2021-01-08,-14.9,-18.6,812,-10.7,1450,0.0,0.0,3.3,1014.1,9.2,-9.3
8,108,2021-01-09,-12.2,-16.6,614,-7.5,1602,0.0,0.0,2.6,1014.0,9.0,-8.4
358,108,2021-12-25,-11.7,-14.4,2258,-7.3,1,0.0,0.0,3.9,1021.5,9.1,-4.4
359,108,2021-12-26,-12.1,-15.5,817,-7.3,1526,0.0,0.0,3.1,1023.4,9.1,-5.8


In [21]:
# Boolean mask: True for days whose maximum temperature is above 35.
df['최고기온'].gt(35)

0      False
1      False
2      False
3      False
4      False
       ...  
360    False
361    False
362    False
363    False
364    False
Name: 최고기온, Length: 365, dtype: bool

In [22]:
# Keep only the days whose maximum temperature is above 35
# (strictly greater than, so exactly 35 is not included).

df.loc[df['최고기온'].gt(35)]

Unnamed: 0,지점,일시,평균기온,최저기온,최저기온시각,최고기온,최고기온시각,1시간최다강수량,일강수량,평균풍속,평균기압,합계일조시간,평균지면온도
196,108,2021-07-16,28.8,24.2,508,35.2,1538,0.0,0.0,2.2,1004.3,10.5,32.5
201,108,2021-07-21,30.5,25.3,556,35.3,1654,0.0,0.0,1.7,1002.9,11.3,30.5
202,108,2021-07-22,31.2,26.5,621,35.9,1615,0.0,0.0,2.0,1000.9,11.4,31.4
203,108,2021-07-23,31.2,27.2,541,35.8,1604,0.0,0.0,1.8,999.3,12.4,32.2
204,108,2021-07-24,31.7,26.9,548,36.5,1508,0.0,0.0,1.7,998.2,12.3,32.6
205,108,2021-07-25,31.5,27.2,542,35.9,1547,0.0,0.0,2.0,996.5,13.0,34.2
206,108,2021-07-26,31.2,27.4,608,35.4,1521,0.0,0.0,2.4,994.6,13.2,35.3
207,108,2021-07-27,31.1,27.8,601,35.7,1520,0.0,0.0,2.1,992.9,7.3,33.3
210,108,2021-07-30,30.5,25.8,559,35.4,1406,0.0,0.0,2.1,990.5,7.7,33.5


In [24]:
# Flag the rows that belong to July 2021.
# The '일시' column holds 'YYYY-MM-DD' date strings, so testing for the
# '2021-07' prefix identifies the month. The vectorised .str accessor replaces
# an explicit Python loop over the column; na=False makes any non-string entry
# count as "not July" instead of raising an error.
is_july_2021 = df['일시'].str.startswith('2021-07', na=False)

# Store the flag as 1 (July 2021) / 0 (any other date).
df['7월여부'] = is_july_2021.astype('int64')

In [25]:
# Preview the first rows to confirm the new '7월여부' column was added.
df.head(n=5)

Unnamed: 0,지점,일시,평균기온,최저기온,최저기온시각,최고기온,최고기온시각,1시간최다강수량,일강수량,평균풍속,평균기압,합계일조시간,평균지면온도,7월여부
0,108,2021-01-01,-4.2,-9.8,511,1.6,1447,0.0,0.0,2.0,1014.9,6.5,-3.4,0
1,108,2021-01-02,-5.0,-8.4,805,-1.4,1346,0.0,0.0,2.6,1018.5,9.0,-3.9,0
2,108,2021-01-03,-5.6,-9.1,536,-2.0,1238,0.0,0.0,2.0,1023.0,5.5,-4.9,0
3,108,2021-01-04,-3.5,-8.4,656,0.3,1535,0.0,0.0,1.7,1020.3,4.6,-3.3,0
4,108,2021-01-05,-5.5,-9.9,2356,-2.1,1,0.0,0.0,2.9,1019.2,8.6,-3.2,0


In [43]:
# Peek at the year-month prefix (first 7 characters) of the date in the row labelled 0.
df.loc[0, '일시'][:7]

'2021-01'

In [26]:
# Keep only the rows flagged as July 2021.
df.loc[df['7월여부'].eq(1)]

Unnamed: 0,지점,일시,평균기온,최저기온,최저기온시각,최고기온,최고기온시각,1시간최다강수량,일강수량,평균풍속,평균기압,합계일조시간,평균지면온도,7월여부
181,108,2021-07-01,26.3,21.4,519,31.0,1525,0.0,0.0,1.8,996.8,8.2,27.4,1
182,108,2021-07-02,27.1,23.6,538,31.8,1612,0.0,0.0,2.3,998.7,7.4,27.2,1
183,108,2021-07-03,22.6,19.6,2208,25.7,1,13.8,60.0,3.6,996.3,0.9,23.5,1
184,108,2021-07-04,21.4,19.8,1,23.1,1500,5.3,21.4,2.3,994.4,0.0,22.2,1
185,108,2021-07-05,23.3,20.1,328,26.9,1444,0.0,0.0,1.4,999.8,0.9,24.6,1
186,108,2021-07-06,25.3,21.0,450,30.1,1437,0.0,0.0,1.6,997.2,4.3,26.8,1
187,108,2021-07-07,26.4,23.9,419,29.1,1441,12.4,12.7,1.5,997.1,1.1,26.6,1
188,108,2021-07-08,26.0,23.8,520,29.8,1451,2.7,2.9,2.0,995.2,2.9,27.0,1
189,108,2021-07-09,25.5,23.6,335,29.2,1530,0.0,0.0,1.9,995.5,3.8,27.0,1
190,108,2021-07-10,25.9,23.1,631,29.2,1340,0.2,0.5,2.4,997.5,4.2,26.9,1


In [27]:
# Several conditions can be combined in a single filter.
# Wrap each individual condition in parentheses and join them with & instead of
# the usual `and` (use | for an "or" condition).
# Example: days with a maximum temperature of at least 30 degrees and a daily
# precipitation of at least 50 mm.

df.loc[
    (df['최고기온'] >= 30)
    & (df['일강수량'] >= 50)
]

Unnamed: 0,지점,일시,평균기온,최저기온,최저기온시각,최고기온,최고기온시각,1시간최다강수량,일강수량,평균풍속,평균기압,합계일조시간,평균지면온도,7월여부
199,108,2021-07-19,26.4,24.0,1532,30.5,1134,65.7,67.4,1.9,1003.4,3.1,28.0,1
