In [1]:
import numpy as np
import pandas as pd 


In [2]:
# Build a 6x4 frame of random samples; no index/columns are passed,
# so pandas labels both axes with integers starting at 0.
df = pd.DataFrame(data=np.random.rand(6, 4))

DataFrame을 만들 때 index와 columns 인자를 주지 않으면, index와 columns는 0부터 시작하는 정수로 자동 지정된다.  
여기서는 **numpy**의 랜덤 함수인 `np.random.rand(행, 열)`로 `랜덤 array`를 만들자.

In [10]:
# Index the rows by six consecutive days starting 2021-06-10,
# and name the four columns A through D.
df.index = pd.date_range(start='20210610', periods=6)
df.columns = list('ABCD')

**pd.date_range("yyyymmdd", periods = 6) 함수**는    

pandas에서 제공하는 datetime64 데이터형으로 구성된 인덱스를 생성할 때 사용하는 함수이다.  
`periods = 6`은 입력한 날짜부터 6일에 대한 index를 생성하라는 뜻이다.

In [8]:
df  # display the current contents of df

Unnamed: 0,A,B,C,D
2021-06-10,0.603472,0.225558,0.080184,0.884147
2021-06-11,0.277473,0.977127,0.871562,0.756826
2021-06-12,0.841923,0.908786,0.320936,0.236505
2021-06-13,0.081508,0.76494,0.243565,0.077799
2021-06-14,0.738182,0.36586,0.672628,0.262703
2021-06-15,0.426397,0.093371,0.070274,0.936595


In [11]:
# Append column F; two of its six entries are deliberately NaN (missing).
df['F'] = np.array([1.0, np.nan, 3.5, 6.1, np.nan, 7.0])

In [12]:
df  # display df again, now including column F

Unnamed: 0,A,B,C,D,F
2021-06-10,0.603472,0.225558,0.080184,0.884147,1.0
2021-06-11,0.277473,0.977127,0.871562,0.756826,
2021-06-12,0.841923,0.908786,0.320936,0.236505,3.5
2021-06-13,0.081508,0.76494,0.243565,0.077799,6.1
2021-06-14,0.738182,0.36586,0.672628,0.262703,
2021-06-15,0.426397,0.093371,0.070274,0.936595,7.0


pandas를 이용해서 dataset을 가져올 때, `NaN`처럼 값이 비어 있는 성분이 존재할 수 있다.  
이러한 값을 **결측값**이라고 한다. (범주에서 벗어난 값을 뜻하는 **이상치**와는 구분된다.)  
`np.nan`은 함수가 아니라, 성분에 `NaN을 대입할 때` 사용하는 numpy의 상수이다.

---
이제 우리는 `NaN이라 표기된 값`만 골라내고, 해당 `행을 제거해야` 한다.  
이럴 때 **df.dropna(how="any")** 함수를 이용해야 한다.   
각 행들 중 NaN을 포함하는 행들을 제거한 df를 가져올 수 있다.  
이 때, `any`는 행의 성분에 `NaN이 하나라도 있으면 그 행을 제거`하라는 뜻이다.  
만약 `how="all"`이 있다면, 행의 성분이 `모두 NaN일 때 제거`하라는 뜻이다.

In [14]:
# Drop every row that contains at least one NaN.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D,F
2021-06-10,0.603472,0.225558,0.080184,0.884147,1.0
2021-06-12,0.841923,0.908786,0.320936,0.236505,3.5
2021-06-13,0.081508,0.76494,0.243565,0.077799,6.1
2021-06-15,0.426397,0.093371,0.070274,0.936595,7.0


In [15]:
# Drop only the rows whose entries are all NaN.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D,F
2021-06-10,0.603472,0.225558,0.080184,0.884147,1.0
2021-06-11,0.277473,0.977127,0.871562,0.756826,
2021-06-12,0.841923,0.908786,0.320936,0.236505,3.5
2021-06-13,0.081508,0.76494,0.243565,0.077799,6.1
2021-06-14,0.738182,0.36586,0.672628,0.262703,
2021-06-15,0.426397,0.093371,0.070274,0.936595,7.0


이제 NaN이 포함된 행을 제거하는 것이 아니라, **NaN에 다른 값**을 대입해보자.  
이럴 땐, **df.fillna(value = '값') 함수**를 사용해야 한다.  
NaN성분에 일괄적으로 `value값`을 대입 

In [19]:
## df.fillna(value = 5.0)  # would replace every NaN entry with 5.0

In [20]:
# Boolean mask of df: True where the entry is NaN (same result as df.isnull()).
pd.isnull(df)

Unnamed: 0,A,B,C,D,F
2021-06-10,False,False,False,False,False
2021-06-11,False,False,False,False,True
2021-06-12,False,False,False,False,False
2021-06-13,False,False,False,False,False
2021-06-14,False,False,False,False,True
2021-06-15,False,False,False,False,False


`df.isnull()` 함수는 `NaN인 성분만 True`로, 나머지 성분은 False로 표시해 준다.

In [23]:
# Select only the rows whose F entry is NaN.
df.loc[df['F'].isnull()]

Unnamed: 0,A,B,C,D,F
2021-06-11,0.277473,0.977127,0.871562,0.756826,
2021-06-14,0.738182,0.36586,0.672628,0.262703,


In [24]:
# Parse a "yyyymmdd" string into a pandas Timestamp.
pd.to_datetime('20210610', format='%Y%m%d')

Timestamp('2021-06-10 00:00:00')

이전에 pd.date_range() 함수를 통해 index를 datetime 데이터형으로 집어 넣었기 때문에, 날짜 문자열도 같은 데이터형으로 바꿔서 사용한다.  
pd.to_datetime("입력한 날짜") 함수를 통해 날짜 문자열을 datetime 데이터형으로 변환한다.  

---
"yyyymmdd" 날짜명을 pd.to_datetime()로 index로 변환한 것을 df.drop()함수의 인자로 넣어서 해당 행을 삭제  
(열을 삭제할 때는 del df["칼럼명"]이었다. 행 삭제는 df.drop(해당 인덱스)이다.)  


In [28]:
# Remove the row labelled 2021-06-10 (axis=0 means the label is a row index).
df.drop(labels=pd.to_datetime('20210610'), axis=0)

Unnamed: 0,A,B,C,D,F
2021-06-11,0.277473,0.977127,0.871562,0.756826,
2021-06-12,0.841923,0.908786,0.320936,0.236505,3.5
2021-06-13,0.081508,0.76494,0.243565,0.077799,6.1
2021-06-14,0.738182,0.36586,0.672628,0.262703,
2021-06-15,0.426397,0.093371,0.070274,0.936595,7.0


만약 2개 이상의 특정 행을 삭제하고 싶다면, 삭제할 인덱스들을 리스트로 묶어서 넘겨준다.

In [31]:
# Remove several rows at once: convert both date strings in one call
# and pass the resulting labels to drop.
df.drop(labels=pd.to_datetime(['20210610', '20210611']), axis=0)

Unnamed: 0,A,B,C,D,F
2021-06-12,0.841923,0.908786,0.320936,0.236505,3.5
2021-06-13,0.081508,0.76494,0.243565,0.077799,6.1
2021-06-14,0.738182,0.36586,0.672628,0.262703,
2021-06-15,0.426397,0.093371,0.070274,0.936595,7.0


df.drop() 함수로, 열을 삭제할 수도 있다.  
drop() 함수는 기본적으로 인자를 행의 인덱스로 해석하는데,  
넘겨준 인자가 행의 인덱스가 아니라 "컬럼명"이라는 것을 명시해주기 위해  
새로운 인자 axis=1이 들어간다.  
`df.drop('F', axis=1)`

In [32]:
# Remove column F; axis='columns' is the named spelling of axis=1.
df.drop(labels='F', axis='columns')

Unnamed: 0,A,B,C,D
2021-06-10,0.603472,0.225558,0.080184,0.884147
2021-06-11,0.277473,0.977127,0.871562,0.756826
2021-06-12,0.841923,0.908786,0.320936,0.236505
2021-06-13,0.081508,0.76494,0.243565,0.077799
2021-06-14,0.738182,0.36586,0.672628,0.262703
2021-06-15,0.426397,0.093371,0.070274,0.936595


In [33]:
# Remove columns B and F together; axis='columns' is the named spelling of axis=1.
df.drop(labels=['B', 'F'], axis='columns')

Unnamed: 0,A,C,D
2021-06-10,0.603472,0.080184,0.884147
2021-06-11,0.277473,0.871562,0.756826
2021-06-12,0.841923,0.320936,0.236505
2021-06-13,0.081508,0.243565,0.077799
2021-06-14,0.738182,0.672628,0.262703
2021-06-15,0.426397,0.070274,0.936595
