## 1. 데이터 정렬 

In [10]:
import pandas as pd
import numpy as np

### 1-1. Series

###### <span style="color:blue">Example 1. Series</span>

In [6]:
# index = ['a', 'c', 'b', 'd']를 가진 0~3 값의 Series 구성


a    0
c    1
b    2
d    3
dtype: int64

In [7]:
# index 정렬 


a    0
b    2
c    1
d    3
dtype: int64

### 1-2. DataFrame

###### <span style="color:blue">Example 2. DataFrame</span>

In [4]:
# index, column을 가진 dataframe 생성 (2x4)
# column = ['d', 'a', 'c', 'b']
# index = ['j', 'i']


Unnamed: 0,d,a,c,b
j,0,1,2,3
i,4,5,6,7


#### 1. sort_index()
Row나 Column의 index를 기준으로 정렬

In [30]:
# Row index(j, i) 정렬


Unnamed: 0,d,a,c,b
i,4,5,6,7
j,0,1,2,3


In [33]:
# column index(d, a, c, b) 정렬


Unnamed: 0,a,b,c,d
j,1,3,2,0
i,5,7,6,4


#### 2. sort_values()
column 값을 기준으로 정렬

In [32]:
# Sort the rows by the values in column 'a', largest first (descending order).
df.sort_values('a', ascending=False)

Unnamed: 0,d,a,c,b
i,4,5,6,7
j,0,1,2,3


## 2. 데이터 살펴보기 

In [11]:
# Build a 4x5 frame of random integers drawn from [0, 20), one column per label.
cols = list('abcde')
df = pd.DataFrame(
    data=np.random.randint(0, 20, size=(4, len(cols))),
    columns=cols,
)
df

Unnamed: 0,a,b,c,d,e
0,13,1,12,10,7
1,2,19,14,3,1
2,11,10,12,2,1
3,12,12,8,1,12


#### 1) 기초 통계량
count, mean, std, min/max등 출력

###### <span style="color:blue">Example 3. 통계량</span>

Unnamed: 0,a,b,c,d,e
count,4.0,4.0,4.0,4.0,4.0
mean,5.75,6.5,10.25,7.0,9.0
std,4.272002,4.795832,6.601767,5.477226,6.377042
min,1.0,2.0,3.0,0.0,0.0
25%,3.25,2.75,7.5,4.5,7.5
50%,5.5,6.0,9.5,7.5,10.5
75%,8.0,9.75,12.25,10.0,12.0
max,11.0,12.0,19.0,13.0,15.0


#### 2) DataFrame Info
자료 타입, 행/열 개수, 각 컬럼별 변수 타입, 메모리 사용량 출력

In [12]:
# Print a summary of df: index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
a    4 non-null int32
b    4 non-null int32
c    4 non-null int32
d    4 non-null int32
e    4 non-null int32
dtypes: int32(5)
memory usage: 160.0 bytes


#### 3) DataFrame Size/Shape

In [13]:
# (number of rows, number of columns) of df.
df.shape

(4, 5)

In [14]:
# Number of rows in df.
len(df)

4

#### 4) Unique: Series 데이터의 유일한 값을 반환함

In [32]:
# Rebuild a fresh 4x5 frame of random integers drawn from [0, 20) for the unique-value examples.
cols = list('abcde')
df = pd.DataFrame(
    data=np.random.randint(0, 20, size=(4, len(cols))),
    columns=cols,
)
df

Unnamed: 0,a,b,c,d,e
0,17,11,19,15,13
1,5,6,15,13,16
2,12,13,4,14,6
3,0,5,5,8,19


###### <span style="color:blue">Example 4. Unique</span>

In [22]:
# column 'a'의 Unique한 value  


array([ 4, 19,  9,  0], dtype=int64)

In [23]:
# column 'a'의 Unique한 값의 개수


4

In [33]:
# Count how many times each distinct value occurs in column 'a'.
# The Series method is used because the top-level pd.value_counts() is
# deprecated (pandas 2.1+) and emits a FutureWarning.
df['a'].value_counts()

5     1
12    1
17    1
0     1
Name: a, dtype: int64

## 3. 결측치 처리 

In [35]:
# Sample frame for the dropna examples: row 2 is entirely missing, rows 1 and 3
# are partially missing, and every column contains at least one NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = [
    [1., 6.5, 3.],
    [1., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3.],
]
cols = ['a', 'b', 'c']

df = pd.DataFrame(data, columns=cols)
df

Unnamed: 0,a,b,c
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


#### 1) dropna
누락된 데이터가 있는 axis (0:index, 1:column) 를 제외

###### <span style="color:blue">Example 5. dropna</span>

In [38]:
# index 기준 NaN 값을 가진 row 삭제


In [63]:
# index 기준 row의 전체 값이 NaN인 경우 해당 Row 삭제 


Unnamed: 0,a,b,c
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [64]:
# Drop every column (axis=1) that contains at least one NaN.
df.dropna(how='any', axis=1)

0
1
2
3


In [66]:
# Drop only the columns (axis=1) whose values are all NaN.
df.dropna(how='all', axis=1)

Unnamed: 0,a,b,c
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


#### 2) fillna
누락된 데이터를 대신할 값을 채우거나 ‘ffill’ 또는 ‘bfill’을 통해 값을 채움

In [39]:
# Sample frame for the fillna examples: each column has two missing values,
# so constant, mean, forward and backward fills all give visibly different results.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = [
    [1., 2.5, 7.],
    [2., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3.],
]
cols = ['a', 'b', 'c']

df = pd.DataFrame(data, columns=cols)
df

Unnamed: 0,a,b,c
0,1.0,2.5,7.0
1,2.0,,
2,,,
3,,6.5,3.0


###### <span style="color:blue">Example 6. fillna</span>

In [40]:
# 결측치 0으로 채우기


Unnamed: 0,a,b,c
0,1.0,2.5,7.0
1,2.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [56]:
# 결측치 dataframe의 평균으로 채우기


Unnamed: 0,a,b,c
0,1.0,2.5,7.0
1,2.0,4.5,5.0
2,1.5,4.5,5.0
3,1.5,6.5,3.0


In [76]:
# 결측치 앞의 값으로 채우기


Unnamed: 0,a,b,c
0,1.0,2.5,7.0
1,2.0,2.5,7.0
2,2.0,2.5,7.0
3,2.0,6.5,3.0


In [77]:
# 결측치 뒤의 값으로 채우기


Unnamed: 0,a,b,c
0,1.0,2.5,7.0
1,2.0,6.5,3.0
2,,6.5,3.0
3,,6.5,3.0


#### 3) isnull, notnull

In [79]:
# Display the current frame, including its missing values, for reference.
df

Unnamed: 0,a,b,c
0,1.0,2.5,7.0
1,2.0,,
2,,,
3,,6.5,3.0


In [80]:
# Boolean mask of df: True where a value is missing (NaN).
df.isnull()

Unnamed: 0,a,b,c
0,False,False,False
1,False,True,True
2,True,True,True
3,True,False,False


In [82]:
# Boolean mask of df: True where a value is present (not NaN).
df.notnull()

Unnamed: 0,a,b,c
0,True,True,True
1,True,False,False
2,False,False,False
3,False,True,True
