## Data 분석용 함수들

In [1]:
import pandas as pd
import numpy as np

# Small 4x2 example table; np.nan marks the missing entries, so the
# aggregation examples that follow can show how NaN is treated.
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
df = pd.DataFrame(
    data=data,
    index=["a", "b", "c", "d"],
    columns=["one", "two"],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


### 열방향으로의 합 (즉, 각 열의 합)

In [2]:
df.___

one    9.25
two   -5.80
dtype: float64

### 행방향으로의 합 (즉, 각 행의 합)

In [3]:
df.___

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

* 이때, 위에서 볼 수 있듯이 NaN값은 배제하고 계산한다.

### NaN 값을 배제하지 않고 계산하려면 아래와 같이 skipna=False를 지정해준다.

In [4]:
# Row-wise sum (axis=1) with NaN skipping disabled: any row that contains
# a NaN (rows a and c here) sums to NaN instead of ignoring the gap.
df.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

### 특정 행 또는 특정 열에서만 계산하기

In [5]:
# Sum of one column only: select column 'one', then sum it.
# The NaN in row c is skipped, as in the sums above.
df['one'].sum()

9.25

In [6]:
# Sum of one row only: select row 'b' by label, then sum it.
# 7.1 + (-4.5) prints as 2.5999999999999996 rather than 2.6 because of
# binary floating-point rounding.
df.loc['b'].sum()

2.5999999999999996

## pandas에서 DataFrame에 적용되는 함수들
* count 전체 성분의 (NaN이 아닌) 값의 개수를 계산
* min, max 전체 성분의 최솟값, 최댓값을 계산
* argmin, argmax 전체 성분의 최솟값, 최댓값이 위치한 (정수)인덱스를 반환
* idxmin, idxmax 전체 성분의 최솟값, 최댓값이 위치한 인덱스(레이블)를 반환
* quantile 전체 성분의 특정 분위수에 해당하는 값을 반환 (분위수는 0~1 사이로 지정)
* sum 전체 성분의 합을 계산
* mean 전체 성분의 평균을 계산
* median 전체 성분의 중간 값을 반환
* mad 전체 성분의 평균 값으로부터의 절대 편차(absolute deviation)의 평균을 계산 (pandas 1.5에서 deprecated, 2.0에서 제거됨)
* std, var 전체 성분의 표준편차, 분산을 계산
* cumsum 맨 첫 번째 성분부터 각 성분까지의 누적합을 계산 (0에서부터 계속 더해짐)
* cumprod 맨 첫번째 성분부터 각 성분까지의 누적곱을 계산 (1에서부터 계속 곱해짐)

In [10]:
import pandas as pd
import numpy as np

# Six rows of standard-normal random values, one row per calendar day
# starting 2019-06-01, spread over four columns A-D.
df2 = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=pd.date_range(start="20190601", periods=6),
    columns=["A", "B", "C", "D"],
)
df2

Unnamed: 0,A,B,C,D
2019-06-01,-0.346147,-0.359343,-1.027746,0.417706
2019-06-02,1.939523,-0.416922,-1.050768,0.221593
2019-06-03,-0.262752,0.561234,0.011623,2.215959
2019-06-04,2.462044,-0.315985,-0.188119,1.169608
2019-06-05,-0.545625,1.462682,-0.15734,1.599636
2019-06-06,0.585051,0.642638,-0.45258,-0.578222


### A열과 B열의 상관계수 구하기

In [11]:
# A열과 B열의 상관계수(Correlation) 구하기
df2['A'].___(df2['B'])

-0.6273115153813591

### B열과 C열의 공분산 구하기

In [12]:
# B열과 C열의 공분산(Covariance) 구하기
df2['B'].____(df2['C'])

0.22081927475426905

## 정렬함수 및 기타함수

In [14]:
# Put the rows in a random order and the columns in the order D, B, C, A,
# so the frame is unsorted along both axes.
dates = df2.index
random_dates = np.random.permutation(dates)
df2 = df2.reindex(index=random_dates).reindex(columns=["D", "B", "C", "A"])
df2

Unnamed: 0,D,B,C,A
2019-06-04,1.169608,-0.315985,-0.188119,2.462044
2019-06-01,0.417706,-0.359343,-1.027746,-0.346147
2019-06-03,2.215959,0.561234,0.011623,-0.262752
2019-06-02,0.221593,-0.416922,-1.050768,1.939523
2019-06-05,1.599636,1.462682,-0.15734,-0.545625
2019-06-06,-0.578222,0.642638,-0.45258,0.585051


In [15]:
# index와 column의 순서가 섞여있다.
# 이때 index가 오름차순이 되도록 정렬해보자
df2._____

Unnamed: 0,D,B,C,A
2019-06-01,0.417706,-0.359343,-1.027746,-0.346147
2019-06-02,0.221593,-0.416922,-1.050768,1.939523
2019-06-03,2.215959,0.561234,0.011623,-0.262752
2019-06-04,1.169608,-0.315985,-0.188119,2.462044
2019-06-05,1.599636,1.462682,-0.15734,-0.545625
2019-06-06,-0.578222,0.642638,-0.45258,0.585051


In [16]:
df2.______

Unnamed: 0,A,B,C,D
2019-06-04,2.462044,-0.315985,-0.188119,1.169608
2019-06-01,-0.346147,-0.359343,-1.027746,0.417706
2019-06-03,-0.262752,0.561234,0.011623,2.215959
2019-06-02,1.939523,-0.416922,-1.050768,0.221593
2019-06-05,-0.545625,1.462682,-0.15734,1.599636
2019-06-06,0.585051,0.642638,-0.45258,-0.578222


In [17]:
# Sort the columns (axis=1) by label in descending order: D, C, B, A.
# The row order is left as it is.
df2.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2019-06-04,1.169608,-0.188119,-0.315985,2.462044
2019-06-01,0.417706,-1.027746,-0.359343,-0.346147
2019-06-03,2.215959,0.011623,0.561234,-0.262752
2019-06-02,0.221593,-1.050768,-0.416922,1.939523
2019-06-05,1.599636,-0.15734,1.462682,-0.545625
2019-06-06,-0.578222,-0.45258,0.642638,0.585051


In [18]:
# 값 기준 정렬하기
# D열의 값이 오름차순이 되도록 정렬하기
df2.______

Unnamed: 0,D,B,C,A
2019-06-06,-0.578222,0.642638,-0.45258,0.585051
2019-06-02,0.221593,-0.416922,-1.050768,1.939523
2019-06-01,0.417706,-0.359343,-1.027746,-0.346147
2019-06-04,1.169608,-0.315985,-0.188119,2.462044
2019-06-05,1.599636,1.462682,-0.15734,-0.545625
2019-06-03,2.215959,0.561234,0.011623,-0.262752


In [19]:
# Sort the rows so that the values in column B are in descending order.
df2.sort_values(by='B', ascending=False)

Unnamed: 0,D,B,C,A
2019-06-05,1.599636,1.462682,-0.15734,-0.545625
2019-06-06,-0.578222,0.642638,-0.45258,0.585051
2019-06-03,2.215959,0.561234,0.011623,-0.262752
2019-06-04,1.169608,-0.315985,-0.188119,2.462044
2019-06-01,0.417706,-0.359343,-1.027746,-0.346147
2019-06-02,0.221593,-0.416922,-1.050768,1.939523


In [20]:
# Add two more columns in place: E holds random integers from 0 to 5
# (the upper bound 6 is exclusive) and F holds a text label per row.
df2["E"] = np.random.randint(low=0, high=6, size=6)
df2["F"] = [
    "alpha",
    "beta",
    "gamma",
    "gamma",
    "alpha",
    "gamma",
]
df2

Unnamed: 0,D,B,C,A,E,F
2019-06-04,1.169608,-0.315985,-0.188119,2.462044,4,alpha
2019-06-01,0.417706,-0.359343,-1.027746,-0.346147,4,beta
2019-06-03,2.215959,0.561234,0.011623,-0.262752,4,gamma
2019-06-02,0.221593,-0.416922,-1.050768,1.939523,5,gamma
2019-06-05,1.599636,1.462682,-0.15734,-0.545625,0,alpha
2019-06-06,-0.578222,0.642638,-0.45258,0.585051,3,gamma


In [21]:
# E열과 F열을 동시에 고려하여, 오름차순으로 하려면?
df2.________

Unnamed: 0,D,B,C,A,E,F
2019-06-05,1.599636,1.462682,-0.15734,-0.545625,0,alpha
2019-06-06,-0.578222,0.642638,-0.45258,0.585051,3,gamma
2019-06-04,1.169608,-0.315985,-0.188119,2.462044,4,alpha
2019-06-01,0.417706,-0.359343,-1.027746,-0.346147,4,beta
2019-06-03,2.215959,0.561234,0.011623,-0.262752,4,gamma
2019-06-02,0.221593,-0.416922,-1.050768,1.939523,5,gamma


In [22]:
# 지정한 행 또는 열에서 중복값을 제외한 유니크한 값만 얻기
df2['F']._____

array(['alpha', 'beta', 'gamma'], dtype=object)

In [23]:
# 지정한 행 또는 열에서 값에 따른 개수 얻기
df2['F']._____

gamma    3
alpha    2
beta     1
Name: F, dtype: int64

In [24]:
# 지정한 행 또는 열에서 입력한 값이 있는지 확인하기
df2['F'].____(['alpha','beta'])

2019-06-04     True
2019-06-01     True
2019-06-03    False
2019-06-02    False
2019-06-05     True
2019-06-06    False
Name: F, dtype: bool

In [25]:
# F열의 값이 alpha나 beta인 모든 행 구하기
........

Unnamed: 0,D,B,C,A,E,F
2019-06-04,1.169608,-0.315985,-0.188119,2.462044,4,alpha
2019-06-01,0.417706,-0.359343,-1.027746,-0.346147,4,beta
2019-06-05,1.599636,1.462682,-0.15734,-0.545625,0,alpha


In [26]:
## Applying a user-defined function
# Start from a 4x3 frame of standard-normal random values whose rows are
# labelled with city names.
df3 = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=["Seoul", "Incheon", "Busan", "Daegu"],
    columns=["b", "d", "e"],
)
df3

Unnamed: 0,b,d,e
Seoul,0.7387,-0.097906,-0.560055
Incheon,-1.756002,-0.134222,-1.231408
Busan,0.395583,-0.511267,0.394308
Daegu,-0.698884,0.330276,0.077048


In [27]:
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` only needs ``max()`` and ``min()`` methods, for example a pandas
    Series (one column or one row of a DataFrame) or a NumPy array.
    """
    return x.max() - x.min()
df3._____(func, axis=0)

b    2.494702
d    0.841542
e    1.625716
dtype: float64