## Data 분석용 함수들

In [1]:
import pandas as pd
import numpy as np

# Small demo frame with a few missing values (NaN) to show how
# the aggregation functions treat them.
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
df = pd.DataFrame(
    data,
    index=["a", "b", "c", "d"],
    columns=["one", "two"],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


### 열방향으로의 합 (즉, 각 열의 합)
* 합계와 같은 수학(집계) 함수는 기본값이 `axis=0`이며, 각 열별로 계산한다.

In [3]:
# Column-wise totals: axis=0 (the default) gives one sum per column.
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

### 행방향으로의 합 (즉, 각 행의 합)

In [4]:
# Row-wise totals: one sum per row, taken across the columns.
df.sum(axis="columns")

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

* 이때, 위에서 볼 수 있듯이 NaN 값은 배제하고 계산한다. 모든 값이 NaN인 c행의 합이 0.00으로 나오는 것도 이 때문이다.

### NaN 값을 배제하지 않고 계산하려면 아래와 같이 `skipna=False`를 지정해준다.

In [5]:
# With skipna=False, any row that contains a NaN yields NaN.
df.sum(axis="columns", skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [6]:
# skipna=True gives the same result as the plain row-wise sum above.
df.sum(axis="columns", skipna=True)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

### 특정 행 또는 특정 열에서만 계산하기

In [7]:
# Sum of a single column.
df.loc[:, "one"].sum()

9.25

In [8]:
# Sum of a single row, selected by its index label.
df.loc["b", :].sum()

2.5999999999999996

## pandas에서 DataFrame에 적용되는 함수들
* count 전체 성분의 (NaN이 아닌) 값의 개수를 계산
* min, max 전체 성분의 최솟값, 최댓값을 계산
* argmin, argmax 전체 성분의 최솟값, 최댓값이 위치한 (정수) 위치를 반환 (Series에서만 사용 가능)
* idxmin, idxmax 전체 성분의 최솟값, 최댓값이 위치한 인덱스 라벨을 반환
* quantile 전체 성분에서 지정한 분위수(0~1 사이의 값으로 지정)에 해당하는 값을 반환
* sum 전체 성분의 합을 계산
* mean 전체 성분의 평균을 계산
* median 전체 성분의 중간 값을 반환
* mad 전체 성분의 평균 값으로부터의 절대 편차(absolute deviation)의 평균을 계산 (pandas 1.5에서 deprecated, 2.0에서 제거됨)
* std, var 전체 성분의 표준편차, 분산을 계산
* cumsum 맨 첫 번째 성분부터 각 성분까지의 누적합을 계산 (0에서부터 계속 더해짐)
* cumprod 맨 첫번째 성분부터 각 성분까지의 누적곱을 계산 (1에서부터 계속 곱해짐)

In [9]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so that "Restart Kernel -> Run All" reproduces the
# same random frame (and the same results from the later np.random calls).
# NOTE: outputs saved from an earlier, unseeded run will not match until the
# notebook is re-executed.
np.random.seed(42)

# 6x4 frame of standard-normal samples, indexed by six consecutive days
# starting on 2019-06-01.
df2 = pd.DataFrame(
    np.random.randn(6, 4),
    columns=["A", "B", "C", "D"],
    index=pd.date_range("20190601", periods=6),
)
df2

Unnamed: 0,A,B,C,D
2019-06-01,0.276024,-2.35611,-0.900552,0.215594
2019-06-02,-1.15233,-2.076814,0.067284,-0.470207
2019-06-03,0.807636,0.321233,0.490852,-1.407791
2019-06-04,-0.683611,-0.656064,-2.028576,0.543326
2019-06-05,0.135124,-0.041434,0.558129,0.696369
2019-06-06,0.315077,1.481252,-0.183257,2.049313


### A열과 B열의 상관계수 구하기

In [10]:
# Pearson correlation coefficient between columns A and B
df2["A"].corr(df2["B"], method="pearson")

0.5165663440393212

### B열과 C열의 공분산 구하기

In [11]:
# Covariance between columns B and C
df2["B"].cov(other=df2["C"])

0.4137763418423477

## 정렬함수 및 기타함수

In [25]:
# Shuffle the row order and rearrange the columns so that the sorting
# examples that follow have something to put back in order.
dates = df2.index
random_dates = np.random.permutation(dates)
df2 = df2.reindex(index=random_dates).reindex(columns=["D", "B", "C", "A"])
df2

Unnamed: 0,D,B,C,A
2019-06-01,0.215594,-2.35611,-0.900552,0.276024
2019-06-03,-1.407791,0.321233,0.490852,0.807636
2019-06-04,0.543326,-0.656064,-2.028576,-0.683611
2019-06-02,-0.470207,-2.076814,0.067284,-1.15233
2019-06-05,0.696369,-0.041434,0.558129,0.135124
2019-06-06,2.049313,1.481252,-0.183257,0.315077


In [26]:
# Both the index and the columns are out of order at this point.
# Sort the rows so that the index is ascending.
df2.sort_index(axis=0, ascending=True)

Unnamed: 0,D,B,C,A
2019-06-01,0.215594,-2.35611,-0.900552,0.276024
2019-06-02,-0.470207,-2.076814,0.067284,-1.15233
2019-06-03,-1.407791,0.321233,0.490852,0.807636
2019-06-04,0.543326,-0.656064,-2.028576,-0.683611
2019-06-05,0.696369,-0.041434,0.558129,0.135124
2019-06-06,2.049313,1.481252,-0.183257,0.315077


In [27]:
# Sort the columns by their labels in ascending order.
df2.sort_index(axis="columns")

Unnamed: 0,A,B,C,D
2019-06-01,0.276024,-2.35611,-0.900552,0.215594
2019-06-03,0.807636,0.321233,0.490852,-1.407791
2019-06-04,-0.683611,-0.656064,-2.028576,0.543326
2019-06-02,-1.15233,-2.076814,0.067284,-0.470207
2019-06-05,0.135124,-0.041434,0.558129,0.696369
2019-06-06,0.315077,1.481252,-0.183257,2.049313


In [28]:
# Sort the column labels in descending order.
df2.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2019-06-01,0.215594,-0.900552,-2.35611,0.276024
2019-06-03,-1.407791,0.490852,0.321233,0.807636
2019-06-04,0.543326,-2.028576,-0.656064,-0.683611
2019-06-02,-0.470207,0.067284,-2.076814,-1.15233
2019-06-05,0.696369,0.558129,-0.041434,0.135124
2019-06-06,2.049313,-0.183257,1.481252,0.315077


In [29]:
# Sort by value rather than by label:
# order the rows so that column D is ascending.
df2.sort_values("D", ascending=True)

Unnamed: 0,D,B,C,A
2019-06-03,-1.407791,0.321233,0.490852,0.807636
2019-06-02,-0.470207,-2.076814,0.067284,-1.15233
2019-06-01,0.215594,-2.35611,-0.900552,0.276024
2019-06-04,0.543326,-0.656064,-2.028576,-0.683611
2019-06-05,0.696369,-0.041434,0.558129,0.135124
2019-06-06,2.049313,1.481252,-0.183257,0.315077


In [30]:
# Order the rows so that column B is descending.
df2.sort_values("B", ascending=False)

Unnamed: 0,D,B,C,A
2019-06-06,2.049313,1.481252,-0.183257,0.315077
2019-06-03,-1.407791,0.321233,0.490852,0.807636
2019-06-05,0.696369,-0.041434,0.558129,0.135124
2019-06-04,0.543326,-0.656064,-2.028576,-0.683611
2019-06-02,-0.470207,-2.076814,0.067284,-1.15233
2019-06-01,0.215594,-2.35611,-0.900552,0.276024


In [31]:
# Add two more columns for the multi-key sorting and membership examples:
# E holds random integers drawn from 0..5, F holds string labels.
df2["E"] = np.random.randint(low=0, high=6, size=6)
df2["F"] = [
    "alpha", "beta", "gamma",
    "gamma", "alpha", "gamma",
]
df2

Unnamed: 0,D,B,C,A,E,F
2019-06-01,0.215594,-2.35611,-0.900552,0.276024,2,alpha
2019-06-03,-1.407791,0.321233,0.490852,0.807636,5,beta
2019-06-04,0.543326,-0.656064,-2.028576,-0.683611,2,gamma
2019-06-02,-0.470207,-2.076814,0.067284,-1.15233,0,gamma
2019-06-05,0.696369,-0.041434,0.558129,0.135124,4,alpha
2019-06-06,2.049313,1.481252,-0.183257,0.315077,3,gamma


In [32]:
# Sort ascending by E first, breaking ties with F.
df2.sort_values(by=["E", "F"], ascending=[True, True])

Unnamed: 0,D,B,C,A,E,F
2019-06-02,-0.470207,-2.076814,0.067284,-1.15233,0,gamma
2019-06-01,0.215594,-2.35611,-0.900552,0.276024,2,alpha
2019-06-04,0.543326,-0.656064,-2.028576,-0.683611,2,gamma
2019-06-06,2.049313,1.481252,-0.183257,0.315077,3,gamma
2019-06-05,0.696369,-0.041434,0.558129,0.135124,4,alpha
2019-06-03,-1.407791,0.321233,0.490852,0.807636,5,beta


In [33]:
# Distinct values of the chosen column, with duplicates removed.
pd.unique(df2["F"])

array(['alpha', 'beta', 'gamma'], dtype=object)

- [지정한 행 또는 열에서 값에 따른 개수 얻기](https://pandas.pydata.org/pandas-docs/stable/reference/index.html)

In [34]:
# Number of occurrences of each distinct value in column F, most frequent first.
df2["F"].value_counts(sort=True, ascending=False)

gamma    3
alpha    2
beta     1
Name: F, dtype: int64

In [35]:
# Element-wise membership test: is each value of F one of the given values?
df2["F"].isin(values=["alpha", "beta"])

2019-06-01     True
2019-06-03     True
2019-06-04    False
2019-06-02    False
2019-06-05     True
2019-06-06    False
Name: F, dtype: bool

In [36]:
df2["F"].isin(values=["alpha"])  # a single value must still be wrapped in a list

2019-06-01     True
2019-06-03    False
2019-06-04    False
2019-06-02    False
2019-06-05     True
2019-06-06    False
Name: F, dtype: bool

In [37]:
# Select every row whose F value is alpha or beta.
df2[df2["F"].isin(["alpha", "beta"])]

Unnamed: 0,D,B,C,A,E,F
2019-06-01,0.215594,-2.35611,-0.900552,0.276024,2,alpha
2019-06-03,-1.407791,0.321233,0.490852,0.807636,5,beta
2019-06-05,0.696369,-0.041434,0.558129,0.135124,4,alpha


## 사용자가 직접 만든 함수를 적용하기

In [40]:
# 4x3 frame of standard-normal samples, indexed by city name.
df3 = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=["Seoul", "Incheon", "Busan", "Daegu"],
    columns=["b", "d", "e"],
)
df3

Unnamed: 0,b,d,e
Seoul,0.297883,0.54129,0.211086
Incheon,0.897044,3.191654,-1.589431
Busan,0.439156,0.875696,0.594956
Daegu,-0.562528,1.34934,0.9811


In [41]:
def func(x):
    """Return the range of ``x``: its maximum minus its minimum.

    ``x`` is whatever ``DataFrame.apply`` hands over (one column or one row
    as a Series); any object providing ``max()`` and ``min()`` works.
    """
    return x.max() - x.min()
# Apply func to every column, giving one result per column.
df3.apply(func, axis="index")

b    1.459571
d    2.650364
e    2.570531
dtype: float64

In [42]:
# Same per-column range, with the function written inline as a lambda.
df3.apply(lambda col: col.max() - col.min(), axis="index")

b    1.459571
d    2.650364
e    2.570531
dtype: float64