In [1]:
import pandas as pd

In [2]:
import numpy as np

### Series
* 같은 종류의 데이터 집합, 컬럼단위

In [3]:
pd.Series([1,3,5,np.nan,6,8]) # np.nan is a float, so the whole Series is upcast to float64

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
pd.Series([1,3,5,7,6,8])

0    1
1    3
2    5
3    7
4    6
5    8
dtype: int64

In [5]:
pd.Series([1,3,5,7,6,8],dtype="float32") # dtype= forces the element type: the ints are stored as float32

0    1.0
1    3.0
2    5.0
3    7.0
4    6.0
5    8.0
dtype: float32

In [6]:
pd.Series([1,3,'5',7,6,8]) # '5' is a str; mixing a str with ints makes the dtype object

0    1
1    3
2    5
3    7
4    6
5    8
dtype: object

In [7]:
pd.Series([1.0,3.0,'5',0.7,0.6,0.8])

0    1.0
1    3.0
2      5
3    0.7
4    0.6
5    0.8
dtype: object

In [8]:
dates = pd.date_range("20130101", periods=6) # six consecutive daily timestamps starting at 2013-01-01

In [9]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### DataFrame

##### index와 columns를 지정하지 않으면 자동으로 0부터 부여됨

In [10]:
                         # 6행 4열
pd.DataFrame(np.random.randn(6,4),
             index=dates,
             columns=list("ABCD"))

Unnamed: 0,A,B,C,D
2013-01-01,0.015146,0.52162,-1.860627,0.631986
2013-01-02,0.224745,-0.256303,0.200182,0.306204
2013-01-03,0.102017,1.060643,1.102173,-0.874888
2013-01-04,0.888651,-0.322405,0.46223,-0.281922
2013-01-05,1.168602,-0.828311,0.475186,0.835094
2013-01-06,0.100933,0.130612,0.527105,-1.325787


In [11]:
pd.DataFrame(np.random.randn(6,4))

Unnamed: 0,0,1,2,3
0,1.179727,1.276085,-0.178089,2.46191
1,0.24289,-1.752916,0.383787,-0.701244
2,-0.307826,1.124392,0.524874,1.526668
3,-0.728427,-0.421436,-1.551234,0.848509
4,-0.335674,0.800361,-1.797453,-1.11706
5,0.354171,-2.21444,1.308766,-0.709735


In [12]:
list("ABCD")

['A', 'B', 'C', 'D']

#### dictionary 형식

In [13]:
# Build a DataFrame from a dict: every key becomes one column.
pd.DataFrame(
    {
        "A":0.1,  # scalar: repeated on every row
        "B":pd.Timestamp("20130102"),  # single timestamp: repeated on every row
        "C":pd.Series(1, index=list(range(4)), dtype="float32"),  # four float32 ones labelled 0..3
        "D":np.array([3]*4, dtype="int32"),  # four int32 threes
        "E":pd.Categorical(["test","train","test", "train"]),  # categorical column
        "F":"foo"  # scalar string: repeated on every row
    }
)

Unnamed: 0,A,B,C,D,E,F
0,0.1,2013-01-02,1.0,3,test,foo
1,0.1,2013-01-02,1.0,3,train,foo
2,0.1,2013-01-02,1.0,3,test,foo
3,0.1,2013-01-02,1.0,3,train,foo


##### "A":0.1 -> 하나의 값만 주어진 경우, 그 값이 확장되어 입력됨
##### "A":[0.1,0.2] -> 오류메시지가 나타남
##### => 하나의 값만 입력하거나, DataFrame의 행 개수(index 길이)에 맞춰서 입력해야함

In [14]:
# Keep the mixed-dtype example frame as df2 so the following cells can inspect it.
# Scalars ("A", "B", "F") are broadcast to the length of the array-like columns (4 rows).
df2 = pd.DataFrame(
    {
        "A":0.1,
        "B":pd.Timestamp("20130102"),
        "C":pd.Series(1, index=list(range(4)), dtype="float32"),
        "D":np.array([3]*4, dtype="int32"),
        "E":pd.Categorical(["test","train","test", "train"]),
        "F":"foo"
    }
)

In [15]:
type(df2)

pandas.core.frame.DataFrame

### dtypes
* 각 열(column)의 데이터 type을 알 수 있음

In [16]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### shape
* 배열의 형태를 알려줌(n행,m열)

In [17]:
df2.shape

(4, 6)

## ndim
* 배열의 차원 수 or 배열의 축 수

In [18]:
df2.ndim

2

### info
* 데이터의 전반적인 정보를 보여주는 메소드

In [19]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       4 non-null      float64       
 1   B       4 non-null      datetime64[ns]
 2   C       4 non-null      float32       
 3   D       4 non-null      int32         
 4   E       4 non-null      category      
 5   F       4 non-null      object        
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes


### head & tail
* 상단부분, 하단부분만 출력

In [20]:
df2.head(2)

Unnamed: 0,A,B,C,D,E,F
0,0.1,2013-01-02,1.0,3,test,foo
1,0.1,2013-01-02,1.0,3,train,foo


In [21]:
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F
2,0.1,2013-01-02,1.0,3,test,foo
3,0.1,2013-01-02,1.0,3,train,foo


### sample
* 무작위로 n개의 행을 가져옴 (n을 생략하면 기본값인 1개)

In [22]:
df2.sample(2)

Unnamed: 0,A,B,C,D,E,F
3,0.1,2013-01-02,1.0,3,train,foo
0,0.1,2013-01-02,1.0,3,test,foo


In [23]:
df2.sample(3)

Unnamed: 0,A,B,C,D,E,F
1,0.1,2013-01-02,1.0,3,train,foo
0,0.1,2013-01-02,1.0,3,test,foo
3,0.1,2013-01-02,1.0,3,train,foo


### index

In [24]:
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

### columns

In [25]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

### values
* 행 단위로 나타남

In [26]:
df2.values

array([[0.1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [0.1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [0.1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [0.1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [27]:
type(df2.index)

pandas.core.indexes.numeric.Int64Index

In [28]:
type(df2.columns)

pandas.core.indexes.base.Index

In [29]:
type(df2.values)

numpy.ndarray

### to_numpy
* DataFrame의 값을 NumPy 배열(numpy.ndarray)로 변환 (위의 df2.values와 같은 결과)

In [30]:
df2.to_numpy()

array([[0.1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [0.1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [0.1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [0.1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [31]:
df = pd.DataFrame(np.random.randn(6,4),
                  index=dates,
                  columns=list('ABCD'))

In [32]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-04,1.627132,0.123275,1.134978,-1.032749
2013-01-05,-1.173974,-0.092968,1.943073,1.290428
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129


### describe
* 각 열에 대한 요약 통계

In [33]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.03757,-0.381607,-0.062766,-0.209467
std,0.949733,0.512931,1.387053,1.109842
min,-1.173974,-1.357128,-1.53973,-1.177602
25%,-0.491792,-0.382505,-1.158826,-1.122119
50%,-0.175676,-0.282307,-0.23446,-0.596439
75%,0.158534,-0.127029,0.808333,0.691339
max,1.627132,0.123275,1.943073,1.290428


In [34]:
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,0.1,1.0,3.0
std,0.0,0.0,0.0
min,0.1,1.0,3.0
25%,0.1,1.0,3.0
50%,0.1,1.0,3.0
75%,0.1,1.0,3.0
max,0.1,1.0,3.0


### transpose
* 행과 열의 자리 바꿈

In [35]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.438307,-0.509621,0.086955,1.627132,-1.173974,0.182393
B,-1.357128,-0.3354,-0.229214,0.123275,-0.092968,-0.398206
C,-1.53973,-0.297319,-1.445995,1.134978,1.943073,-0.171601
D,-1.177602,-1.151909,0.975162,-1.032749,1.290428,-0.160129


In [36]:
df.transpose()

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.438307,-0.509621,0.086955,1.627132,-1.173974,0.182393
B,-1.357128,-0.3354,-0.229214,0.123275,-0.092968,-0.398206
C,-1.53973,-0.297319,-1.445995,1.134978,1.943073,-0.171601
D,-1.177602,-1.151909,0.975162,-1.032749,1.290428,-0.160129


### sort_index
* index 정렬

In [37]:
df.sort_index(ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129
2013-01-05,-1.173974,-0.092968,1.943073,1.290428
2013-01-04,1.627132,0.123275,1.134978,-1.032749
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602


In [38]:
df.sort_index(ascending=False, axis=1)

Unnamed: 0,D,C,B,A
2013-01-01,-1.177602,-1.53973,-1.357128,-0.438307
2013-01-02,-1.151909,-0.297319,-0.3354,-0.509621
2013-01-03,0.975162,-1.445995,-0.229214,0.086955
2013-01-04,-1.032749,1.134978,0.123275,1.627132
2013-01-05,1.290428,1.943073,-0.092968,-1.173974
2013-01-06,-0.160129,-0.171601,-0.398206,0.182393


##### 최근 날짜 순으로 index정렬
##### ascending - False : 내림차순 / True : 오름차순 (기본 정렬 방식은 오름차순)
##### axis - 0 또는 생략 : 행기준 정렬 / 1 : 열기준 정렬

### sort_values
* 지정한 column의 값을 기준으로 행을 정렬

In [39]:
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2013-01-05,-1.173974,-0.092968,1.943073,1.290428
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129
2013-01-04,1.627132,0.123275,1.134978,-1.032749


##### by안에 있는 것 기준으로 정렬 

In [40]:
df.sort_values(by=["A","B"], ascending=False)

Unnamed: 0,A,B,C,D
2013-01-04,1.627132,0.123275,1.134978,-1.032749
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-05,-1.173974,-0.092968,1.943073,1.290428


##### 여러개의 columns 기준으로 정렬을 하고 싶다면, by안에 리스트 사용
##### A로 먼저 정렬한 다음, B로 정렬

### df.loc[행,열] 컬럼, 인덱스 이름 그대로 사용
### df.iloc[행,열] 0부터 시작하는 정수 위치(순서) 사용
##### ' ':' ' (범위 지정, slice) -> 대괄호 사용X / loc은 끝 값을 포함하고, iloc은 끝 위치를 포함하지 않음
##### [' ',' '] (원하는 것만 나열, list) -> 대괄호 사용O

In [41]:
df.loc['2013-01-03':'2013-01-04',['B','C']]

Unnamed: 0,B,C
2013-01-03,-0.229214,-1.445995
2013-01-04,0.123275,1.134978


In [42]:
df.loc['2013-01-03':'2013-01-04','B':'C']

Unnamed: 0,B,C
2013-01-03,-0.229214,-1.445995
2013-01-04,0.123275,1.134978


In [43]:
df.iloc[1:3,2:4]

Unnamed: 0,C,D
2013-01-02,-0.297319,-1.151909
2013-01-03,-1.445995,0.975162


In [44]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.509621,-0.297319
2013-01-03,0.086955,-1.445995
2013-01-05,-1.173974,1.943073


## bool
* True / False 로 값 반환

In [45]:
# df[조건]

In [46]:
df[df['A']>0]

Unnamed: 0,A,B,C,D
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-04,1.627132,0.123275,1.134978,-1.032749
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129


In [47]:
df['A']>0

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04     True
2013-01-05    False
2013-01-06     True
Freq: D, Name: A, dtype: bool

In [48]:
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,,,
2013-01-03,0.086955,,,0.975162
2013-01-04,1.627132,0.123275,1.134978,
2013-01-05,,,1.943073,1.290428
2013-01-06,0.182393,,,


##### 조건에 만족하지 않는 값은 NaN으로 나타남

## copy & 추가하기
* df의 값을 복사해서 df2에 담고 'E' columns 추가하기

In [49]:
df2 = df.copy()

In [50]:
df2

Unnamed: 0,A,B,C,D
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-04,1.627132,0.123275,1.134978,-1.032749
2013-01-05,-1.173974,-0.092968,1.943073,1.290428
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129


In [51]:
df2["E"] = ["one", "one", "two", "three", "four", "three"]

In [52]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602,one
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909,one
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,two
2013-01-04,1.627132,0.123275,1.134978,-1.032749,three
2013-01-05,-1.173974,-0.092968,1.943073,1.290428,four
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129,three


In [53]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-04,1.627132,0.123275,1.134978,-1.032749
2013-01-05,-1.173974,-0.092968,1.943073,1.290428
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129


## isin
* 각 값이 주어진 목록에 포함되어 있는지 True / False로 반환 (조건으로 사용하면 True인 행만 출력)

In [54]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602,one
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909,one
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,two
2013-01-04,1.627132,0.123275,1.134978,-1.032749,three
2013-01-05,-1.173974,-0.092968,1.943073,1.290428,four
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129,three


In [56]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,two
2013-01-05,-1.173974,-0.092968,1.943073,1.290428,four


In [57]:
df2[(df2['E']=='two') | (df2['E']=='four')]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,two
2013-01-05,-1.173974,-0.092968,1.943073,1.290428,four


## reindex
* 지정한 새 index / columns에 맞춰 데이터를 다시 배치하고, 기존에 없던 레이블의 값은 NaN으로 채움

In [58]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-04,1.627132,0.123275,1.134978,-1.032749
2013-01-05,-1.173974,-0.092968,1.943073,1.290428
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129


In [61]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+['E'])

In [62]:
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602,
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909,
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,
2013-01-04,1.627132,0.123275,1.134978,-1.032749,


In [63]:
df1.loc[dates[0]:dates[1], 'E'] = 1

In [64]:
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602,1.0
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909,1.0
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,
2013-01-04,1.627132,0.123275,1.134978,-1.032749,


## dropna
* nan값이 있는 행 또는 열 제거 (axis=0 : 행, axis=1 : 열)

In [65]:
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602,1.0
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909,1.0
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,
2013-01-04,1.627132,0.123275,1.134978,-1.032749,


#### how='all' 삭제하려는 행 또는 열의 모든 값이 nan일 때 제거

In [66]:
df1.dropna(how='all', axis=0)

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602,1.0
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909,1.0
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,
2013-01-04,1.627132,0.123275,1.134978,-1.032749,


#### how='any' 삭제하려는 행 또는 열 중에서 nan값이 하나라도 있으면 제거

In [67]:
df1.dropna(how='any', axis=0)

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602,1.0
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909,1.0


In [69]:
df1.dropna(how='any', axis=1)

Unnamed: 0,A,B,C,D
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-04,1.627132,0.123275,1.134978,-1.032749


In [71]:
df1 = df.reindex(index=dates[0:4],
                 columns=list(df.columns)+["E"])
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602,
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909,
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,
2013-01-04,1.627132,0.123275,1.134978,-1.032749,


In [72]:
df1.dropna(how='any', axis=1)

Unnamed: 0,A,B,C,D
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-04,1.627132,0.123275,1.134978,-1.032749


## fillna
* 원하는 값으로 값 채우기

In [73]:
df1 = df.reindex(index=dates[0:4],
                 columns=list(df.columns)+["E"])
df1.loc[dates[0]:dates[1],'E']=1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602,1.0
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909,1.0
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,
2013-01-04,1.627132,0.123275,1.134978,-1.032749,


In [74]:
df1['E'].mean()

1.0

In [75]:
df1.fillna(df1.mean())

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602,1.0
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909,1.0
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,1.0
2013-01-04,1.627132,0.123275,1.134978,-1.032749,1.0


In [76]:
df1 = df.reindex(index=dates[0:4],
                 columns=list(df.columns)+["E"])
df1.loc[dates[0]:dates[1],'E']=1
df1.loc[dates[0], 'A']=np.nan
df1.loc[dates[1], 'B']=np.nan
df1.loc[dates[3], 'C']=np.nan
df1.loc[dates[1], 'D']=np.nan
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,,-1.357128,-1.53973,-1.177602,1.0
2013-01-02,-0.509621,,-0.297319,,1.0
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,
2013-01-04,1.627132,0.123275,,-1.032749,


In [77]:
df1.mean()

A    0.401489
B   -0.487689
C   -1.094348
D   -0.411730
E    1.000000
dtype: float64

In [78]:
df1.fillna(df1.mean())

Unnamed: 0,A,B,C,D,E
2013-01-01,0.401489,-1.357128,-1.53973,-1.177602,1.0
2013-01-02,-0.509621,-0.487689,-0.297319,-0.41173,1.0
2013-01-03,0.086955,-0.229214,-1.445995,0.975162,1.0
2013-01-04,1.627132,0.123275,-1.094348,-1.032749,1.0


# operations
## mean(평균)/ std(표준편차)/ var(분산)/ max/ min/ median

In [84]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-04,1.627132,0.123275,1.134978,-1.032749
2013-01-05,-1.173974,-0.092968,1.943073,1.290428
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129


#### mean(0) -> 열(columns)기준

In [85]:
df.mean()

A   -0.037570
B   -0.381607
C   -0.062766
D   -0.209467
dtype: float64

#### mean(1) -> 행(index)기준

In [86]:
df.mean(1)

2013-01-01   -1.128192
2013-01-02   -0.573562
2013-01-03   -0.153273
2013-01-04    0.463159
2013-01-05    0.491640
2013-01-06   -0.136886
Freq: D, dtype: float64

In [87]:
df.std()

A    0.949733
B    0.512931
C    1.387053
D    1.109842
dtype: float64

In [88]:
df.var()

A    0.901992
B    0.263099
C    1.923917
D    1.231748
dtype: float64

In [89]:
df.max()

A    1.627132
B    0.123275
C    1.943073
D    1.290428
dtype: float64

In [90]:
df.min()

A   -1.173974
B   -1.357128
C   -1.539730
D   -1.177602
dtype: float64

In [91]:
df.median()

A   -0.175676
B   -0.282307
C   -0.234460
D   -0.596439
dtype: float64

## apply
* 함수를 적용할 때 사용

In [92]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-02,-0.509621,-0.3354,-0.297319,-1.151909
2013-01-03,0.086955,-0.229214,-1.445995,0.975162
2013-01-04,1.627132,0.123275,1.134978,-1.032749
2013-01-05,-1.173974,-0.092968,1.943073,1.290428
2013-01-06,0.182393,-0.398206,-0.171601,-0.160129


#### np.cumsum : 주어진 축을 따라 요소의 누적 합계를 반환

In [93]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2013-01-01,-0.438307,-1.357128,-1.53973,-1.177602
2013-01-02,-0.947927,-1.692528,-1.837049,-2.329512
2013-01-03,-0.860972,-1.921742,-3.283044,-1.35435
2013-01-04,0.76616,-1.798467,-2.148066,-2.387099
2013-01-05,-0.407814,-1.891435,-0.204993,-1.096671
2013-01-06,-0.225421,-2.289641,-0.376594,-1.2568


In [94]:
def dataa(x):
    """Return the spread of ``x``: its largest value minus its smallest value.

    ``x`` must provide ``max()`` and ``min()`` methods. When the function is
    passed to ``DataFrame.apply`` it is called with one Series at a time.
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

df.apply(dataa)

A    2.801106
B    1.480403
C    3.482803
D    2.468030
dtype: float64

#### lambda : 한줄 짜리 함수, 함수이름 없음

In [95]:
df.apply(lambda x: x.max()-x.min())

A    2.801106
B    1.480403
C    3.482803
D    2.468030
dtype: float64

## replace 
* 특정 값 변경하기

In [96]:
gender = pd.Series(['남자','여자','여자','여자','남','male'])
gender

0      남자
1      여자
2      여자
3      여자
4       남
5    male
dtype: object

In [97]:
gender.value_counts()

여자      3
남자      1
남       1
male    1
dtype: int64

In [98]:
gender.replace('남','남자')

0      남자
1      여자
2      여자
3      여자
4      남자
5    male
dtype: object

In [99]:
gender

0      남자
1      여자
2      여자
3      여자
4       남
5    male
dtype: object

#### 위와 같이 replace는 원본을 수정하지는 않는다.

In [100]:
# Map both spelling variants onto the canonical label in a single call.
gender.replace({'남': '남자', 'male': '남자'})

0    남자
1    여자
2    여자
3    여자
4    남자
5    남자
dtype: object

In [101]:
gender.str.replace('남', '남자')

0     남자자
1      여자
2      여자
3      여자
4      남자
5    male
dtype: object

#### 문자열 함수는 .str과 한 세트로 사용 (단, replace는 .str 없이도 사용 가능)
#### replace : 값 전체가 일치할 때만 변경, list로 여러 값을 한 번에 변경 가능
#### str.replace : 값의 일부(부분 문자열)만 일치해도 변경 ('남자' -> '남자자'), list 사용X

## concat
* 데이터 속성 형태가 동일한 데이터끼리 합친다

In [103]:
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,1.486517,-0.430491,0.828333,0.752401
1,1.850941,1.247845,-0.335138,-1.565096
2,-0.384389,-0.658247,-0.35889,-0.284741
3,-0.268074,-1.074806,-0.075481,-0.376648
4,-0.312083,-1.00909,1.769617,-0.5037
5,-1.43171,-1.349411,0.064613,0.152677
6,-0.621432,-1.878995,0.817969,-0.2718
7,0.106999,-0.46947,-0.801502,-0.159133
8,-0.763259,-0.597178,2.294462,-1.687624
9,0.671513,-2.027336,0.227551,0.52972


In [104]:
# Cut the ten rows into three consecutive chunks: rows 0-2, 3-6 and 7-9.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pieces

[          0         1         2         3
 0  1.486517 -0.430491  0.828333  0.752401
 1  1.850941  1.247845 -0.335138 -1.565096
 2 -0.384389 -0.658247 -0.358890 -0.284741,
           0         1         2         3
 3 -0.268074 -1.074806 -0.075481 -0.376648
 4 -0.312083 -1.009090  1.769617 -0.503700
 5 -1.431710 -1.349411  0.064613  0.152677
 6 -0.621432 -1.878995  0.817969 -0.271800,
           0         1         2         3
 7  0.106999 -0.469470 -0.801502 -0.159133
 8 -0.763259 -0.597178  2.294462 -1.687624
 9  0.671513 -2.027336  0.227551  0.529720]

In [105]:
pieces[0]

Unnamed: 0,0,1,2,3
0,1.486517,-0.430491,0.828333,0.752401
1,1.850941,1.247845,-0.335138,-1.565096
2,-0.384389,-0.658247,-0.35889,-0.284741


In [106]:
pieces[1]

Unnamed: 0,0,1,2,3
3,-0.268074,-1.074806,-0.075481,-0.376648
4,-0.312083,-1.00909,1.769617,-0.5037
5,-1.43171,-1.349411,0.064613,0.152677
6,-0.621432,-1.878995,0.817969,-0.2718


In [107]:
pieces[2]

Unnamed: 0,0,1,2,3
7,0.106999,-0.46947,-0.801502,-0.159133
8,-0.763259,-0.597178,2.294462,-1.687624
9,0.671513,-2.027336,0.227551,0.52972


#### axis=1 -> 옆으로 합치기, axis=0 -> 위아래로 합치기
#### join='inner' -> 교집합, join='outer' -> 합집합

In [109]:
# Stack the three chunks on top of each other (default axis=0).
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,1.486517,-0.430491,0.828333,0.752401
1,1.850941,1.247845,-0.335138,-1.565096
2,-0.384389,-0.658247,-0.35889,-0.284741
3,-0.268074,-1.074806,-0.075481,-0.376648
4,-0.312083,-1.00909,1.769617,-0.5037
5,-1.43171,-1.349411,0.064613,0.152677
6,-0.621432,-1.878995,0.817969,-0.2718
7,0.106999,-0.46947,-0.801502,-0.159133
8,-0.763259,-0.597178,2.294462,-1.687624
9,0.671513,-2.027336,0.227551,0.52972


In [110]:
# Place the chunks side by side; row labels missing from a chunk are filled with NaN.
pd.concat(pieces, axis=1)

Unnamed: 0,0,1,2,3,0.1,1.1,2.1,3.1,0.2,1.2,2.2,3.2
0,1.486517,-0.430491,0.828333,0.752401,,,,,,,,
1,1.850941,1.247845,-0.335138,-1.565096,,,,,,,,
2,-0.384389,-0.658247,-0.35889,-0.284741,,,,,,,,
3,,,,,-0.268074,-1.074806,-0.075481,-0.376648,,,,
4,,,,,-0.312083,-1.00909,1.769617,-0.5037,,,,
5,,,,,-1.43171,-1.349411,0.064613,0.152677,,,,
6,,,,,-0.621432,-1.878995,0.817969,-0.2718,,,,
7,,,,,,,,,0.106999,-0.46947,-0.801502,-0.159133
8,,,,,,,,,-0.763259,-0.597178,2.294462,-1.687624
9,,,,,,,,,0.671513,-2.027336,0.227551,0.52972


In [111]:
# Inner join on the row labels: the chunks share no row label, so no rows remain.
pd.concat(pieces, axis=1, join='inner')

Unnamed: 0,0,1,2,3,0.1,1.1,2.1,3.1,0.2,1.2,2.2,3.2


In [112]:
# Inner join on the column labels: all chunks have columns 0-3, so every column is kept.
pd.concat(pieces, axis=0, join='inner')

Unnamed: 0,0,1,2,3
0,1.486517,-0.430491,0.828333,0.752401
1,1.850941,1.247845,-0.335138,-1.565096
2,-0.384389,-0.658247,-0.35889,-0.284741
3,-0.268074,-1.074806,-0.075481,-0.376648
4,-0.312083,-1.00909,1.769617,-0.5037
5,-1.43171,-1.349411,0.064613,0.152677
6,-0.621432,-1.878995,0.817969,-0.2718
7,0.106999,-0.46947,-0.801502,-0.159133
8,-0.763259,-0.597178,2.294462,-1.687624
9,0.671513,-2.027336,0.227551,0.52972


## merge
*  두 데이터프레임을 각 데이터에 존재하는 고유값(key)을 기준으로 병합

In [113]:
# Left table: the join key "foo" appears twice, with lval 1 and 2.
left = pd.DataFrame(
    {
        "key": ["foo"] * 2,
        "lval": [1, 2],
    }
)

In [114]:
# Right table: the same key "foo" twice, with rval 4 and 5.
right = pd.DataFrame(
    {
        "key": ["foo"] * 2,
        "rval": [4, 5],
    }
)

In [115]:
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [116]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [117]:
pd.merge(left,right,on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


## join
* merge랑 비슷, 다만 index기준으로 결합

#### set_index : 열을 인덱스로 설정

In [119]:
# Move "key" from a column into the index so the two frames can be joined on it.
# set_index returns a new frame; rebinding the name replaces the discouraged
# inplace=True mutation and keeps the cell chain-friendly.
left = left.set_index('key')
right = right.set_index('key')

In [120]:
left

Unnamed: 0_level_0,lval
key,Unnamed: 1_level_1
foo,1
foo,2


In [121]:
right

Unnamed: 0_level_0,rval
key,Unnamed: 1_level_1
foo,4
foo,5


In [122]:
# Both frames carry "foo" twice in their "key" index, so every left row pairs
# with every right row (2 x 2 = 4 result rows).
left.join(right,on="key")

Unnamed: 0_level_0,lval,rval
key,Unnamed: 1_level_1,Unnamed: 2_level_1
foo,1,4
foo,1,5
foo,2,4
foo,2,5


## groupby
* 같은 값을 하나로 묶어 통계 또는 집계 결과를 얻기 위해 사용하는 것

In [123]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)

In [124]:
df

Unnamed: 0,A,B,C,D
0,foo,one,0.094935,-2.264214
1,bar,one,0.55308,0.657981
2,foo,two,0.344477,0.112213
3,bar,three,-1.663113,-2.210008
4,foo,two,-0.007223,0.916471
5,bar,two,2.841684,0.230081
6,foo,one,-0.077956,0.026132
7,foo,three,-1.213544,-0.338063


In [125]:
df.sum()

A        foobarfoobarfoobarfoofoo
B    oneonetwothreetwotwoonethree
C                         0.87234
D                       -2.869407
dtype: object

In [126]:
df.groupby("A")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000F23A4AD580>

In [127]:
# Sum the numeric columns ("C", "D") within each group of "A".
# numeric_only=True keeps the string column "B" out of the aggregation on every
# pandas version; pandas 2.x would otherwise concatenate its strings into the result.
df.groupby("A").sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.731651,-1.321947
foo,-0.859311,-1.54746


In [128]:
test = df.groupby(["A","B"]).sum()

In [129]:
test

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.55308,0.657981
bar,three,-1.663113,-2.210008
bar,two,2.841684,0.230081
foo,one,0.016979,-2.238081
foo,three,-1.213544,-0.338063
foo,two,0.337254,1.028684


## stack
* 열을 피벗하여 하위 인덱스로 변환

In [130]:
df

Unnamed: 0,A,B,C,D
0,foo,one,0.094935,-2.264214
1,bar,one,0.55308,0.657981
2,foo,two,0.344477,0.112213
3,bar,three,-1.663113,-2.210008
4,foo,two,-0.007223,0.916471
5,bar,two,2.841684,0.230081
6,foo,one,-0.077956,0.026132
7,foo,three,-1.213544,-0.338063


In [131]:
test = df.groupby(["A","B"]).sum()
test

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.55308,0.657981
bar,three,-1.663113,-2.210008
bar,two,2.841684,0.230081
foo,one,0.016979,-2.238081
foo,three,-1.213544,-0.338063
foo,two,0.337254,1.028684


In [132]:
test.stack()

A    B       
bar  one    C    0.553080
            D    0.657981
     three  C   -1.663113
            D   -2.210008
     two    C    2.841684
            D    0.230081
foo  one    C    0.016979
            D   -2.238081
     three  C   -1.213544
            D   -0.338063
     two    C    0.337254
            D    1.028684
dtype: float64

## unstack
* 행을 언피벗하여 하위 열로 변환, stack과 반대

In [133]:
test.stack().unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.55308,0.657981
bar,three,-1.663113,-2.210008
bar,two,2.841684,0.230081
foo,one,0.016979,-2.238081
foo,three,-1.213544,-0.338063
foo,two,0.337254,1.028684


In [134]:
test1 = test.stack()
test1.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.55308,0.657981
bar,three,-1.663113,-2.210008
bar,two,2.841684,0.230081
foo,one,0.016979,-2.238081
foo,three,-1.213544,-0.338063
foo,two,0.337254,1.028684


In [135]:
test1 = test.stack()
test1.unstack(0)

Unnamed: 0_level_0,A,bar,foo
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,C,0.55308,0.016979
one,D,0.657981,-2.238081
three,C,-1.663113,-1.213544
three,D,-2.210008,-0.338063
two,C,2.841684,0.337254
two,D,0.230081,1.028684


In [136]:
test1.unstack(1)

Unnamed: 0_level_0,B,one,three,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,C,0.55308,-1.663113,2.841684
bar,D,0.657981,-2.210008,0.230081
foo,C,0.016979,-1.213544,0.337254
foo,D,-2.238081,-0.338063,1.028684


## pivot_table
* 데이터를 스프레드시트 기반 피벗 테이블로 변환

In [138]:
# Twelve-row example frame: each list is repeated so that every column has 12 entries.
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,  # 4 labels x 3
        "B": ["A", "B", "C"] * 4,  # 3 labels x 4
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,  # 6 labels x 2
        "D": np.random.randn(12),  # random floats; no seed is set in the visible cells, so values differ per run
        "E": np.random.randn(12),
    }
)

In [139]:
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.032511,0.193294
1,one,B,foo,-0.208809,2.299993
2,two,C,foo,-0.435826,0.847822
3,three,A,bar,-1.953169,0.999573
4,one,B,bar,-0.889053,-0.778017
5,one,C,bar,-0.740611,0.867806
6,two,A,foo,0.2302,0.186346
7,three,B,foo,-0.103308,0.168797
8,one,C,foo,0.095932,0.35791
9,one,A,bar,-0.409915,0.188003


In [140]:
# Sum "D" for every (A, B) pair, with one output column per distinct value of "C".
# Combinations that never occur in df show up as NaN.
test = pd.pivot_table(
    df,
    values="D",
    index=["A", "B"],
    columns=["C"],
    aggfunc="sum",
)

In [141]:
test

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.409915,0.032511
one,B,-0.889053,-0.208809
one,C,-0.740611,0.095932
three,A,-1.953169,
three,B,,-0.103308
three,C,-0.724976,
two,A,,0.2302
two,B,0.999096,
two,C,,-0.435826
