In [1]:
import pandas as pd

In [2]:
from pandas import Series, DataFrame

In [3]:
import numpy as np

함수 적용과 매핑

In [4]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-1.633975,-0.270339,-0.29196
Ohio,-2.168335,0.486296,-1.58991
Texas,-0.380667,0.049473,-0.066992
Oregon,2.033423,1.040111,0.172218


In [5]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.633975,0.270339,0.29196
Ohio,2.168335,0.486296,1.58991
Texas,0.380667,0.049473,0.066992
Oregon,2.033423,1.040111,0.172218


In [6]:
f = lambda x: x.max() - x.min()
frame.apply(f)

b    4.201758
d    1.310449
e    1.762129
dtype: float64

In [7]:
frame.apply(f, axis='columns')

Utah      0.943690
Ohio      2.612958
Texas     1.418388
Oregon    1.370797
dtype: float64

In [8]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.68619,-0.730595,-1.364714
max,0.053674,2.04056,1.04084


In [9]:
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.44,0.51,0.28
Ohio,-0.57,2.04,1.04
Texas,0.05,-0.72,-1.36
Oregon,-0.69,-0.73,0.64


In [10]:
frame['e'].map(format)

Utah       0.28
Ohio       1.04
Texas     -1.36
Oregon     0.64
Name: e, dtype: object

정렬과 순위

In [11]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [12]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [13]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [14]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [15]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [16]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [17]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [18]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [19]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [20]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [21]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [22]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [23]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [24]:
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


중복 색인

In [25]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [26]:
obj.index.is_unique

False

In [27]:
obj['a']

a    0
a    1
dtype: int64

In [28]:
obj['c']

4

In [29]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,0.894843,0.431021,0.507158
a,0.43444,0.349419,1.145505
b,-1.010369,-0.361897,0.350393
b,-0.859032,-1.797875,0.046911


In [30]:
df.loc['b']

Unnamed: 0,0,1,2
b,-1.010369,-0.361897,0.350393
b,-0.859032,-1.797875,0.046911


기술 통계 계산과 요약

In [31]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [32]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [33]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [34]:
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [35]:
df.idxmax()

one    b
two    d
dtype: object

In [36]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [37]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [38]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

상관관계와 공분산

In [39]:
conda install pandas-datareader

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [40]:
import pandas_datareader.data as web

In [41]:
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in all_data.items()})

In [42]:
returns = price.pct_change()

In [43]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-08-20,0.02219,-0.005572,0.023271,0.022113
2020-08-21,0.051532,8.1e-05,-0.00727,-0.000841
2020-08-24,0.01196,0.020461,0.003145,0.004923
2020-08-25,-0.008204,-0.008275,0.013009,0.012605
2020-08-26,0.013599,-0.003771,0.02162,0.027459


In [44]:
returns['MSFT'].corr(returns['IBM'])

0.581584631809679

In [45]:
returns['MSFT'].cov(returns['IBM'])

0.00016168660919265227

In [46]:
returns.MSFT.corr(returns.IBM)

0.581584631809679

In [47]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.5101,0.704032,0.658077
IBM,0.5101,1.0,0.581585,0.535056
MSFT,0.704032,0.581585,1.0,0.783108
GOOG,0.658077,0.535056,0.783108,1.0


In [48]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000337,0.00015,0.000224,0.000198
IBM,0.00015,0.000257,0.000162,0.000141
MSFT,0.000224,0.000162,0.000301,0.000223
GOOG,0.000198,0.000141,0.000223,0.000269


In [49]:
returns.corrwith(returns.IBM)

AAPL    0.510100
IBM     1.000000
MSFT    0.581585
GOOG    0.535056
dtype: float64

In [50]:
returns.corrwith(volume)

AAPL   -0.086625
IBM    -0.099827
MSFT   -0.050771
GOOG   -0.149919
dtype: float64

유일값, 값 세기, 멤버십

In [51]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [52]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [53]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [54]:
pd.value_counts(obj.values, sort=False)

c    3
b    2
a    3
d    1
dtype: int64

In [55]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [56]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [57]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [58]:
to_match = pd.Series(['c', 'a', 'b', 'c', 'a'])

In [59]:
unique_vals = pd.Series(['c', 'b', 'a'])

In [60]:
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 0, 2], dtype=int64)

In [61]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [62]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
