# DAY6 行列混合操作

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Build the demo frame used throughout the notebook: 10 consecutive daily rows
# starting 2022-01-01 and four columns A-D filled with standard-normal draws.
# The seed is fixed first so the displayed values are reproducible.
np.random.seed(1)
df = pd.DataFrame(
    data=np.random.randn(10, 4),
    index=pd.date_range(start='20220101', periods=10),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891
2022-01-05,-0.172428,-0.877858,0.042214,0.582815
2022-01-06,-1.100619,1.144724,0.901591,0.502494
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769
2022-01-08,-0.267888,0.530355,-0.691661,-0.396754
2022-01-09,-0.687173,-0.845206,-0.671246,-0.012665
2022-01-10,-1.11731,0.234416,1.659802,0.742044


### 1. 行列切片

- loc: location 传入的是 label
- iloc: integer location 传入的是整数（位置）
---

1. 使用中括号 `[]` 进行索引
2. 中括号内先写行、再写列，用逗号分隔：`[行, 列]`
3. 用 `:` 表示切片；`loc` 的标签切片包含结束标签，`iloc` 的位置切片不包含结束位置

In [3]:
# .loc with one row label and one column label returns a single scalar value.
df.loc['2022-01-03','A']

0.31903909605709857

In [4]:
# One row label plus a list of column labels returns a Series indexed by
# the selected columns.
df.loc['2022-01-03',['A','C']]

A    0.319039
C    1.462108
Name: 2022-01-03 00:00:00, dtype: float64

In [5]:
# Open-ended label slice: keep the row labelled 2022-01-03 and every row after
# it, restricted to columns A and C.
df.loc['2022-01-03':,['A','C']]

Unnamed: 0,A,C
2022-01-03,0.319039,1.462108
2022-01-04,-0.322417,1.133769
2022-01-05,-0.172428,0.042214
2022-01-06,-1.100619,0.901591
2022-01-07,0.900856,-0.12289
2022-01-08,-0.267888,-0.691661
2022-01-09,-0.687173,-0.671246
2022-01-10,-1.11731,1.659802


In [6]:
# .iloc uses 0-based integer positions: third row, third column (column C).
df.iloc[2,2]

1.462107937044974

In [7]:
# Positional slices: rows from position 2 onward, first two columns (A and B).
df.iloc[2:,:2]

Unnamed: 0,A,B
2022-01-03,0.319039,-0.24937
2022-01-04,-0.322417,-0.384054
2022-01-05,-0.172428,-0.877858
2022-01-06,-1.100619,1.144724
2022-01-07,0.900856,-0.683728
2022-01-08,-0.267888,0.530355
2022-01-09,-0.687173,-0.845206
2022-01-10,-1.11731,0.234416


### 2. 行列筛选（进阶）

In [5]:
# Comparing a column with a scalar yields a boolean Series on the same index.
df['A']>0

2022-01-01     True
2022-01-02     True
2022-01-03     True
2022-01-04    False
2022-01-05    False
2022-01-06    False
2022-01-07     True
2022-01-08    False
2022-01-09    False
2022-01-10    False
Freq: D, Name: A, dtype: bool

In [6]:
# Indexing the frame with that boolean Series keeps only the rows marked True.
df[df['A']>0]

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


In [10]:
# Combine masks element-wise with & ; each comparison needs its own parentheses
# because & binds more tightly than > and <.
df[(df['A']>0) & (df['C']<0)]  # and

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


In [11]:
# Combine masks element-wise with | ; a row is kept when either condition holds.
df[(df['A']>1) | (df['D']<-1)]  # or

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891


`df.query(expr)` 的用法：

1. 传入的 `expr` 是字符串（str）
2. 列名中存在空格时，需要用反引号 `` ` ``（键盘上数字 1 左边的按键）把列名括起来
3. 可以用 `@变量名` 引用外部变量

In [12]:
# Same filter as df[df['A']>0], written as a query expression string.
df.query('A>0')

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


In [13]:
# Query expressions accept the words and / or, with parentheses for grouping.
df.query('A>0 and (C<0 or D>0)') 

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


In [14]:
# Query-string form of the mask (df['A']>1) | (df['D']<-1).
df.query('A>1 or D<-1')

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891


In [18]:
# A Python variable can be referenced inside the expression by prefixing its
# name with @ ; here the same threshold is used for A and, negated, for D.
thres = .4
df.query('A>@thres or D<-1*@thres')

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


- `.isin` 相当于 SQL 中的 `IN`：判断每个值是否属于给定的集合

In [16]:
# index.isin(...) builds a boolean mask of the index labels found in the list.
# '2022-02-01' is not in this index, so it matches no row.
df[df.index.isin(['2022-02-01','2022-01-01','2022-01-04'])]

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891


In [26]:
# ~ inverts the boolean mask element-wise: True where the label is NOT listed.
~df.index.isin(['2022-01-01','2022-01-04'])

array([False,  True,  True, False,  True,  True,  True,  True,  True,
        True])

In [17]:
# Use the inverted mask to drop the listed dates and keep every other row.
df[~df.index.isin(['2022-01-01','2022-01-04'])]

Unnamed: 0,A,B,C,D
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-05,-0.172428,-0.877858,0.042214,0.582815
2022-01-06,-1.100619,1.144724,0.901591,0.502494
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769
2022-01-08,-0.267888,0.530355,-0.691661,-0.396754
2022-01-09,-0.687173,-0.845206,-0.671246,-0.012665
2022-01-10,-1.11731,0.234416,1.659802,0.742044


In [27]:
# Show the full frame again for reference.
df

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891
2022-01-05,-0.172428,-0.877858,0.042214,0.582815
2022-01-06,-1.100619,1.144724,0.901591,0.502494
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769
2022-01-08,-0.267888,0.530355,-0.691661,-0.396754
2022-01-09,-0.687173,-0.845206,-0.671246,-0.012665
2022-01-10,-1.11731,0.234416,1.659802,0.742044


In [18]:
# The 4 rows with the smallest values in column C, smallest first.
df.nsmallest(4, 'C')

Unnamed: 0,A,B,C,D
2022-01-08,-0.267888,0.530355,-0.691661,-0.396754
2022-01-09,-0.687173,-0.845206,-0.671246,-0.012665
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


In [19]:
# The 3 rows with the largest values in column A, largest first.
df.nlargest(3, 'A')

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769
2022-01-02,0.865408,-2.301539,1.744812,-0.761207


### 习题

In [20]:
# Exercise data: 5 rows indexed by month-start dates (freq='MS') from
# 2022-01-01, with columns A-D of standard-normal draws. Re-seeding gives the
# same leading values as the daily frame built earlier.
np.random.seed(1)
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=pd.date_range(start='20220101', periods=5, freq='MS'),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-02-01,0.865408,-2.301539,1.744812,-0.761207
2022-03-01,0.319039,-0.24937,1.462108,-2.060141
2022-04-01,-0.322417,-0.384054,1.133769,-1.099891
2022-05-01,-0.172428,-0.877858,0.042214,0.582815


In [21]:
# Keep the rows whose month name starts with "M" (March and May here).
# month_name() returns English names by default, so the result does not depend
# on the process locale the way strftime('%b') does, and .str.startswith is
# vectorised, so no Python-level lambda is needed.
df[df.index.month_name().str.startswith('M')]

Unnamed: 0,A,B,C,D
2022-03-01,0.319039,-0.24937,1.462108,-2.060141
2022-05-01,-0.172428,-0.877858,0.042214,0.582815


In [22]:
# Column arithmetic is allowed inside the expression: keep rows where
# A+B > 0 and C-D > 0.
df.query('A+B>0 and C-D>0')

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-03-01,0.319039,-0.24937,1.462108,-2.060141


In [23]:
# Sweep the threshold i over 11 evenly spaced values from -1 to 1 and report
# how many rows satisfy both A+B > i and C-D > i at each step.
# The loop variable must stay named `i` because the query string refers to @i.
for i in np.linspace(-1, 1, 11):
    matched_rows = df.query('A+B>@i and C-D>@i').shape[0]
    print(f'当i={round(i,2)}时，满足条件的行数是{matched_rows}')

当i=-1.0时，满足条件的行数是3
当i=-0.8时，满足条件的行数是3
当i=-0.6时，满足条件的行数是2
当i=-0.4时，满足条件的行数是2
当i=-0.2时，满足条件的行数是2
当i=0.0时，满足条件的行数是2
当i=0.2时，满足条件的行数是1
当i=0.4时，满足条件的行数是1
当i=0.6时，满足条件的行数是0
当i=0.8时，满足条件的行数是0
当i=1.0时，满足条件的行数是0
