# DAY6 行列混合操作

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seed NumPy's global RNG so the random values below are reproducible.
np.random.seed(1)
# 10 rows x 4 columns of standard-normal draws, labelled A-D and indexed by
# 10 consecutive dates starting at 2022-01-01.
df = pd.DataFrame(
    data=np.random.randn(10, 4),
    index=pd.date_range(start='20220101', periods=10),
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891
2022-01-05,-0.172428,-0.877858,0.042214,0.582815
2022-01-06,-1.100619,1.144724,0.901591,0.502494
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769
2022-01-08,-0.267888,0.530355,-0.691661,-0.396754
2022-01-09,-0.687173,-0.845206,-0.671246,-0.012665
2022-01-10,-1.11731,0.234416,1.659802,0.742044


### 1. 行列切片

- loc: location 传入的是 label
- iloc: integer location 传入的是整数（位置）
---

1. 使用中括号 `[]`：`df.loc[...]`、`df.iloc[...]`
2. 中括号内的顺序是“行, 列”
3. 用 `:` 表示切片；`loc` 的标签切片包含终点，`iloc` 的位置切片不包含终点

In [3]:
# Label-based lookup: row label '2022-01-03', column label 'A' -> a single value.
df.loc['2022-01-03','A']

0.31903909605709857

In [4]:
# One row label plus a list of column labels; the result is a Series
# indexed by the selected column names.
df.loc['2022-01-03',['A','C']]

A    0.319039
C    1.462108
Name: 2022-01-03 00:00:00, dtype: float64

In [5]:
# Label slice: every row from '2022-01-03' to the end, columns A and C only.
df.loc['2022-01-03':,['A','C']]

Unnamed: 0,A,C
2022-01-03,0.319039,1.462108
2022-01-04,-0.322417,1.133769
2022-01-05,-0.172428,0.042214
2022-01-06,-1.100619,0.901591
2022-01-07,0.900856,-0.12289
2022-01-08,-0.267888,-0.691661
2022-01-09,-0.687173,-0.671246
2022-01-10,-1.11731,1.659802


In [6]:
# Position-based lookup (zero-based): row position 2, column position 2,
# i.e. the value at 2022-01-03 in column C.
df.iloc[2,2]

1.462107937044974

In [7]:
# Positional slices: rows from position 2 onward, first two columns (A and B).
df.iloc[2:,:2]

Unnamed: 0,A,B
2022-01-03,0.319039,-0.24937
2022-01-04,-0.322417,-0.384054
2022-01-05,-0.172428,-0.877858
2022-01-06,-1.100619,1.144724
2022-01-07,0.900856,-0.683728
2022-01-08,-0.267888,0.530355
2022-01-09,-0.687173,-0.845206
2022-01-10,-1.11731,0.234416


### 2. 行列筛选（进阶）

In [8]:
# Element-wise comparison: produces a boolean Series with one entry per row.
df['A']>0

2022-01-01     True
2022-01-02     True
2022-01-03     True
2022-01-04    False
2022-01-05    False
2022-01-06    False
2022-01-07     True
2022-01-08    False
2022-01-09    False
2022-01-10    False
Freq: D, Name: A, dtype: bool

In [9]:
# Use the boolean Series as a row mask: keeps only the rows where A > 0.
df[df['A']>0]

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


In [10]:
# Combine two conditions; each comparison is wrapped in its own parentheses
# because & binds more tightly than > and <.
df[(df['A']>0) & (df['C']<0)]  # & is the element-wise "and"

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


In [11]:
# Keep rows that satisfy at least one of the two conditions.
df[(df['A']>1) | (df['D']<-1)]  # | is the element-wise "or"

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891


1. `df.query(expr)` 的 expr 参数是字符串（str）
2. 列名中存在空格时，需要用反引号（键盘上数字 1 左边的按键）把列名括起来
3. 可以在表达式中引用 Python 变量，在变量名前加 `@`

In [12]:
# Same filter as df[df['A']>0], written as a query expression string.
df.query('A>0')

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


In [13]:
# `and` / `or` and parentheses can be combined inside the expression string.
df.query('A>0 and (C<0 or D>0)') 

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


In [14]:
# Query-string form of df[(df['A']>1) | (df['D']<-1)].
df.query('A>1 or D<-1')

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891


In [15]:
# Threshold kept in a Python variable so the query below can reference it.
thres = .4
# Inside the expression, @thres reads the Python variable thres defined above.
df.query('A>@thres or D<-1*@thres')

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


- `.isin` 相当于 SQL 中的 `IN`

In [16]:
# Keep rows whose index label appears in the list. '2022-02-01' is not in
# the index, so only the two labels that are present come back.
df[df.index.isin(['2022-02-01','2022-01-01','2022-01-04'])]

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891


In [17]:
# ~ inverts the boolean array: True for rows whose label is NOT in the list.
~df.index.isin(['2022-01-01','2022-01-04'])

array([False,  True,  True, False,  True,  True,  True,  True,  True,
        True])

In [18]:
# Apply the inverted mask: drops the two listed dates and keeps the rest.
df[~df.index.isin(['2022-01-01','2022-01-04'])]

Unnamed: 0,A,B,C,D
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-05,-0.172428,-0.877858,0.042214,0.582815
2022-01-06,-1.100619,1.144724,0.901591,0.502494
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769
2022-01-08,-0.267888,0.530355,-0.691661,-0.396754
2022-01-09,-0.687173,-0.845206,-0.671246,-0.012665
2022-01-10,-1.11731,0.234416,1.659802,0.742044


In [19]:
# Display the full frame again for reference.
df

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-02,0.865408,-2.301539,1.744812,-0.761207
2022-01-03,0.319039,-0.24937,1.462108,-2.060141
2022-01-04,-0.322417,-0.384054,1.133769,-1.099891
2022-01-05,-0.172428,-0.877858,0.042214,0.582815
2022-01-06,-1.100619,1.144724,0.901591,0.502494
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769
2022-01-08,-0.267888,0.530355,-0.691661,-0.396754
2022-01-09,-0.687173,-0.845206,-0.671246,-0.012665
2022-01-10,-1.11731,0.234416,1.659802,0.742044


In [20]:
# The 4 rows with the smallest values in column C, smallest first.
df.nsmallest(4, 'C')

Unnamed: 0,A,B,C,D
2022-01-08,-0.267888,0.530355,-0.691661,-0.396754
2022-01-09,-0.687173,-0.845206,-0.671246,-0.012665
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769


In [21]:
# The 3 rows with the largest values in column A, largest first.
df.nlargest(3, 'A')

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-01-07,0.900856,-0.683728,-0.12289,-0.935769
2022-01-02,0.865408,-2.301539,1.744812,-0.761207


### 习题

In [22]:
# Exercise data: reseed the global RNG, then build 1000 rows of
# standard-normal draws indexed by consecutive month starts from 2022-01-01.
np.random.seed(1)
df = pd.DataFrame(
    data=np.random.randn(1000, 4),
    index=pd.date_range(start='20220101', periods=1000, freq='MS'),
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2022-01-01,1.624345,-0.611756,-0.528172,-1.072969
2022-02-01,0.865408,-2.301539,1.744812,-0.761207
2022-03-01,0.319039,-0.249370,1.462108,-2.060141
2022-04-01,-0.322417,-0.384054,1.133769,-1.099891
2022-05-01,-0.172428,-0.877858,0.042214,0.582815
...,...,...,...,...
2104-12-01,-0.138881,2.652140,-0.656247,0.279562
2105-01-01,-0.607715,0.729814,-0.887188,0.077327
2105-02-01,0.073416,0.416026,-1.879200,0.575459
2105-03-01,0.102062,1.184304,-0.794843,-0.125903


In [23]:
# Boolean mask with one entry per row of df: True where the row's month name
# starts with "M" (March and May).
# month_name() returns English names by default, so unlike strftime('%b')
# the result does not depend on the machine's locale. .str.startswith is
# vectorized and replaces the manual append loop; tolist() keeps mylist a
# plain Python list of booleans.
mylist = df.index.month_name().str.startswith('M').tolist()

In [24]:
# mylist holds one boolean per row; indexing with it keeps the True rows.
df[mylist]

Unnamed: 0,A,B,C,D
2022-03-01,0.319039,-0.249370,1.462108,-2.060141
2022-05-01,-0.172428,-0.877858,0.042214,0.582815
2023-03-01,0.838983,0.931102,0.285587,0.885141
2023-05-01,0.488518,-0.075572,1.131629,1.519817
2024-03-01,-1.094912,0.169382,0.740556,-0.953701
...,...,...,...,...
2103-03-01,-0.729965,-1.591216,0.595189,0.273334
2103-05-01,2.742155,-0.510218,0.833351,0.088922
2104-03-01,2.478005,-1.778695,0.395753,2.003272
2104-05-01,-0.182916,1.193456,-0.959332,0.263967


In [25]:
# Same filter without an explicit loop: format each date as its month
# abbreviation, turn that Index into a Series aligned to df.index, then
# apply startswith('M') element-wise to get the boolean mask.
df[df.index.strftime('%b').to_series(index = df.index).apply(lambda x: x.startswith('M'))]

Unnamed: 0,A,B,C,D
2022-03-01,0.319039,-0.249370,1.462108,-2.060141
2022-05-01,-0.172428,-0.877858,0.042214,0.582815
2023-03-01,0.838983,0.931102,0.285587,0.885141
2023-05-01,0.488518,-0.075572,1.131629,1.519817
2024-03-01,-1.094912,0.169382,0.740556,-0.953701
...,...,...,...,...
2103-03-01,-0.729965,-1.591216,0.595189,0.273334
2103-05-01,2.742155,-0.510218,0.833351,0.088922
2104-03-01,2.478005,-1.778695,0.395753,2.003272
2104-05-01,-0.182916,1.193456,-0.959332,0.263967


In [26]:
# Shorter variant: Index.map applies the lambda to each month abbreviation,
# so no intermediate Series is needed.
df[df.index.strftime('%b').map(lambda x:x.startswith('M'))]

Unnamed: 0,A,B,C,D
2022-03-01,0.319039,-0.249370,1.462108,-2.060141
2022-05-01,-0.172428,-0.877858,0.042214,0.582815
2023-03-01,0.838983,0.931102,0.285587,0.885141
2023-05-01,0.488518,-0.075572,1.131629,1.519817
2024-03-01,-1.094912,0.169382,0.740556,-0.953701
...,...,...,...,...
2103-03-01,-0.729965,-1.591216,0.595189,0.273334
2103-05-01,2.742155,-0.510218,0.833351,0.088922
2104-03-01,2.478005,-1.778695,0.395753,2.003272
2104-05-01,-0.182916,1.193456,-0.959332,0.263967


In [27]:
# Three chained steps:
#   1. keep rows whose month abbreviation starts with "M",
#   2. of those, keep rows with A > 0 and C < 0,
#   3. return the 10 rows with the largest D.
# The mask in step 1 could also be written with .str.startswith("M")
# instead of map() + lambda.
(
    df[df.index.strftime('%b').map(lambda abbr: abbr.startswith('M'))]
    .query('A>0 and C<0')
    .nlargest(10, 'D')
)

Unnamed: 0,A,B,C,D
2036-03-01,0.39788,-0.996011,-1.195862,2.50598
2031-03-01,0.451284,-1.68406,-1.16017,1.350107
2085-03-01,1.487621,0.839731,-1.256278,1.236639
2049-03-01,0.638929,-0.46566,-0.967012,1.217716
2065-03-01,0.033929,0.26492,-0.235312,1.054868
2077-05-01,1.587083,0.666381,-0.72598,0.979425
2071-03-01,0.066393,0.540602,-1.318897,0.845426
2035-05-01,0.984952,1.071252,-1.097154,0.838635
2059-05-01,0.873674,0.933253,-0.216155,0.833679
2070-05-01,1.135339,-1.272776,-0.987621,0.659419


In [28]:
# For each of 11 evenly spaced thresholds n between -1 and 1, count the rows
# where both A+B and C-D exceed n. `@n` makes query() read the loop variable.
for n in np.linspace(-1, 1, 11):
    rows_above = df.query('A+B>@n and C-D>@n')
    print(f'当n={round(n,2)}时，满足条件的行数是{len(rows_above)}')

当n=-1.0时，满足条件的行数是579
当n=-0.8时，满足条件的行数是506
当n=-0.6时，满足条件的行数是422
当n=-0.4时，满足条件的行数是360
当n=-0.2时，满足条件的行数是298
当n=0.0时，满足条件的行数是237
当n=0.2时，满足条件的行数是176
当n=0.4时，满足条件的行数是143
当n=0.6时，满足条件的行数是99
当n=0.8时，满足条件的行数是77
当n=1.0时，满足条件的行数是54
