In [155]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys

%matplotlib inline

# DataFrame

## Create DataFrame

`pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)`

- `data` : numpy ndarray (structured or homogeneous), dict, or DataFrame. Dict can contain Series, arrays, constants, or list-like objects

- `index` : Index or array-like Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided

- `dtype` : dtype, default None. Data type to force. Only a single dtype is allowed. If None, infer

- `columns` : Index or array-like Column labels to use for resulting frame. Will default to RangeIndex (0, 1, 2, ..., n) if no column labels are provided

### From Dictionary

In [682]:
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d, index=['a', 'b'], dtype=np.int8)
df

Unnamed: 0,col1,col2
a,1,3
b,2,4


In [683]:
df.dtypes # If None, inferred dtype is int64.

col1    int8
col2    int8
dtype: object

### From numpy ndarray

In [684]:
df2 = pd.DataFrame(np.random.randint(low=0, high=10, size=(10, 5)), columns=['a', 'b', 'c', 'd', 'e'])
df2

Unnamed: 0,a,b,c,d,e
0,7,5,5,2,2
1,7,8,4,1,9
2,8,3,5,5,7
3,7,1,5,9,0
4,2,3,0,7,3
5,5,1,0,8,2
6,5,6,5,9,4
7,7,5,2,4,3
8,3,8,5,8,6
9,9,8,1,0,0


### From list

In [685]:
data = [[12, 2], [0, 4], [10, 20], [1, 4], [7, 1], [16, 36]] # data list
tuples = [('cobra', 'mark i'), ('cobra', 'mark ii'),('sidewinder', 'mark i'), 
          ('sidewinder', 'mark ii'),('viper', 'mark ii'), ('viper', 'mark iii')]
index = pd.MultiIndex.from_tuples(tuples)
index

MultiIndex(levels=[['cobra', 'sidewinder', 'viper'], ['mark i', 'mark ii', 'mark iii']],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 1, 2]])

In [686]:
df = pd.DataFrame(data, columns=['max_speed', 'sheild'], index=index)
df

Unnamed: 0,Unnamed: 1,max_speed,sheild
cobra,mark i,12,2
cobra,mark ii,0,4
sidewinder,mark i,10,20
sidewinder,mark ii,1,4
viper,mark ii,7,1
viper,mark iii,16,36


## DataFrame Methods

In [687]:
d = list(range(10, 13))
df = pd.DataFrame(d)
df

Unnamed: 0,0
0,10
1,11
2,12


### column, index

In [688]:
df.columns

RangeIndex(start=0, stop=1, step=1)

In [689]:
df.columns = ['col1'] # rename column(s)
df

Unnamed: 0,col1
0,10
1,11
2,12


In [690]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [691]:
df.index = ['a', 'b', 'c']
df

Unnamed: 0,col1
a,10
b,11
c,12


In [692]:
df_ = df.set_index(['col1'])
df_.index

Int64Index([10, 11, 12], dtype='int64', name='col1')

#### Add new columns

In [693]:
df['new_col'] = 13 # add new column. Can't use df.new_col
df

Unnamed: 0,col1,new_col
a,10,13
b,11,13
c,12,13


### Select 

#### Select vertical pieces

In [694]:
df.new_col # pandas.core.series.Series # Here we can use df.new_col

a    13
b    13
c    13
Name: new_col, dtype: int64

In [695]:
df[['new_col', 'col1']]

Unnamed: 0,new_col,col1
a,13,10
b,13,11
c,13,12


#### Select horizontal pieces

In [696]:
df.loc['a'] # pandas.core.series.Series

col1       10
new_col    13
Name: a, dtype: int64

In [697]:
df.loc['a':'b'] # df.loc[inclusive:inclusive]

Unnamed: 0,col1,new_col
a,10,13
b,11,13


#### Select both vertically and horizontally

In [698]:
df.loc['a', 'new_col'] # numpy.int64

13

In [699]:
df.loc['a', 'col1':'new_col'] # pandas.core.series.Series

col1       10
new_col    13
Name: a, dtype: int64

In [700]:
df.loc['a':'b', 'col1':'new_col'] # pandas.core.frame.DataFrame ## 'a':'b' can be replaced by df.index[0:2]

Unnamed: 0,col1,new_col
a,10,13
b,11,13


In [701]:
df.loc[['a','c'], ['col1', 'new_col']] # pandas.core.frame.DataFrame

Unnamed: 0,col1,new_col
a,10,13
c,12,13


#### Select by boolean array

In [702]:
df.loc[[True, True, False]] # select first two rows

Unnamed: 0,col1,new_col
a,10,13
b,11,13


In [703]:
df.loc[df['col1']>10] # select rows whose col1 > 10

Unnamed: 0,col1,new_col
b,11,13
c,12,13


In [704]:
df.loc[lambda df: df['col1']>10, ['new_col']] # select rows whose col1 > 10 and show only their new_col column

Unnamed: 0,new_col
b,13
c,13


### Stack, Unstack

In [705]:
stack = df.stack() # Bring the columns and place them in the index
stack

a  col1       10
   new_col    13
b  col1       11
   new_col    13
c  col1       12
   new_col    13
dtype: int64

In [706]:
stack.index # The index now includes the column names

MultiIndex(levels=[['a', 'b', 'c'], ['col1', 'new_col']],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [707]:
unstack = df.unstack()
unstack

col1     a    10
         b    11
         c    12
new_col  a    13
         b    13
         c    13
dtype: int64

In [708]:
unstack.index

MultiIndex(levels=[['col1', 'new_col'], ['a', 'b', 'c']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [709]:
df = stack.unstack()
df

Unnamed: 0,col1,new_col
a,10,13
b,11,13
c,12,13


In [710]:
df.T # exchange columns and indices

Unnamed: 0,a,b,c
col1,10,11,12
new_col,13,13,13


### groupby

In [11]:
d = {'num1':[1,1,2,2,3,3],
     'num2':[1,2,3,1,2,3],
     'letter':['a','a','a','b','b','b']}

df = pd.DataFrame(d)
df

Unnamed: 0,letter,num1,num2
0,a,1,1
1,a,1,2
2,a,2,3
3,b,2,1
4,b,3,2
5,b,3,3


In [12]:
letter = df.groupby(['letter']).sum()
letter

Unnamed: 0_level_0,num1,num2
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4,6
b,8,6


In [19]:
print(df.groupby(['letter']))
print(df.groupby(['letter'])['letter'])
df.groupby(['letter'])['letter'].count()

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x122d66b38>
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x122d66be0>


letter
a    3
b    3
Name: letter, dtype: int64

In [713]:
letter_num = df.groupby(['letter', 'num1']).sum()
letter_num

Unnamed: 0_level_0,Unnamed: 1_level_0,num2
letter,num1,Unnamed: 2_level_1
a,1,3
a,2,3
b,2,1
b,3,5


In [714]:
letter_num.index

MultiIndex(levels=[['a', 'b'], [1, 2, 3]],
           labels=[[0, 0, 1, 1], [0, 1, 1, 2]],
           names=['letter', 'num1'])

In [715]:
letter_num = df.groupby(['letter', 'num1'], as_index=False).sum()
letter_num

Unnamed: 0,letter,num1,num2
0,a,1,3
1,a,2,3
2,b,2,1
3,b,3,5


In [716]:
letter_num.index

Int64Index([0, 1, 2, 3], dtype='int64')

# Application

## Calc for outliers

In [717]:
# One state label per monthly observation; the same labels are reused for both years
States = ['NY', 'NY', 'NY', 'NY', 'FL', 'FL', 'GA', 'GA', 'FL', 'FL'] 
# First year: ten month-start timestamps beginning 2012-01-01
data = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10]
idx = pd.date_range('1/1/2012', periods=10, freq='MS')
df1 = pd.DataFrame(data, index=idx, columns=['Revenue'])
df1['State'] = States

# Second year: ten month-start timestamps beginning 2013-01-01
data2 = [10.0, 10.0, 9, 9, 8, 8, 7, 7, 6, 6]
idx2 = pd.date_range('1/1/2013', periods=10, freq='MS') # @1
df2 = pd.DataFrame(data2, index=idx2, columns=['Revenue'])
df2['State'] = States

# Stack the two years row-wise into a single 20-row frame
df = pd.concat([df1, df2])
df

Unnamed: 0,Revenue,State
2012-01-01,1.0,NY
2012-02-01,2.0,NY
2012-03-01,3.0,NY
2012-04-01,4.0,NY
2012-05-01,5.0,FL
2012-06-01,6.0,FL
2012-07-01,7.0,GA
2012-08-01,8.0,GA
2012-09-01,9.0,FL
2012-10-01,10.0,FL


@1: `freq='MS'` generates month-start dates. See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases for the full list of offset aliases.

### Method 1: groupby.transform

In [718]:
newdf = df.copy()
StateMonth = newdf.groupby(['State', lambda x:x.month])
StateMonth.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue
State,Unnamed: 1_level_1,Unnamed: 2_level_1
FL,5,13.0
FL,6,14.0
FL,9,15.0
FL,10,16.0
GA,7,14.0
GA,8,15.0
NY,1,11.0
NY,2,12.0
NY,3,12.0
NY,4,13.0


In [719]:
# Select 'Revenue' explicitly so every transform returns a single Series.
# Transforming the whole grouped frame would also feed the string 'State'
# column (and any column added to newdf below) through the lambda.
revenue_by_group = StateMonth['Revenue']
newdf['x-Mean'] = revenue_by_group.transform(lambda x: abs(x-x.mean()))
newdf['1.96*std'] = revenue_by_group.transform(lambda x: 1.96*x.std())
# A row is an outlier when its deviation exceeds 1.96 group standard deviations
newdf['Outlier'] = newdf['x-Mean'] > newdf['1.96*std']
newdf

Unnamed: 0,Revenue,State,x-Mean,1.96*std,Outlier
2012-01-01,1.0,NY,4.5,12.473364,False
2012-02-01,2.0,NY,4.0,11.087434,False
2012-03-01,3.0,NY,3.0,8.315576,False
2012-04-01,4.0,NY,2.5,6.929646,False
2012-05-01,5.0,FL,1.5,4.157788,False
2012-06-01,6.0,FL,1.0,2.771859,False
2012-07-01,7.0,GA,0.0,0.0,False
2012-08-01,8.0,GA,0.5,1.385929,False
2012-09-01,9.0,FL,1.5,4.157788,False
2012-10-01,10.0,FL,2.0,5.543717,False


### Method 2: groupby.apply

In [720]:
newdf = df.copy()

StateMonth = newdf.groupby(['State', lambda x:x.month])

def s(group):
    """Flag outliers within a single group of rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one group; must contain a numeric 'Revenue' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with three extra columns:
        'x-Mean' (absolute deviation of 'Revenue' from the group mean),
        '1.96*std' (1.96 times the group standard deviation) and
        'Outlier' (True where 'x-Mean' exceeds '1.96*std').
    """
    # Work on a copy so the frame handed in by groupby.apply is never mutated
    group = group.copy()
    revenue = group['Revenue']
    group['x-Mean'] = abs(revenue - revenue.mean())
    group['1.96*std'] = 1.96 * revenue.std()
    group['Outlier'] = group['x-Mean'] > group['1.96*std']
    return group

newdf2 = StateMonth.apply(s)
newdf2

Unnamed: 0,Revenue,State,x-Mean,1.96*std,Outlier
2012-01-01,1.0,NY,4.5,12.473364,False
2012-02-01,2.0,NY,4.0,11.087434,False
2012-03-01,3.0,NY,3.0,8.315576,False
2012-04-01,4.0,NY,2.5,6.929646,False
2012-05-01,5.0,FL,1.5,4.157788,False
2012-06-01,6.0,FL,1.0,2.771859,False
2012-07-01,7.0,GA,0.0,0.0,False
2012-08-01,8.0,GA,0.5,1.385929,False
2012-09-01,9.0,FL,1.5,4.157788,False
2012-10-01,10.0,FL,2.0,5.543717,False


### Compare groupby.transform and groupby.apply

#### groupby.transform

In [721]:
df_ = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar'],
                    'B' : ['one', 'one', 'two', 'three','two', 'two'],
                    'C' : [1, 5, 5, 2, 5, 5],
                    'D' : [2.0, 5., 8., 1., 2., 9.]})
df_

Unnamed: 0,A,B,C,D
0,foo,one,1,2.0
1,bar,one,5,5.0
2,foo,two,5,8.0
3,bar,three,2,1.0
4,foo,two,5,2.0
5,bar,two,5,9.0


In [722]:
grouped = df_.groupby('A')
# Standardise only the numeric columns: the lambda cannot be applied to the
# string column 'B', so select 'C' and 'D' explicitly instead of relying on
# pandas to drop non-numeric columns.
grouped[['C', 'D']].transform(lambda x: (x - x.mean()) / x.std()) # @1

Unnamed: 0,C,D
0,-1.154701,-0.57735
1,0.57735,0.0
2,0.57735,1.154701
3,-1.154701,-1.0
4,0.57735,-0.57735
5,0.57735,1.0


@1: `transform` calls the function on **each group**; the function must return a result indexed like the group it received, and the pieces are put back together into an object with the same index as the original (here a DataFrame with the numeric columns `C` and `D`).

####  groupby.apply 

In [723]:
ser = pd.Series([0, 1, 2], index='a a b'.split())
ser

a    0
a    1
b    2
dtype: int64

In [724]:
ser.index

Index(['a', 'a', 'b'], dtype='object')

In [725]:
g = ser.groupby(ser.index)
g.apply(lambda x:  x*2 if x.name == 'b' else x/2) # @1

a    0.0
a    0.5
b    4.0
dtype: float64

@1: Apply function ``func``  group-wise and combine the results together.
The function passed to ``apply`` must take a series as its first
argument and return a dataframe, a series or a scalar. ``apply`` will
then take care of combining the results back together into a single
dataframe or series. ``apply`` is therefore a highly flexible
grouping method.

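In short: the function passed to `transform` must return a result indexed like the group it received (or a scalar, which is broadcast across the group), so the output always lines up row-for-row with the original data. The function passed to `apply` also receives the whole group, so it can use group-level properties such as the mean or std too, but it may return a scalar, a Series or a DataFrame of any shape, which makes `apply` the more flexible of the two.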

### Method 3: Consider all data

In [726]:
newdf = df.copy()

newdf['x-Mean'] = abs(newdf['Revenue'] - newdf['Revenue'].mean())
newdf['1.96*std'] = 1.96*newdf['Revenue'].std()
newdf['Outlier'] = newdf['x-Mean'] > newdf['1.96*std']
newdf

Unnamed: 0,Revenue,State,x-Mean,1.96*std,Outlier
2012-01-01,1.0,NY,5.75,5.200273,True
2012-02-01,2.0,NY,4.75,5.200273,False
2012-03-01,3.0,NY,3.75,5.200273,False
2012-04-01,4.0,NY,2.75,5.200273,False
2012-05-01,5.0,FL,1.75,5.200273,False
2012-06-01,6.0,FL,0.75,5.200273,False
2012-07-01,7.0,GA,0.25,5.200273,False
2012-08-01,8.0,GA,1.25,5.200273,False
2012-09-01,9.0,FL,2.25,5.200273,False
2012-10-01,10.0,FL,3.25,5.200273,False


### If non-Gaussian distribution: 

In [727]:
# Work on a copy so the original df is left untouched
newdf = df.copy()

State = newdf.groupby('State')
revenue_by_state = State['Revenue']

# Per-state quartiles, broadcast back onto every row of that state
q1 = revenue_by_state.transform(lambda x: x.quantile(q=.25))
q3 = revenue_by_state.transform(lambda x: x.quantile(q=.75))
iqr = q3 - q1

# Fences sit 1.5 interquartile ranges below Q1 and above Q3
newdf['Lower'] = q1 - (1.5*iqr)
newdf['Upper'] = q3 + (1.5*iqr)
# Anything outside the fences is flagged
newdf['Outlier'] = (newdf['Revenue'] < newdf['Lower']) | (newdf['Revenue'] > newdf['Upper'])
newdf

Unnamed: 0,Revenue,State,Lower,Upper,Outlier
2012-01-01,1.0,NY,-7.0,19.0,False
2012-02-01,2.0,NY,-7.0,19.0,False
2012-03-01,3.0,NY,-7.0,19.0,False
2012-04-01,4.0,NY,-7.0,19.0,False
2012-05-01,5.0,FL,2.625,11.625,False
2012-06-01,6.0,FL,2.625,11.625,False
2012-07-01,7.0,GA,6.625,7.625,False
2012-08-01,8.0,GA,6.625,7.625,True
2012-09-01,9.0,FL,2.625,11.625,False
2012-10-01,10.0,FL,2.625,11.625,False


# Export Data

In [170]:
# df.to_csv('data.csv') # columns=None, header=True, index=True, index_label=None
# df.to_csv('data.txt') 
# df.to_excel('data.xls') # sheet_name='Sheet1', columns=None, header=True, index=True, 
                          # index_label=None, startrow=0, startcol=0
# df.to_json('data.json')

# Load Data

In [729]:
# pd.read_csv('data.csv') # header='infer', names=None, index_col=None, dtype=None
# pd.read_csv('data.txt') 
# pd.read_excel('data.xls') # sheet_name=0, header=0, names=None, index_col=None
# pd.read_json('data.json') 

# Merge, Join, Concat, Append

Reference：

[Pandas中的拼接操作(concat,append,join,merge)](https://blog.csdn.net/guofei_fly/article/details/85455813)

[PANDAS 数据合并与重塑(concat篇)](https://blog.csdn.net/stevenkwong/article/details/52528616)

[PANDAS 数据合并与重塑(join/merge篇)](https://blog.csdn.net/stevenkwong/article/details/52540605)

## Merge

Merge on normal columns.

pandas 的顶级方法 (也是 dataframe 数据类型的方法)，提供了类似于 SQL 数据库连接操作的功能，支持左联、右联、内联和外联等全部四种SQL连接操作类型

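`pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)`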

既可作为pandas的顶级方法使用，也可作为DataFrame数据结构的方法进行调用
常用参数说明：
- `how`:{'left’, ‘right’, ‘outer’, ‘inner’}, 默认‘inner’，类似于SQL的内联。'left’类似于SQL的左联；'right’类似于SQL的右联；
    ‘outer’类似于SQL的全联。
- `on`:进行合并的参照列名，必须一样。若为None，方法会自动匹配两张表中相同的列名
- `left_on`: 左边df进行连接的列
- `right_on`: 右边df进行连接的列
- `suffixes`: 左、右列名称后缀（默认 `('_x', '_y')`）
- `validate`：默认None，可定义为“one_to_one” 、“one_to_many” 、“many_to_one”和“many_to_many”，即验证是否一对一、一对多、多对一或
    多对多关系

In [164]:
import pandas as pd

d = {'row1': [1, 2, 3], 'row2': [4, 5, 6]}
df = pd.DataFrame(d)
df.index = ['a', 'b', 'c']
df = df.T
df

Unnamed: 0,a,b,c
row1,1,2,3
row2,4,5,6


In [165]:
dd = {'a': [1, 1, 4], 'd': ['x', 'y', 'z']}
dff = pd.DataFrame(dd)
dff

Unnamed: 0,a,d
0,1,x
1,1,y
2,4,z


In [154]:
df.merge(dff) # merge finds the columns the two frames share and joins on them; pass on='a' to name the key column explicitly

Unnamed: 0,a,b,c,d
0,1,2,3,x
1,1,2,3,y
2,4,5,6,z


## Join

For join, index column is always the first thing to consider.

dataframe 数据类型的方法，提供了列方向的拼接操作，支持左联、右联、内联和外联四种操作类型

`DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)`

常用参数说明：

- `on`：参照的左边指定列（可能需要先进行set_index操作），若未指明，按照index进行join
- `how`：{‘left’, ‘right’, ‘outer’, ‘inner’}, 默认‘left’，即按照左边df的index（若声明了on，则按照对应的列）；若为‘right’则按照右边df的index；若为‘inner’则为内联方式；若为‘outer’则为全联方式。
- `sort`：是否按照join的key对应的值大小进行排序，默认False
- `lsuffix`，`rsuffix`：当left和right两个df的列名出现冲突时候，通过设定后缀的方式避免错误

In [167]:
import pandas as pd


d = {'row1': [1, 2, 3], 'row2': [4, 5, 6]}
df = pd.DataFrame(d)
df.index = ['a', 'b', 'c']
df = df.T
df

Unnamed: 0,a,b,c
row1,1,2,3
row2,4,5,6


In [168]:
dd = {'a': [1, 1, 4], 'd': ['x', 'y', 'z']}
dff = pd.DataFrame(dd)
dff

Unnamed: 0,a,d
0,1,x
1,1,y
2,4,z


In [149]:
df.set_index('a').join(dff) # df's index is now [1, 4] while dff's index is [0, 1, 2]; only label 1 appears in both

Unnamed: 0_level_0,b,c,a,d
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,3,1.0,y
4,5,6,,


In [148]:
df.join(dff, on='a', lsuffix='_l', rsuffix='_r') # on='a' looks df's column 'a' up in dff's index, but 'a' is still an ordinary column in both frames, so suffixes are required

Unnamed: 0,a_l,b,c,a_r,d
row1,1,2,3,1.0,y
row2,4,5,6,,


In [150]:
df.set_index('a').join(dff.set_index('a'))

Unnamed: 0_level_0,b,c,d
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,3,x
1,2,3,y
4,5,6,z


## Concat

pandas 的顶级方法，提供了 `axis` 设置可用于df间**行方向**或**列方向**进行内联或外联拼接操作

`concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
           keys=None, levels=None, names=None, verify_integrity=False,
           copy=True)`

常用参数说明：
- `axis`：拼接轴方向，默认为0，沿行拼接；若为1，沿列拼接
- `join`：默认外联'outer'，拼接另一轴所有的label，缺失值用NaN填充；内联'inner'，只拼接另一轴相同的label；
- `join_axes`: 指定需要拼接的轴的labels，可在join既不内联又不外联的时候使用
- `ignore_index`：对index进行重新排序
- `keys`：多重索引

In [None]:
import pandas as pd

# Helper used to build every example DataFrame in this section
def df_maker(cols, idxs):
    """Build a DataFrame whose cells spell out their own position.

    Each element of ``cols`` becomes a column label and ``idxs`` becomes the
    index; the cell in column ``c`` and row ``i`` holds the string
    ``c + str(i)`` (for example ``'a1'``).
    """
    cells = {}
    for label in cols:
        cells[label] = [label + str(row) for row in idxs]
    return pd.DataFrame(cells, index=idxs)

In [156]:
df1 = df_maker('abc',[1,2,3])
df1

Unnamed: 0,a,b,c
1,a1,b1,c1
2,a2,b2,c2
3,a3,b3,c3


In [157]:
df2 = df_maker('cde',[3,4,5])
df2

Unnamed: 0,c,d,e
3,c3,d3,e3
4,c4,d4,e4
5,c5,d5,e5


In [159]:
pd.concat([df1, df2])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d,e
1,a1,b1,c1,,
2,a2,b2,c2,,
3,a3,b3,c3,,
3,,,c3,d3,e3
4,,,c4,d4,e4
5,,,c5,d5,e5


In [160]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,a,b,c,c.1,d,e
1,a1,b1,c1,,,
2,a2,b2,c2,,,
3,a3,b3,c3,c3,d3,e3
4,,,,c4,d4,e4
5,,,,c5,d5,e5


## Append

dataframe 数据类型的方法，提供了**行方向**的拼接操作。注意：`DataFrame.append` 自 pandas 1.4 起已弃用，并在 pandas 2.0 中移除；新代码请改用 `pd.concat`。

`append(self, other, ignore_index=False, verify_integrity=False)`

常用参数说明：
- `other`：另一个df
- `ignore_index`：若为True，则对index进行重排
- `verify_integrity`：对index的唯一性进行验证，若有重复，报错。若已经设置了ignore_index，则该参数无效

In [None]:
import pandas as pd

# Helper used to build every example DataFrame in this section
def df_maker(cols, idxs):
    """Build a DataFrame whose cells spell out their own position.

    Each element of ``cols`` becomes a column label and ``idxs`` becomes the
    index; the cell in column ``c`` and row ``i`` holds the string
    ``c + str(i)`` (for example ``'a1'``).
    """
    cells = {}
    for label in cols:
        cells[label] = [label + str(row) for row in idxs]
    return pd.DataFrame(cells, index=idxs)

In [161]:
df1 = df_maker('abc',[1,2,3])
df1

Unnamed: 0,a,b,c
1,a1,b1,c1
2,a2,b2,c2
3,a3,b3,c3


In [162]:
df2 = df_maker('cde',[3,4,5])
df2

Unnamed: 0,c,d,e
3,c3,d3,e3
4,c4,d4,e4
5,c5,d5,e5


In [163]:
df1.append(df2) # same result as pd.concat([df1, df2])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,a,b,c,d,e
1,a1,b1,c1,,
2,a2,b2,c2,,
3,a3,b3,c3,,
3,,,c3,d3,e3
4,,,c4,d4,e4
5,,,c5,d5,e5


# With Numpy

In [181]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7,8,9], 'd': [10,11,12]})
df

Unnamed: 0,a,b,c,d
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12


In [192]:
# Arithmetic between DataFrames aligns on column labels; ['a', 'b'] and ['c', 'd'] share none, so every cell of the result is NaN
df[['a', 'b']] - df[['c', 'd']]

Unnamed: 0,a,b,c,d
0,,,,
1,,,,
2,,,,


In [193]:
# Converting both slices to plain ndarrays discards the labels, so the subtraction is positional (a - c, b - d)
np.array(df[['a', 'b']], dtype='int') - np.array(df[['c','d']], dtype='int')

array([[-6, -6],
       [-6, -6],
       [-6, -6]])