In [1]:
import pandas as pd
import numpy as np

In [2]:
index = pd.date_range('1/1/2000', periods=8)
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [3]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

a   -0.463352
b   -0.520496
c   -0.046909
d    0.318434
e   -0.233496
dtype: float64

In [4]:
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
2000-01-01,0.374304,1.415713,-0.09653
2000-01-02,1.77481,-0.357298,-0.132917
2000-01-03,0.441579,-1.189512,-0.021344
2000-01-04,0.265214,-1.428729,-1.057423
2000-01-05,-0.925222,1.010544,-0.292167
2000-01-06,0.458922,0.602199,-0.819307
2000-01-07,-0.019045,1.049963,1.498954
2000-01-08,0.979287,-0.666694,1.542465


In [5]:
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
              major_axis=pd.date_range('1/1/2000', periods=5),
              minor_axis=['A', 'B', 'C', 'D'])
wp

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  exec(code_obj, self.user_global_ns, self.user_ns)


<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00
Minor_axis axis: A to D

### 头和尾

In [6]:
long_series = pd.Series(np.random.randn(1000))
long_series.head()

0    0.037798
1    0.836171
2    0.422760
3   -0.150723
4   -0.312759
dtype: float64

In [7]:
long_series.tail(3)

997    1.306334
998    1.972151
999    0.795137
dtype: float64

### 属性和基础数据

In [8]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,0.374304,1.415713,-0.09653
2000-01-02,1.77481,-0.357298,-0.132917


In [9]:
df.columns = [x.lower() for x in df.columns]
df

Unnamed: 0,a,b,c
2000-01-01,0.374304,1.415713,-0.09653
2000-01-02,1.77481,-0.357298,-0.132917
2000-01-03,0.441579,-1.189512,-0.021344
2000-01-04,0.265214,-1.428729,-1.057423
2000-01-05,-0.925222,1.010544,-0.292167
2000-01-06,0.458922,0.602199,-0.819307
2000-01-07,-0.019045,1.049963,1.498954
2000-01-08,0.979287,-0.666694,1.542465


In [10]:
s.array

<PandasArray>
[-0.46335170984936624,  -0.5204956052774767, -0.04690866330254646,
   0.3184343829832564,  -0.2334955883424004]
Length: 5, dtype: float64

In [11]:
s.index.array

<PandasArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

In [12]:
s.to_numpy()

array([-0.46335171, -0.52049561, -0.04690866,  0.31843438, -0.23349559])

In [13]:
np.asarray(s)

array([-0.46335171, -0.52049561, -0.04690866,  0.31843438, -0.23349559])

In [14]:
ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
ser.to_numpy(dtype=object)

array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
       Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
      dtype=object)

In [15]:
ser.to_numpy(dtype="datetime64[ns]")

array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [16]:
df.to_numpy()

array([[ 0.37430361,  1.415713  , -0.09653013],
       [ 1.77481003, -0.35729772, -0.13291746],
       [ 0.44157946, -1.18951177, -0.02134401],
       [ 0.26521385, -1.42872931, -1.0574226 ],
       [-0.92522185,  1.0105444 , -0.29216731],
       [ 0.45892246,  0.60219935, -0.81930745],
       [-0.01904504,  1.04996319,  1.49895439],
       [ 0.97928745, -0.66669416,  1.54246464]])

### 加速操作

In [17]:
pd.set_option('compute.use_bottleneck', False)
pd.set_option('compute.use_numexpr', False)

### 灵活二进制操作

In [18]:
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
    'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
    'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])
})
df

Unnamed: 0,one,three,two
a,-1.158248,,-1.406855
b,-0.702384,0.353842,-0.852675
c,0.515502,1.682529,-0.295702
d,,-1.164221,0.012505


In [19]:
row = df.iloc[1]
column = df['two']
df.sub(row, axis='columns')

Unnamed: 0,one,three,two
a,-0.455864,,-0.55418
b,0.0,0.0,0.0
c,1.217886,1.328688,0.556974
d,,-1.518062,0.86518


In [20]:
df.sub(row, axis=1)

Unnamed: 0,one,three,two
a,-0.455864,,-0.55418
b,0.0,0.0,0.0
c,1.217886,1.328688,0.556974
d,,-1.518062,0.86518


In [21]:
df.sub(row, axis='index')

Unnamed: 0,one,three,two
a,,,
b,,,
c,,,
d,,,
one,,,
three,,,
two,,,


In [22]:
df.sub(column, axis=0)

Unnamed: 0,one,three,two
a,0.248607,,0.0
b,0.150291,1.206517,0.0
c,0.811203,1.978231,0.0
d,,-1.176725,0.0


In [23]:
dfmi = df.copy()
dfmi.index = pd.MultiIndex.from_tuples([(1, 'a'), (1, 'b'),
                                        (1, 'c'), (2, 'a')],
                                       names=['first', 'second'])
dfmi.sub(column, axis=0, level='second')

Unnamed: 0_level_0,Unnamed: 1_level_0,one,three,two
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,0.248607,,0.0
1,b,0.150291,1.206517,0.0
1,c,0.811203,1.978231,0.0
2,a,,0.242635,1.41936


In [24]:
major_mean = wp.mean(axis='major')
major_mean

Unnamed: 0,Item1,Item2
A,-0.004877,-0.419428
B,-0.788043,0.491937
C,0.374434,-0.107481
D,0.380518,-0.104108


In [25]:
wp.sub(major_mean, axis='major')

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  return self._combine_frame(other, func, axis=axis)


<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00
Minor_axis axis: A to D

In [26]:
s = pd.Series(np.arange(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [27]:
div, rem = divmod(s, 3)
div

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int64

In [28]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int64

In [29]:
idx = pd.Index(np.arange(10))
idx

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [30]:
div, rem = divmod(idx, 3)
div

Int64Index([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype='int64')

In [31]:
rem

Int64Index([0, 1, 2, 0, 1, 2, 0, 1, 2, 0], dtype='int64')

In [32]:
div, rem = divmod(s, [2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
div

0    0
1    0
2    0
3    1
4    1
5    1
6    1
7    1
8    1
9    1
dtype: int64

In [33]:
rem

0    0
1    1
2    2
3    0
4    0
5    1
6    1
7    2
8    2
9    3
dtype: int64

### 缺少填充值的数据/操作

In [34]:
df

Unnamed: 0,one,three,two
a,-1.158248,,-1.406855
b,-0.702384,0.353842,-0.852675
c,0.515502,1.682529,-0.295702
d,,-1.164221,0.012505


In [35]:
df2 = pd.DataFrame(np.array([[1.400810, -1.643041, 1.000000],
                             [-0.356470, 1.045911, 0.395023],
                             [0.797268, 0.924515, -0.007090],
                             [np.nan, 1.553693, -1.670830]]), index=['a', 'b', 'c', 'd'], columns=['one', 'two', 'three'])
df2

Unnamed: 0,one,two,three
a,1.40081,-1.643041,1.0
b,-0.35647,1.045911,0.395023
c,0.797268,0.924515,-0.00709
d,,1.553693,-1.67083


In [36]:
df + df2

Unnamed: 0,one,three,two
a,0.242562,,-3.049896
b,-1.058854,0.748865,0.193236
c,1.31277,1.675439,0.628813
d,,-2.835051,1.566198


In [37]:
df.add(df2, fill_value=0)

Unnamed: 0,one,three,two
a,0.242562,1.0,-3.049896
b,-1.058854,0.748865,0.193236
c,1.31277,1.675439,0.628813
d,,-2.835051,1.566198


In [38]:
df.gt(df2)

Unnamed: 0,one,three,two
a,False,False,True
b,False,False,False
c,False,True,False
d,False,True,False


### 灵活比较

In [39]:
df2.ne(df)

Unnamed: 0,one,three,two
a,True,True,True
b,True,True,True
c,True,True,True
d,True,True,True


### 布尔缩减

In [40]:
(df > 0).all()

one      False
three    False
two      False
dtype: bool

In [41]:
(df > 0).any()

one      True
three    True
two      True
dtype: bool

In [42]:
(df > 0).any().any()

True

In [43]:
df.empty

False

In [44]:
pd.DataFrame(columns=list('ABC')).empty

True

In [45]:
pd.Series([True]).bool()

True

In [46]:
pd.Series([False]).bool()

False

In [47]:
pd.DataFrame([[True]]).bool()

True

In [48]:
pd.DataFrame([[False]]).bool()

False

### 比较对象是否相等

In [49]:
df + df == df * 2

Unnamed: 0,one,three,two
a,True,False,True
b,True,True,True
c,True,True,True
d,False,True,True


In [50]:
(df + df == df * 2).all()

one      False
three    False
two       True
dtype: bool

In [51]:
np.nan == np.nan

False

In [52]:
(df + df).equals(df * 2)

True

In [53]:
df1 = pd.DataFrame({'col': ['foo', 0, np.nan]})
df2 = pd.DataFrame({'col': [np.nan, 0, 'foo']}, index=[2, 1, 0])
df1.equals(df2)

False

In [54]:
df1.equals(df2.sort_index())

True

### 比较类似数组的对象

In [55]:
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [56]:
pd.Index(['foo', 'bar', 'baz']) == 'foo'

array([ True, False, False])

In [57]:
pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo', 'bar'])

ValueError: Can only compare identically-labeled Series objects

In [None]:
pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo'])

In [None]:
np.array([1, 2, 3]) == np.array([2])

In [None]:
np.array([1, 2, 3]) == np.array([1, 2])

### 组合重叠数据

In [None]:
df1 = pd.DataFrame({'A': [1., np.nan, 3., 5., np.nan],
                    'B': [np.nan, 2., 3., np.nan, 6.]})
df2 = pd.DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
                    'B': [np.nan, np.nan, 3., 4., 6., 8.]})

In [None]:
df1

In [None]:
df2

In [58]:
df1.combine_first(df2)

Unnamed: 0,col
0,foo
1,0
2,


### 描述性统计

In [59]:
def combiner(x, y):
    return np.where(pd.isna(x), y, x)

In [60]:
df

Unnamed: 0,one,three,two
a,-1.158248,,-1.406855
b,-0.702384,0.353842,-0.852675
c,0.515502,1.682529,-0.295702
d,,-1.164221,0.012505


In [61]:
df.mean(0)

one     -0.448377
three    0.290717
two     -0.635682
dtype: float64

In [62]:
df.mean(1)

a   -1.282552
b   -0.400406
c    0.634110
d   -0.575858
dtype: float64

In [63]:
df.sum(0, skipna=False)

one           NaN
three         NaN
two     -2.542728
dtype: float64

In [64]:
df.sum(axis=1, skipna=True)

a   -2.565103
b   -1.201218
c    1.902329
d   -1.151716
dtype: float64

In [65]:
ts_stand = (df - df.mean()) / df.std()
ts_stand.std()

one      1.0
three    1.0
two      1.0
dtype: float64

In [66]:
xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)
xs_stand.std(1)

a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64

In [67]:
df.cumsum()

Unnamed: 0,one,three,two
a,-1.158248,,-1.406855
b,-1.860632,0.353842,-2.259531
c,-1.34513,2.036371,-2.555232
d,,0.87215,-2.542728


In [68]:
np.mean(df['one'])

-0.44837679669661124

In [69]:
np.mean(df['one'].to_numpy())

nan

In [70]:
series = pd.Series(np.random.randn(500))
series[20:500] = np.nan
series[10:20] = 5
series.nunique()

11

In [71]:
series = pd.Series(np.random.randn(1000))
series[::2] = np.nan
series.describe()

count    500.000000
mean      -0.040444
std        0.959247
min       -2.820263
25%       -0.690273
50%       -0.053576
75%        0.595738
max        2.820262
dtype: float64

In [72]:
frame = pd.DataFrame(np.random.randn(1000, 5),
                     columns=['a', 'b', 'c', 'd', 'e'])
frame.iloc[::2] = np.nan
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.007574,-0.127937,-0.04187,-0.005547,-0.032304
std,0.992601,0.977219,0.986683,0.969432,1.017345
min,-2.792484,-3.100089,-2.822789,-2.696494,-2.653405
25%,-0.708073,-0.765885,-0.71524,-0.629647,-0.73335
50%,-0.023506,-0.150542,-0.074518,-0.036171,-0.004246
75%,0.681514,0.549863,0.594947,0.658378,0.623858
max,3.096502,2.97648,2.98566,2.532404,3.46302


In [73]:
series.describe(percentiles=[.05, .25, .75, .95])

count    500.000000
mean      -0.040444
std        0.959247
min       -2.820263
5%        -1.512834
25%       -0.690273
50%       -0.053576
75%        0.595738
95%        1.601484
max        2.820262
dtype: float64

In [74]:
s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])

In [75]:
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [76]:
frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)})
frame.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [77]:
frame.describe(include=['object'])

Unnamed: 0,a
count,4
unique,2
top,Yes
freq,2


In [78]:
frame.describe(include=['number'])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [79]:
frame.describe(include='all')

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


In [80]:
s1 = pd.Series(np.random.randn(5))
s1

0   -2.217575
1    0.377829
2   -2.418630
3   -2.241221
4    1.016761
dtype: float64

In [81]:
s1.idxmin(), s1.idxmax()

(2, 4)

In [82]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
df1

Unnamed: 0,A,B,C
0,-2.319458,0.596974,-1.333744
1,-0.752551,-0.321508,-1.099623
2,-0.583096,1.650922,-0.709284
3,0.120387,-0.059448,0.71377
4,0.393303,0.972209,-0.158566


In [83]:
df1.idxmin(axis=0)

A    0
B    1
C    0
dtype: int64

In [84]:
df1.idxmax(axis=1)

0    B
1    B
2    B
3    C
4    B
dtype: object

In [85]:
df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba'))
df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [86]:
df3['A'].idxmin()

'd'

In [87]:
data = np.random.randint(0, 7, size=50)
data

array([4, 5, 3, 5, 5, 4, 0, 3, 0, 6, 6, 4, 0, 6, 2, 4, 1, 4, 2, 2, 3, 4,
       5, 3, 2, 6, 0, 2, 5, 4, 1, 4, 6, 4, 2, 3, 0, 2, 2, 5, 6, 6, 2, 0,
       2, 0, 1, 3, 5, 0])

In [88]:
s = pd.Series(data)
s.value_counts()

2    10
4     9
0     8
6     7
5     7
3     6
1     3
dtype: int64

In [89]:
pd.value_counts(data)

2    10
4     9
0     8
6     7
5     7
3     6
1     3
dtype: int64

In [90]:
s5 = pd.Series([1, 1, 3, 3, 5, 5, 7, 7, 7])
s5.mode()

0    7
dtype: int64

In [91]:
df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
                    "B": np.random.randint(-10, 15, size=50)})
df5.mode()

Unnamed: 0,A,B
0,4,14


In [92]:
arr = np.random.randn(20)
factor = pd.cut(arr, 4)
factor

[(0.137, 0.918], (-0.644, 0.137], (-0.644, 0.137], (0.918, 1.699], (-0.644, 0.137], ..., (0.137, 0.918], (0.137, 0.918], (-1.428, -0.644], (-0.644, 0.137], (-0.644, 0.137]]
Length: 20
Categories (4, interval[float64]): [(-1.428, -0.644] < (-0.644, 0.137] < (0.137, 0.918] < (0.918, 1.699]]

In [93]:
factor = pd.cut(arr, [-5, -1, 0, 1, 5])
factor

[(0, 1], (-1, 0], (-1, 0], (1, 5], (0, 1], ..., (0, 1], (0, 1], (-5, -1], (-1, 0], (-1, 0]]
Length: 20
Categories (4, interval[int64]): [(-5, -1] < (-1, 0] < (0, 1] < (1, 5]]

In [94]:
arr = np.random.randn(30)
factor = pd.qcut(arr, [0, .25, .5, .75, 1])
factor

[(-0.175, 0.723], (-2.791, -0.938], (-2.791, -0.938], (0.723, 1.643], (0.723, 1.643], ..., (-0.938, -0.175], (0.723, 1.643], (-0.175, 0.723], (-2.791, -0.938], (-2.791, -0.938]]
Length: 30
Categories (4, interval[float64]): [(-2.791, -0.938] < (-0.938, -0.175] < (-0.175, 0.723] < (0.723, 1.643]]

In [95]:
pd.value_counts(factor)

(0.723, 1.643]      8
(-2.791, -0.938]    8
(-0.175, 0.723]     7
(-0.938, -0.175]    7
dtype: int64

In [96]:
arr = np.random.randn(20)
factor = pd.cut(arr, [-np.inf, 0, np.inf])
factor

[(0.0, inf], (-inf, 0.0], (-inf, 0.0], (0.0, inf], (0.0, inf], ..., (0.0, inf], (0.0, inf], (-inf, 0.0], (-inf, 0.0], (0.0, inf]]
Length: 20
Categories (2, interval[float64]): [(-inf, 0.0] < (0.0, inf]]

### 功能应用

In [97]:
f(g(h(df), arg1=1), arg2=2, arg3=3)

NameError: name 'f' is not defined

In [None]:
(df.pipe(h)
   .pipe(g, arg1=1)
   .pipe(f, arg2=2, arg3=3))

In [98]:
import statsmodels.formula.api as sm
bb = pd.read_csv('data/baseball.csv', index_col='id')
(bb.query('h > 0')
   .assign(ln_h=lambda df: np.log(df.h))
   .pipe((sm.ols, 'data'), 'hr ~ ln_h  + year + g + C(lg)')
   .fit()
   .summary()
)

FileNotFoundError: [Errno 2] File b'data/baseball.csv' does not exist: b'data/baseball.csv'

In [None]:
df.apply(np.mean)

In [None]:
df.apply(np.mean, axis=1)

In [None]:
df.apply(lambda x: x.max() - x.min())

In [None]:
df.apply(np.cumsum)

In [None]:
df.apply(np.exp)

In [99]:
df.apply('mean')

one     -0.448377
three    0.290717
two     -0.635682
dtype: float64

In [100]:
df.apply('mean', axis=1)

a   -1.282552
b   -0.400406
c    0.634110
d   -0.575858
dtype: float64

In [101]:
tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], index=pd.date_range('1/1/2000', periods=1000))
tsdf.apply(lambda x: x.idxmax())

A   2002-04-29
B   2000-01-23
C   2001-09-02
dtype: datetime64[ns]

In [102]:
def substract_and_divide(x, sub, divide=1):
    return (x - sub) / divide

In [103]:
df.apply(substract_and_divide, args=(5, ), divide=3)

Unnamed: 0,one,three,two
a,-2.052749,,-2.135618
b,-1.900795,-1.548719,-1.950892
c,-1.494833,-1.105824,-1.765234
d,,-2.05474,-1.662498


In [104]:
tsdf

Unnamed: 0,A,B,C
2000-01-01,-0.885030,0.519193,-0.315662
2000-01-02,-0.506911,0.780193,0.561234
2000-01-03,-0.061337,-0.566011,0.130622
2000-01-04,0.435786,-0.680732,-1.291759
2000-01-05,-0.581685,0.531013,-0.317229
2000-01-06,-0.631151,-0.633974,1.818261
2000-01-07,0.602184,-1.384053,-0.210380
2000-01-08,0.516327,0.128852,-0.650490
2000-01-09,1.236779,-0.046858,1.643519
2000-01-10,-0.356763,-1.254073,-0.465999


In [105]:
tsdf.apply(pd.Series.interpolate)

Unnamed: 0,A,B,C
2000-01-01,-0.885030,0.519193,-0.315662
2000-01-02,-0.506911,0.780193,0.561234
2000-01-03,-0.061337,-0.566011,0.130622
2000-01-04,0.435786,-0.680732,-1.291759
2000-01-05,-0.581685,0.531013,-0.317229
2000-01-06,-0.631151,-0.633974,1.818261
2000-01-07,0.602184,-1.384053,-0.210380
2000-01-08,0.516327,0.128852,-0.650490
2000-01-09,1.236779,-0.046858,1.643519
2000-01-10,-0.356763,-1.254073,-0.465999


In [106]:
tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], index=pd.date_range('1/1/2000', periods=10))
tsdf.iloc[3:7] = np.nan
tsdf

Unnamed: 0,A,B,C
2000-01-01,0.780484,-1.090558,-0.025155
2000-01-02,-1.130485,-0.475706,-1.521572
2000-01-03,0.718977,0.477187,-0.676274
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,-0.467322,-1.626374,0.68678
2000-01-09,0.535968,-0.439502,0.933224
2000-01-10,0.587766,1.236755,-0.327513


In [107]:
tsdf.agg(np.sum)

A    1.025387
B   -1.918199
C   -0.930509
dtype: float64

In [108]:
tsdf.agg('sum')

A    1.025387
B   -1.918199
C   -0.930509
dtype: float64

In [109]:
tsdf.sum()

A    1.025387
B   -1.918199
C   -0.930509
dtype: float64

In [110]:
tsdf.A.agg('sum')

1.025387253011075

In [111]:
tsdf.agg(['sum'])

Unnamed: 0,A,B,C
sum,1.025387,-1.918199,-0.930509


In [112]:
tsdf.agg(['sum', 'mean'])

Unnamed: 0,A,B,C
sum,1.025387,-1.918199,-0.930509
mean,0.170898,-0.3197,-0.155085


In [113]:
tsdf.A.agg(['sum', 'mean'])

sum     1.025387
mean    0.170898
Name: A, dtype: float64

In [114]:
tsdf.A.agg(['sum', lambda x: x.mean()])

sum         1.025387
<lambda>    0.170898
Name: A, dtype: float64

In [115]:
def mymean(x):
    return x.mean()

In [116]:
tsdf.A.agg(['sum', mymean])

sum       1.025387
mymean    0.170898
Name: A, dtype: float64

In [117]:
tsdf.agg({'A': 'mean', 'B': 'sum'})

A    0.170898
B   -1.918199
dtype: float64

In [118]:
tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'})

Unnamed: 0,A,B
mean,0.170898,
min,-1.130485,
sum,,-1.918199


In [119]:
mdf = pd.DataFrame({'A': [1, 2, 3],
                    'B': [1., 2., 3.],
                    'C': ['foo', 'bar', 'baz'],
                    'D': pd.date_range('20130101', periods=3)})
mdf.dtypes

A             int64
B           float64
C            object
D    datetime64[ns]
dtype: object

In [120]:
mdf.agg(['min', 'sum'])

Unnamed: 0,A,B,C,D
min,1,1.0,bar,2013-01-01
sum,6,6.0,foobarbaz,NaT


In [121]:
from functools import partial
q_25 = partial(pd.Series.quantile, q=0.25)
q_25.__name__ = '25%'
q_75 = partial(pd.Series.quantile, q=0.75)
q_75.__name__ = '75%'
tsdf.agg(['count', 'mean', 'std', 'min', q_25, 'median', q_75, 'max'])

Unnamed: 0,A,B,C
count,6.0,6.0,6.0
mean,0.170898,-0.3197,-0.155085
std,0.784852,1.039027,0.903182
min,-1.130485,-1.626374,-1.521572
25%,-0.2165,-0.936845,-0.589084
median,0.561867,-0.457604,-0.176334
75%,0.686174,0.248015,0.508796
max,0.780484,1.236755,0.933224


In [122]:
tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], index=pd.date_range('1/1/2000', periods=10))
tsdf.iloc[3:7] = np.nan
tsdf

Unnamed: 0,A,B,C
2000-01-01,-0.187132,-0.34135,-1.283622
2000-01-02,0.799916,-0.12261,-0.237411
2000-01-03,1.184041,-0.936899,-0.645575
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,-1.120066,0.23709,0.026925
2000-01-09,-0.797198,-0.004988,-0.937847
2000-01-10,-0.462077,0.436891,0.278542


In [123]:
tsdf.transform(np.abs)

Unnamed: 0,A,B,C
2000-01-01,0.187132,0.34135,1.283622
2000-01-02,0.799916,0.12261,0.237411
2000-01-03,1.184041,0.936899,0.645575
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.120066,0.23709,0.026925
2000-01-09,0.797198,0.004988,0.937847
2000-01-10,0.462077,0.436891,0.278542


In [124]:
tsdf.transform('abs')

Unnamed: 0,A,B,C
2000-01-01,0.187132,0.34135,1.283622
2000-01-02,0.799916,0.12261,0.237411
2000-01-03,1.184041,0.936899,0.645575
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.120066,0.23709,0.026925
2000-01-09,0.797198,0.004988,0.937847
2000-01-10,0.462077,0.436891,0.278542


In [125]:
tsdf.transform(lambda x: x.abs())

Unnamed: 0,A,B,C
2000-01-01,0.187132,0.34135,1.283622
2000-01-02,0.799916,0.12261,0.237411
2000-01-03,1.184041,0.936899,0.645575
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.120066,0.23709,0.026925
2000-01-09,0.797198,0.004988,0.937847
2000-01-10,0.462077,0.436891,0.278542


In [126]:
np.abs(tsdf)

Unnamed: 0,A,B,C
2000-01-01,0.187132,0.34135,1.283622
2000-01-02,0.799916,0.12261,0.237411
2000-01-03,1.184041,0.936899,0.645575
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.120066,0.23709,0.026925
2000-01-09,0.797198,0.004988,0.937847
2000-01-10,0.462077,0.436891,0.278542


In [127]:
tsdf.A.transform(np.abs)

2000-01-01    0.187132
2000-01-02    0.799916
2000-01-03    1.184041
2000-01-04         NaN
2000-01-05         NaN
2000-01-06         NaN
2000-01-07         NaN
2000-01-08    1.120066
2000-01-09    0.797198
2000-01-10    0.462077
Freq: D, Name: A, dtype: float64

In [128]:
tsdf.transform([np.abs, lambda x: x + 1])

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,absolute,<lambda>,absolute,<lambda>,absolute,<lambda>
2000-01-01,0.187132,0.812868,0.34135,0.65865,1.283622,-0.283622
2000-01-02,0.799916,1.799916,0.12261,0.87739,0.237411,0.762589
2000-01-03,1.184041,2.184041,0.936899,0.063101,0.645575,0.354425
2000-01-04,,,,,,
2000-01-05,,,,,,
2000-01-06,,,,,,
2000-01-07,,,,,,
2000-01-08,1.120066,-0.120066,0.23709,1.23709,0.026925,1.026925
2000-01-09,0.797198,0.202802,0.004988,0.995012,0.937847,0.062153
2000-01-10,0.462077,0.537923,0.436891,1.436891,0.278542,1.278542


In [129]:
tsdf.A.transform([np.abs, lambda x: x + 1])

Unnamed: 0,absolute,<lambda>
2000-01-01,0.187132,0.812868
2000-01-02,0.799916,1.799916
2000-01-03,1.184041,2.184041
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,1.120066,-0.120066
2000-01-09,0.797198,0.202802
2000-01-10,0.462077,0.537923


In [130]:
tsdf.transform({'A': np.abs, 'B': lambda x: x + 1})

Unnamed: 0,A,B
2000-01-01,0.187132,0.65865
2000-01-02,0.799916,0.87739
2000-01-03,1.184041,0.063101
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,1.120066,1.23709
2000-01-09,0.797198,0.995012
2000-01-10,0.462077,1.436891


In [131]:
tsdf.transform({'A': np.abs, 'B': [lambda x: x + 1, 'sqrt']})

  return f(self, *args, **kwargs)


Unnamed: 0_level_0,A,B,B
Unnamed: 0_level_1,absolute,<lambda>,sqrt
2000-01-01,0.187132,0.65865,
2000-01-02,0.799916,0.87739,
2000-01-03,1.184041,0.063101,
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.120066,1.23709,0.486919
2000-01-09,0.797198,0.995012,
2000-01-10,0.462077,1.436891,0.660977


In [132]:
df4 = pd.DataFrame(np.random.randn(4, 3), index=['a', 'b', 'c', 'd'], columns=['one', 'two', 'three'])
df4

Unnamed: 0,one,two,three
a,-0.52391,-0.492182,-0.32929
b,-0.431068,0.559833,1.6522
c,-0.794078,0.015949,0.706268
d,-0.356205,1.56948,2.140819


In [133]:
def f(x):
    return len(str(x))

In [134]:
df4['one'].map(f)

a    18
b    19
c    19
d    19
Name: one, dtype: int64

In [135]:
df4.applymap(f)

Unnamed: 0,one,two,three
a,18,20,19
b,19,18,18
c,19,20,18
d,19,18,18


In [136]:
s = pd.Series(['six', 'seven', 'six', 'seven', 'six'], index=['a', 'b', 'c', 'd', 'e'])
t = pd.Series({'six': 6., 'seven': 7.})

In [137]:
s

a      six
b    seven
c      six
d    seven
e      six
dtype: object

In [138]:
s.map(t)

a    6.0
b    7.0
c    6.0
d    7.0
e    6.0
dtype: float64

### 重新索引和更改标签

In [140]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

a    0.825464
b    0.613113
c    0.771264
d   -0.750357
e    0.490530
dtype: float64

In [141]:
s.reindex(['e', 'b', 'f', 'd'])

e    0.490530
b    0.613113
f         NaN
d   -0.750357
dtype: float64

In [142]:
df

Unnamed: 0,one,three,two
a,-1.158248,,-1.406855
b,-0.702384,0.353842,-0.852675
c,0.515502,1.682529,-0.295702
d,,-1.164221,0.012505


In [143]:
df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one'])

Unnamed: 0,three,two,one
c,1.682529,-0.295702,0.515502
f,,,
b,0.353842,-0.852675,-0.702384


In [144]:
df.reindex(['c', 'f', 'b'], axis='index')

Unnamed: 0,one,three,two
c,0.515502,1.682529,-0.295702
f,,,
b,-0.702384,0.353842,-0.852675


In [145]:
rs = s.reindex(df.index)
rs

a    0.825464
b    0.613113
c    0.771264
d   -0.750357
dtype: float64

In [146]:
rs.index is df.index

True

In [147]:
df.reindex(['c', 'f', 'b'], axis='index')

Unnamed: 0,one,three,two
c,0.515502,1.682529,-0.295702
f,,,
b,-0.702384,0.353842,-0.852675


In [148]:
df.reindex(['three', 'two', 'one'], axis='columns')

Unnamed: 0,three,two,one
a,,-1.406855,-1.158248
b,0.353842,-0.852675,-0.702384
c,1.682529,-0.295702,0.515502
d,-1.164221,0.012505,


In [150]:
df2

Unnamed: 0,col
2,
1,0
0,foo


In [151]:
df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [152]:
df.reindex_like(df2)

Unnamed: 0,col
2,
1,
0,


In [154]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s1 = s[:4]
s2 = s[1:]
s1.align(s2)

(a    0.943123
 b   -0.019113
 c    0.304212
 d    0.229990
 e         NaN
 dtype: float64, a         NaN
 b   -0.019113
 c    0.304212
 d    0.229990
 e    0.385382
 dtype: float64)

In [155]:
s1.align(s2, join='inner')

(b   -0.019113
 c    0.304212
 d    0.229990
 dtype: float64, b   -0.019113
 c    0.304212
 d    0.229990
 dtype: float64)

In [156]:
s1.align(s2, join='left')

(a    0.943123
 b   -0.019113
 c    0.304212
 d    0.229990
 dtype: float64, a         NaN
 b   -0.019113
 c    0.304212
 d    0.229990
 dtype: float64)

In [157]:
df.align(df2, join='inner')

(Empty DataFrame
 Columns: []
 Index: [], Empty DataFrame
 Columns: []
 Index: [])

In [158]:
df.align(df2, join='inner', axis=0)

(Empty DataFrame
 Columns: [one, three, two]
 Index: [], Empty DataFrame
 Columns: [col]
 Index: [])

In [159]:
df.align(df2.iloc[0], axis=1)

(   col       one     three       two
 a  NaN -1.158248       NaN -1.406855
 b  NaN -0.702384  0.353842 -0.852675
 c  NaN  0.515502  1.682529 -0.295702
 d  NaN       NaN -1.164221  0.012505, col      NaN
 one      NaN
 three    NaN
 two      NaN
 Name: 2, dtype: object)

In [160]:
rng = pd.date_range('1/3/2000', periods=8)
ts = pd.Series(np.random.randn(8), index=rng)
ts2 = ts[[0, 3, 6]]
ts

2000-01-03   -1.136500
2000-01-04   -0.203633
2000-01-05    0.670051
2000-01-06    0.490326
2000-01-07    0.418383
2000-01-08    1.478009
2000-01-09    0.101713
2000-01-10    0.675576
Freq: D, dtype: float64

In [161]:
ts2

2000-01-03   -1.136500
2000-01-06    0.490326
2000-01-09    0.101713
dtype: float64

In [162]:
ts2.reindex(ts.index)

2000-01-03   -1.136500
2000-01-04         NaN
2000-01-05         NaN
2000-01-06    0.490326
2000-01-07         NaN
2000-01-08         NaN
2000-01-09    0.101713
2000-01-10         NaN
Freq: D, dtype: float64

In [163]:
ts2.reindex(ts.index, method='ffill')

2000-01-03   -1.136500
2000-01-04   -1.136500
2000-01-05   -1.136500
2000-01-06    0.490326
2000-01-07    0.490326
2000-01-08    0.490326
2000-01-09    0.101713
2000-01-10    0.101713
Freq: D, dtype: float64

In [164]:
ts2.reindex(ts.index, method='bfill')

2000-01-03   -1.136500
2000-01-04    0.490326
2000-01-05    0.490326
2000-01-06    0.490326
2000-01-07    0.101713
2000-01-08    0.101713
2000-01-09    0.101713
2000-01-10         NaN
Freq: D, dtype: float64

In [165]:
ts2.reindex(ts.index, method='nearest')

2000-01-03   -1.136500
2000-01-04   -1.136500
2000-01-05    0.490326
2000-01-06    0.490326
2000-01-07    0.490326
2000-01-08    0.101713
2000-01-09    0.101713
2000-01-10    0.101713
Freq: D, dtype: float64

In [166]:
ts2.reindex(ts.index).fillna(method='ffill')

2000-01-03   -1.136500
2000-01-04   -1.136500
2000-01-05   -1.136500
2000-01-06    0.490326
2000-01-07    0.490326
2000-01-08    0.490326
2000-01-09    0.101713
2000-01-10    0.101713
Freq: D, dtype: float64

In [167]:
ts2.reindex(ts.index, method='ffill', limit=1)

2000-01-03   -1.136500
2000-01-04   -1.136500
2000-01-05         NaN
2000-01-06    0.490326
2000-01-07    0.490326
2000-01-08         NaN
2000-01-09    0.101713
2000-01-10    0.101713
Freq: D, dtype: float64

In [168]:
ts2.reindex(ts.index, method='ffill', tolerance='1 day')

2000-01-03   -1.136500
2000-01-04   -1.136500
2000-01-05         NaN
2000-01-06    0.490326
2000-01-07    0.490326
2000-01-08         NaN
2000-01-09    0.101713
2000-01-10    0.101713
Freq: D, dtype: float64

In [169]:
df

Unnamed: 0,one,three,two
a,-1.158248,,-1.406855
b,-0.702384,0.353842,-0.852675
c,0.515502,1.682529,-0.295702
d,,-1.164221,0.012505


In [170]:
df.drop(['a', 'b'], axis=0)

Unnamed: 0,one,three,two
c,0.515502,1.682529,-0.295702
d,,-1.164221,0.012505


In [171]:
df.drop(['one'], axis=1)

Unnamed: 0,three,two
a,,-1.406855
b,0.353842,-0.852675
c,1.682529,-0.295702
d,-1.164221,0.012505


In [172]:
df.reindex(df.index.difference(['a', 'b']))

Unnamed: 0,one,three,two
c,0.515502,1.682529,-0.295702
d,,-1.164221,0.012505


In [173]:
s

a    0.943123
b   -0.019113
c    0.304212
d    0.229990
e    0.385382
dtype: float64

In [174]:
s.rename(str.upper)

A    0.943123
B   -0.019113
C    0.304212
D    0.229990
E    0.385382
dtype: float64

In [175]:
df.rename(columns={'one': 'foo', 'two': 'bar'}, index={'a': 'apple', 'b': 'banana', 'd': 'durian'})

Unnamed: 0,foo,three,bar
apple,-1.158248,,-1.406855
banana,-0.702384,0.353842,-0.852675
c,0.515502,1.682529,-0.295702
durian,,-1.164221,0.012505


In [176]:
df.rename({'one': 'foo', 'two': 'bar'}, axis='columns')

Unnamed: 0,foo,three,bar
a,-1.158248,,-1.406855
b,-0.702384,0.353842,-0.852675
c,0.515502,1.682529,-0.295702
d,,-1.164221,0.012505


In [177]:
df.rename({'a': 'apple', 'b': 'banana', 'd': 'durian'}, axis='index')

Unnamed: 0,one,three,two
apple,-1.158248,,-1.406855
banana,-0.702384,0.353842,-0.852675
c,0.515502,1.682529,-0.295702
durian,,-1.164221,0.012505


In [178]:
s.rename('scalar-name')

a    0.943123
b   -0.019113
c    0.304212
d    0.229990
e    0.385382
Name: scalar-name, dtype: float64

In [181]:
df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6],
                   'y': [10, 20, 30, 40, 50, 60]},
                    index=pd.MultiIndex.from_product([['a', 'b', 'c'],[1,2]], names=['let', 'num']))

In [182]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
let,num,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,10
a,2,2,20
b,1,3,30
b,2,4,40
c,1,5,50
c,2,6,60


In [183]:
df.rename_axis(index={'let': 'abc'})

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
abc,num,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,10
a,2,2,20
b,1,3,30
b,2,4,40
c,1,5,50
c,2,6,60


In [184]:
df.rename_axis(index=str.upper)

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
LET,NUM,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,10
a,2,2,20
b,1,3,30
b,2,4,40
c,1,5,50
c,2,6,60


### iteration

In [185]:
df = pd.DataFrame({'col1': np.random.randn(3),
                   'col2': np.random.randn(3)}, index=['a', 'b', 'c'])
for col in df:
    print(col)

col1
col2


In [187]:
for item, frame in wp.iteritems():
    print(item)
    print(frame)

Item1
                   A         B         C         D
2000-01-01  0.048042 -1.190397  1.149368 -1.384862
2000-01-02  0.151154 -1.796917 -0.254995  2.883547
2000-01-03 -0.526786  0.400227  0.415145 -0.401234
2000-01-04  1.327953  1.220325  0.222846  0.691284
2000-01-05 -1.024748 -2.573451  0.339804  0.113854
Item2
                   A         B         C         D
2000-01-01 -1.348633  1.537692  0.200129 -0.545853
2000-01-02 -0.023530  0.309226  0.173042 -0.499830
2000-01-03 -0.317651 -0.473876 -0.979277  0.372593
2000-01-04 -0.604792  1.774118  0.711612  0.123457
2000-01-05  0.197468 -0.687475 -0.642911  0.029090


In [188]:
for row_index, row in df.iterrows():
    print(row_index, row, sep='\n')

a
col1    0.695164
col2   -0.791288
Name: a, dtype: float64
b
col1   -0.201901
col2    0.485342
Name: b, dtype: float64
c
col1    1.292131
col2    0.382135
Name: c, dtype: float64


In [189]:
df_orig = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
df_orig.dtypes

int        int64
float    float64
dtype: object

In [190]:
row = next(df_orig.iterrows())[1]
row

int      1.0
float    1.5
Name: 0, dtype: float64

In [191]:
row['int'].dtype

dtype('float64')

In [192]:
df_orig['int'].dtype

dtype('int64')

In [193]:
df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
print(df2)

   x  y
0  1  4
1  2  5
2  3  6


In [194]:
print(df2.T)

   0  1  2
x  1  2  3
y  4  5  6


In [197]:
df2_t = pd.DataFrame({idx: values for idx, values in df2.iterrows()})
print(df2_t)

   0  1  2
x  1  2  3
y  4  5  6


In [198]:
for row in df.itertuples():
    print(row)

Pandas(Index='a', col1=0.6951642062227844, col2=-0.7912876576850234)
Pandas(Index='b', col1=-0.20190108885621927, col2=0.4853423714185732)
Pandas(Index='c', col1=1.2921309792784654, col2=0.382135202786473)


### .dt访问器

In [199]:
s = pd.Series(pd.date_range('20130101 09:10:12', periods=4))
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [200]:
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [201]:
s.dt.second

0    12
1    12
2    12
3    12
dtype: int64

In [202]:
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [203]:
s[s.dt.day == 2]

1   2013-01-02 09:10:12
dtype: datetime64[ns]

In [204]:
stz = s.dt.tz_localize('US/Eastern')
stz

0   2013-01-01 09:10:12-05:00
1   2013-01-02 09:10:12-05:00
2   2013-01-03 09:10:12-05:00
3   2013-01-04 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [205]:
stz.dt.tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

In [206]:
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

0   2013-01-01 04:10:12-05:00
1   2013-01-02 04:10:12-05:00
2   2013-01-03 04:10:12-05:00
3   2013-01-04 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [207]:
s = pd.Series(pd.date_range('20130101', periods=4))
s

0   2013-01-01
1   2013-01-02
2   2013-01-03
3   2013-01-04
dtype: datetime64[ns]

In [208]:
s.dt.strftime('%Y/%m/%d')

0    2013/01/01
1    2013/01/02
2    2013/01/03
3    2013/01/04
dtype: object

In [209]:
s = pd.Series(pd.period_range('20130101', periods=4))
s

0    2013-01-01
1    2013-01-02
2    2013-01-03
3    2013-01-04
dtype: period[D]

In [210]:
s.dt.strftime('%Y/%m/%d')

0    2013/01/01
1    2013/01/02
2    2013/01/03
3    2013/01/04
dtype: object

In [211]:
s.dt.year

0    2013
1    2013
2    2013
3    2013
dtype: int64

In [212]:
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [213]:
s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s'))
s

0   1 days 00:00:05
1   1 days 00:00:06
2   1 days 00:00:07
3   1 days 00:00:08
dtype: timedelta64[ns]

In [214]:
s.dt.days

0    1
1    1
2    1
3    1
dtype: int64

In [215]:
s.dt.seconds

0    5
1    6
2    7
3    8
dtype: int64

In [216]:
s.dt.components

Unnamed: 0,days,hours,minutes,seconds,milliseconds,microseconds,nanoseconds
0,1,0,0,5,0,0,0
1,1,0,0,6,0,0,0
2,1,0,0,7,0,0,0
3,1,0,0,8,0,0,0


### 矢量化字符串方法

In [217]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

### Sorting

In [218]:
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
    'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
    'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])
})

In [219]:
unsorted_df = df.reindex(index=['a', 'b', 'c', 'd'],
                         columns=['three', 'two', 'one'])

In [220]:
unsorted_df

Unnamed: 0,three,two,one
a,,0.374403,-0.542528
b,-0.355575,-0.19807,-0.415187
c,-0.565975,0.710937,-0.684563
d,-0.660764,0.093659,


In [221]:
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,0.374403,-0.542528
b,-0.355575,-0.19807,-0.415187
c,-0.565975,0.710937,-0.684563
d,-0.660764,0.093659,


In [222]:
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,-0.660764,0.093659,
c,-0.565975,0.710937,-0.684563
b,-0.355575,-0.19807,-0.415187
a,,0.374403,-0.542528


In [223]:
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,-0.542528,,0.374403
b,-0.415187,-0.355575,-0.19807
c,-0.684563,-0.565975,0.710937
d,,-0.660764,0.093659


In [224]:
unsorted_df['three'].sort_index()

a         NaN
b   -0.355575
c   -0.565975
d   -0.660764
Name: three, dtype: float64

In [225]:
df1 = pd.DataFrame({
    'one': [2, 1, 1, 1],
    'two': [1, 3, 2, 4],
    'three': [5, 4, 3, 2]
})

In [226]:
df1.sort_values(by='two')

Unnamed: 0,one,three,two
0,2,5,1
2,1,3,2
1,1,4,3
3,1,2,4


In [227]:
df1[['one', 'two', 'three']].sort_values(by=['one', 'two'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [229]:
s[2] = np.nan
s.sort_values()

0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
2     NaN
5     NaN
dtype: object

In [230]:
s.sort_values(na_position='first')

2     NaN
5     NaN
0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
dtype: object

In [231]:
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2),
                                 ('b', 1), ('b', 1), ('b', 1)])
idx.names = ['first', 'second']
df_multi = pd.DataFrame({
    'A': np.arange(6, 0, -1)
}, index=idx)

In [232]:
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,1,3
b,1,2
b,1,1


In [233]:
df_multi.sort_values(by=['second', 'A'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
b,1,1
b,1,2
b,1,3
a,1,6
a,2,4
a,2,5


In [234]:
ser = pd.Series([1, 2, 3])
ser.searchsorted([0, 3])

array([0, 2])

In [235]:
ser.searchsorted([0, 4])

array([0, 3])

In [236]:
ser.searchsorted([1, 3], side='right')

array([1, 3])

In [237]:
ser.searchsorted([1, 3], side='left')

array([0, 2])

In [238]:
ser = pd.Series([3, 1, 2])

In [239]:
ser.searchsorted([0, 3], sorter=np.argsort(ser))

array([0, 2])

In [240]:
s = pd.Series(np.random.permutation(10))

In [241]:
s

0    7
1    4
2    5
3    0
4    6
5    2
6    8
7    3
8    9
9    1
dtype: int64

In [242]:
s.sort_values()

3    0
9    1
5    2
7    3
1    4
2    5
4    6
0    7
6    8
8    9
dtype: int64

In [243]:
s.nsmallest()

3    0
9    1
5    2
7    3
1    4
dtype: int64

In [244]:
s.nlargest(3)

8    9
6    8
0    7
dtype: int64

In [245]:
df = pd.DataFrame({
    'a': [-2, -1, 1, 10, 8, 11, -1],
    'b': list('abdceff'),
    'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]
})

In [246]:
df.nlargest(3, 'a')

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,


In [248]:
df.nlargest(5, ['a', 'c'])

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,
2,1,d,4.0
6,-1,f,4.0


In [249]:
df.nsmallest(3, 'a')

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0


In [250]:
df.nsmallest(5, ['a', 'c'])

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0
2,1,d,4.0
4,8,e,


In [251]:
df1.columns = pd.MultiIndex.from_tuples([
    ('a', 'one'),
    ('a', 'two'),
    ('b', 'three')
])
df1.sort_values(by=('a', 'two'))

Unnamed: 0_level_0,a,a,b
Unnamed: 0_level_1,one,two,three
3,1,2,4
2,1,3,2
1,1,4,3
0,2,5,1


### copying

### dtypes

In [253]:
dft = pd.DataFrame({
    'A': np.random.rand(3),
    'B': 1,
    'C': 'foo',
    'D': pd.Timestamp('20010102'),
    'E': pd.Series([1.0] * 3).astype('float32'),
    'F': False,
    'G': pd.Series([1] * 2, dtype='int8')
})

In [254]:
dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.101013,1,foo,2001-01-02,1.0,False,1.0
1,0.834083,1,foo,2001-01-02,1.0,False,1.0
2,0.346879,1,foo,2001-01-02,1.0,False,


In [255]:
dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G           float64
dtype: object

In [256]:
dft['A'].dtype

dtype('float64')

In [257]:
pd.Series([1,2,3,4,5,6.])

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
dtype: float64

In [258]:
pd.Series([1, 2, 3, 6., 'foo'])

0      1
1      2
2      3
3      6
4    foo
dtype: object

In [259]:
dft.get_dtype_counts()

bool              1
datetime64[ns]    1
float32           1
float64           2
int64             1
object            1
dtype: int64

In [260]:
df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')

In [261]:
df1

Unnamed: 0,A
0,-0.916161
1,1.793746
2,1.79528
3,0.001239
4,-1.238104
5,0.331469
6,-0.281921
7,-0.736581


In [262]:
df1.dtypes

A    float32
dtype: object

In [264]:
df2 = pd.DataFrame({
    'A': pd.Series(np.random.randn(8), dtype='float16'),
    'B': pd.Series(np.random.randn(8)),
    'C': pd.Series(np.array(np.random.randn(8), dtype='uint8'))
})

In [265]:
df2

Unnamed: 0,A,B,C
0,-0.67041,-0.37821,0
1,-0.105896,-1.571246,1
2,0.713379,2.005863,0
3,-0.72168,0.095385,0
4,1.213867,0.783884,0
5,1.521484,1.258749,255
6,1.924805,0.672811,255
7,-0.556152,-0.257006,0


In [266]:
df2.dtypes

A    float16
B    float64
C      uint8
dtype: object

In [267]:
pd.DataFrame([1, 2], columns=['a']).dtypes

a    int64
dtype: object

In [269]:
pd.DataFrame({'a': [1, 2]}).dtypes

a    int64
dtype: object

In [270]:
pd.DataFrame({'a': 1}, index=list(range(2))).dtypes

a    int64
dtype: object

In [272]:
frame = pd.DataFrame(np.array([1, 2]))
frame

Unnamed: 0,0
0,1
1,2


In [273]:
df3 = df1.reindex_like(df2).fillna(value=0.0) + df2

In [274]:
df3

Unnamed: 0,A,B,C
0,-1.586571,-0.37821,0.0
1,1.68785,-1.571246,1.0
2,2.508658,2.005863,0.0
3,-0.720441,0.095385,0.0
4,-0.024237,0.783884,0.0
5,1.852954,1.258749,255.0
6,1.642884,0.672811,255.0
7,-1.292733,-0.257006,0.0


In [275]:
df3.dtypes

A    float32
B    float64
C    float64
dtype: object

In [276]:
df3.to_numpy().dtype

dtype('float64')

In [277]:
df3

Unnamed: 0,A,B,C
0,-1.586571,-0.37821,0.0
1,1.68785,-1.571246,1.0
2,2.508658,2.005863,0.0
3,-0.720441,0.095385,0.0
4,-0.024237,0.783884,0.0
5,1.852954,1.258749,255.0
6,1.642884,0.672811,255.0
7,-1.292733,-0.257006,0.0


In [278]:
df3.dtypes

A    float32
B    float64
C    float64
dtype: object

In [279]:
df3.astype('float32').dtypes

A    float32
B    float32
C    float32
dtype: object

In [280]:
dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
dft[['a', 'b']] = dft[['a', 'b']].astype(np.uint8)

In [281]:
dft

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


In [282]:
dft.dtypes

a    int64
b    int64
c    int64
dtype: object

In [283]:
import datetime

In [288]:
df = pd.DataFrame([
    [1,2],
    ['a', 'b'],
    [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]
    ])
df = df.T

In [289]:
df

Unnamed: 0,0,1,2
0,1,a,2016-03-02 00:00:00
1,2,b,2016-03-02 00:00:00


In [290]:
df.dtypes

0    object
1    object
2    object
dtype: object

In [291]:
df.infer_objects().dtypes

0             int64
1            object
2    datetime64[ns]
dtype: object

In [292]:
m = ['1.1', 2, 3]
pd.to_numeric(m)

array([1.1, 2. , 3. ])

In [293]:
import datetime
m = ['2016-07-09', datetime.datetime(2016, 3, 2)]
pd.to_datetime(m)

DatetimeIndex(['2016-07-09', '2016-03-02'], dtype='datetime64[ns]', freq=None)

In [299]:
m = ['5us', pd.Timedelta('1day')]
pd.to_timedelta(m)

TimedeltaIndex(['0 days 00:00:00.000005', '1 days 00:00:00'], dtype='timedelta64[ns]', freq=None)

In [300]:
m = ['apple', datetime.datetime(2016, 3, 2)]
pd.to_datetime(m, errors='ignore')

Index(['apple', 2016-03-02 00:00:00], dtype='object')

In [301]:
m = ['apple', 2, 3]
pd.to_numeric(m, errors='ignore')

array(['apple', 2, 3], dtype=object)

In [302]:
m = ['apple', pd.Timedelta('1day')]
pd.to_timedelta(m, errors='ignore')

array(['apple', Timedelta('1 days 00:00:00')], dtype=object)

In [303]:
m = ['1', 2, 3]
pd.to_numeric(m, downcast='integer')

array([1, 2, 3], dtype=int8)

In [304]:
pd.to_numeric(m, downcast='signed')

array([1, 2, 3], dtype=int8)

In [305]:
pd.to_numeric(m, downcast='unsigned')

array([1, 2, 3], dtype=uint8)

In [306]:
pd.to_numeric(m, downcast='float')

array([1., 2., 3.], dtype=float32)

In [311]:
df = pd.DataFrame([['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O')
df

Unnamed: 0,0,1
0,2016-07-09,2016-03-02 00:00:00
1,2016-07-09,2016-03-02 00:00:00


In [312]:
df.apply(pd.to_datetime)

Unnamed: 0,0,1
0,2016-07-09,2016-03-02
1,2016-07-09,2016-03-02


In [314]:
df = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O')
df

Unnamed: 0,0,1,2
0,1.1,2,3
1,1.1,2,3


In [315]:
df.apply(pd.to_numeric)

Unnamed: 0,0,1,2
0,1.1,2,3
1,1.1,2,3


In [317]:
df = pd.DataFrame([['5us', pd.Timedelta('1day')]] * 2, dtype='O')
df

Unnamed: 0,0,1
0,5us,1 days 00:00:00
1,5us,1 days 00:00:00


In [318]:
df.apply(pd.to_timedelta)

Unnamed: 0,0,1
0,00:00:00.000005,1 days
1,00:00:00.000005,1 days


In [320]:
dfi = df3.astype('int32')
dfi['E'] = 1

In [321]:
dfi

Unnamed: 0,A,B,C,E
0,-1,0,0,1
1,1,-1,1,1
2,2,2,0,1
3,0,0,0,1
4,0,0,0,1
5,1,1,255,1
6,1,0,255,1
7,-1,0,0,1


In [322]:
dfi.dtypes

A    int32
B    int32
C    int32
E    int64
dtype: object

In [323]:
casted = dfi[dfi > 0]
casted

Unnamed: 0,A,B,C,E
0,,,,1
1,1.0,,1.0,1
2,2.0,2.0,,1
3,,,,1
4,,,,1
5,1.0,1.0,255.0,1
6,1.0,,255.0,1
7,,,,1


In [324]:
casted.dtypes

A    float64
B    float64
C    float64
E      int64
dtype: object

In [325]:
dfa = df3.copy()

In [326]:
dfa['A'] = dfa['A'].astype(float)

In [327]:
dfa.dtypes

A    float64
B    float64
C    float64
dtype: object

In [328]:
casted = dfa[df2 > 0]

In [329]:
casted

Unnamed: 0,A,B,C
0,,,
1,,,1.0
2,2.508658,2.005863,
3,,0.095385,
4,-0.024237,0.783884,
5,1.852954,1.258749,255.0
6,1.642884,0.672811,255.0
7,,,


In [330]:
casted.dtypes

A    float64
B    float64
C    float64
dtype: object

In [331]:
df = pd.DataFrame({'string': list('abc'),
   .....:                    'int64': list(range(1, 4)),
   .....:                    'uint8': np.arange(3, 6).astype('u1'),
   .....:                    'float64': np.arange(4.0, 7.0),
   .....:                    'bool1': [True, False, True],
   .....:                    'bool2': [False, True, False],
   .....:                    'dates': pd.date_range('now', periods=3),
   .....:                    'category': pd.Series(list("ABC")).astype('category')})

In [332]:
df['tdeltas'] = df.dates.diff()
df['uint64'] = np.arange(3, 6).astype('u8')
df['other_dates'] = pd.date_range('20130101', periods=3)
df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern')

In [333]:
df

Unnamed: 0,bool1,bool2,category,dates,float64,int64,string,uint8,tdeltas,uint64,other_dates,tz_aware_dates
0,True,False,A,2019-05-14 15:09:52.739860,4.0,1,a,3,NaT,3,2013-01-01,2013-01-01 00:00:00-05:00
1,False,True,B,2019-05-15 15:09:52.739860,5.0,2,b,4,1 days,4,2013-01-02,2013-01-02 00:00:00-05:00
2,True,False,C,2019-05-16 15:09:52.739860,6.0,3,c,5,1 days,5,2013-01-03,2013-01-03 00:00:00-05:00


In [334]:
df.dtypes

bool1                                   bool
bool2                                   bool
category                            category
dates                         datetime64[ns]
float64                              float64
int64                                  int64
string                                object
uint8                                  uint8
tdeltas                      timedelta64[ns]
uint64                                uint64
other_dates                   datetime64[ns]
tz_aware_dates    datetime64[ns, US/Eastern]
dtype: object

In [336]:
df.select_dtypes(include=['bool'])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


In [337]:
df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger'])

Unnamed: 0,bool1,bool2,float64,int64,tdeltas
0,True,False,4.0,1,NaT
1,False,True,5.0,2,1 days
2,True,False,6.0,3,1 days


In [338]:
df.select_dtypes(include=['object'])

Unnamed: 0,string
0,a
1,b
2,c


In [343]:
def subdtypes(dtype):
    subs = dtype.__subclasses__()
    if not subs:
        return dtype
    return [dtype, [subdtypes(dt) for dt in subs]]

In [344]:
subtypes(np.generic)

[numpy.generic,
 [[numpy.number,
   [[numpy.integer,
     [[numpy.signedinteger,
       [numpy.int8,
        numpy.int16,
        numpy.int32,
        numpy.int64,
        numpy.int64,
        numpy.timedelta64]],
      [numpy.unsignedinteger,
       [numpy.uint8,
        numpy.uint16,
        numpy.uint64,
        numpy.uint64,
        numpy.uint32]]]],
    [numpy.inexact,
     [[numpy.floating,
       [numpy.float16, numpy.float32, numpy.float128, numpy.float64]],
      [numpy.complexfloating,
       [numpy.complex64, numpy.complex128, numpy.complex256]]]]]],
  [numpy.flexible,
   [[numpy.character, [numpy.bytes_, numpy.str_]],
    [numpy.void, [numpy.record]]]],
  numpy.object_,
  numpy.datetime64,
  numpy.bool_]]