# Pandas Documentation on Essential Basic Functionality

In this notebook, you will work through the Pandas documentation on DataFrames.

## Imports

In [1]:
import numpy as np
import pandas as pd

## Pandas essential basic functionality

In this notebook, you are going to learn how to use Pandas by typing the code from the Pandas documentation into this notebook.

* Go to the Pandas [Essential Basic Functionality](http://pandas.pydata.org/pandas-docs/stable/basics.html#essential-basic-functionality).
* Type all of the code from that section of the documentation into this notebook and get it working.
* **To learn this API well, you must type the code rather than copy and pasting it**.
* Create a new cell in this section for each `In[]` prompt in the documentation.
* Ignore the cells in the **Grading** section below.
* No Markdown comments are needed.
* Skip the following sub-sections:
  - Tablewise Function Application
  - Applying with a Panel

In [2]:
index = pd.date_range('1/1/2000', periods=8)

In [3]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [4]:
df = pd.DataFrame(np.random.randn(8, 3), index=index,
                   columns=['A', 'B', 'C'])

In [5]:
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
               major_axis=pd.date_range('1/1/2000', periods=5),
               minor_axis=['A', 'B', 'C', 'D'])

In [6]:
# Head and Tail
long_series = pd.Series(np.random.randn(1000))

In [7]:
long_series.head()

0    1.007282
1    1.791732
2   -0.588209
3   -1.297478
4   -0.504481
dtype: float64

In [8]:
long_series.tail(3)

997    1.123623
998    1.092019
999    0.013054
dtype: float64

In [9]:
# Attributes and the raw ndarray(s)
df[:2]

Unnamed: 0,A,B,C
2000-01-01,-0.406569,1.008304,-1.195713
2000-01-02,0.040278,-1.209185,0.672758


In [10]:
df.columns = [x.lower() for x in df.columns]

In [11]:
df

Unnamed: 0,a,b,c
2000-01-01,-0.406569,1.008304,-1.195713
2000-01-02,0.040278,-1.209185,0.672758
2000-01-03,-1.213263,0.043566,0.205806
2000-01-04,-0.524375,-1.056861,-0.526648
2000-01-05,-1.785636,0.006293,-0.136739
2000-01-06,0.317605,-1.362336,-0.639666
2000-01-07,0.32729,-0.960951,-0.407948
2000-01-08,-0.817593,-0.001213,0.431291


In [12]:
s.values

array([-0.93761274,  0.48239152, -0.89723692,  0.39659004, -1.58131646])

In [13]:
df.values

array([[ -4.06569077e-01,   1.00830361e+00,  -1.19571293e+00],
       [  4.02776142e-02,  -1.20918459e+00,   6.72758092e-01],
       [ -1.21326295e+00,   4.35659865e-02,   2.05805565e-01],
       [ -5.24375187e-01,  -1.05686056e+00,  -5.26648477e-01],
       [ -1.78563589e+00,   6.29315724e-03,  -1.36739225e-01],
       [  3.17605165e-01,  -1.36233565e+00,  -6.39666489e-01],
       [  3.27290478e-01,  -9.60951388e-01,  -4.07948164e-01],
       [ -8.17592626e-01,  -1.21319405e-03,   4.31291094e-01]])

In [14]:
wp.values

array([[[ 0.6185533 ,  0.79508584,  0.77416437,  0.38271012],
        [ 0.44550173, -0.28173788, -1.20638538,  0.62883017],
        [ 1.50969406,  0.79859473,  0.55982914, -0.80534882],
        [-0.64765053, -1.07752164,  0.25485545, -0.96224105],
        [-0.37993041, -1.08680739,  1.58622185, -0.11733421]],

       [[-0.55350429, -0.48792384, -1.89073818,  1.67311822],
        [-1.31201539,  1.31273259,  1.21552832,  1.17460155],
        [-0.00619637,  0.57875409,  0.22643859,  1.09387442],
        [-1.54091142, -0.02515883,  0.64643328,  1.22300603],
        [ 0.61654038,  1.1911688 ,  0.75604101, -1.17623843]]])

In [15]:
# Matching / broadcasting behavior
df = pd.DataFrame({ 'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                    'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                    'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})

In [16]:
df

Unnamed: 0,one,three,two
a,0.35063,,0.295866
b,-0.39494,0.044568,-0.943682
c,0.90247,0.148269,-0.865695
d,,-0.16224,0.26923


In [17]:
row = df.ix[1]

In [18]:
column = df['two']

In [19]:
df.sub(row, axis='columns')

Unnamed: 0,one,three,two
a,0.74557,,1.239548
b,0.0,0.0,0.0
c,1.29741,0.103701,0.077987
d,,-0.206808,1.212912


In [20]:
df.sub(row, axis=1)

Unnamed: 0,one,three,two
a,0.74557,,1.239548
b,0.0,0.0,0.0
c,1.29741,0.103701,0.077987
d,,-0.206808,1.212912


In [21]:
df.sub(column, axis='index')

Unnamed: 0,one,three,two
a,0.054764,,0
b,0.548743,0.988251,0
c,1.768166,1.013964,0
d,,-0.431469,0


In [22]:
df.sub(column, axis=0)

Unnamed: 0,one,three,two
a,0.054764,,0
b,0.548743,0.988251,0
c,1.768166,1.013964,0
d,,-0.431469,0


In [23]:
dfmi = df.copy()

In [24]:
dfmi.index = pd.MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')],
                                        names=['first','second'])

In [25]:
dfmi.sub(column, axis=0, level='second')

Unnamed: 0_level_0,Unnamed: 1_level_0,one,three,two
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,0.054764,,0.0
1,b,0.548743,0.988251,0.0
1,c,1.768166,1.013964,0.0
2,a,,-0.458106,-0.026636


In [26]:
major_mean = wp.mean(axis='major')

In [27]:
major_mean

Unnamed: 0,Item1,Item2
A,0.309234,-0.559217
B,-0.170477,0.513915
C,0.393737,0.190741
D,-0.174677,0.797672


In [28]:
wp.sub(major_mean, axis='major')

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00
Minor_axis axis: A to D

In [29]:
# Missing data / operations with fill values
df

Unnamed: 0,one,three,two
a,0.35063,,0.295866
b,-0.39494,0.044568,-0.943682
c,0.90247,0.148269,-0.865695
d,,-0.16224,0.26923


In [30]:
df2 = df

In [31]:
df2

Unnamed: 0,one,three,two
a,0.35063,,0.295866
b,-0.39494,0.044568,-0.943682
c,0.90247,0.148269,-0.865695
d,,-0.16224,0.26923


In [32]:
df + df2

Unnamed: 0,one,three,two
a,0.70126,,0.591732
b,-0.789879,0.089137,-1.887365
c,1.804941,0.296538,-1.731391
d,,-0.324479,0.538459


In [33]:
df.add(df2, fill_value=0)

Unnamed: 0,one,three,two
a,0.70126,,0.591732
b,-0.789879,0.089137,-1.887365
c,1.804941,0.296538,-1.731391
d,,-0.324479,0.538459


In [34]:
# Flexible Comparisons
df.gt(df2)

Unnamed: 0,one,three,two
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [35]:
df2.ne(df)

Unnamed: 0,one,three,two
a,False,True,False
b,False,False,False
c,False,False,False
d,True,False,False


In [36]:
# Boolean Reductions
(df > 0).all()

one      False
three    False
two      False
dtype: bool

In [37]:
(df > 0).any()

one      True
three    True
two      True
dtype: bool

In [38]:
(df > 0).any().any()

True

In [39]:
df.empty

False

In [40]:
pd.DataFrame(columns=list('ABC')).empty

True

In [41]:
pd.Series([True]).bool()

True

In [42]:
pd.Series([False]).bool()

False

In [43]:
pd.DataFrame([[True]]).bool()

True

In [44]:
pd.DataFrame([[False]]).bool()

False

In [45]:
df+df == df*2

Unnamed: 0,one,three,two
a,True,False,True
b,True,True,True
c,True,True,True
d,False,True,True


In [46]:
(df+df == df*2).all()

one      False
three    False
two       True
dtype: bool

In [47]:
np.nan == np.nan

False

In [48]:
(df+df).equals(df*2)

True

In [49]:
df1 = pd.DataFrame({'col':['foo', 0, np.nan]})

In [50]:
df2 = pd.DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0])

In [51]:
df1.equals(df2)

False

In [52]:
df1.equals(df2.sort_index())

True

In [53]:
# Comparing array-like objects
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [54]:
pd.Index(['foo', 'bar', 'baz']) == 'foo'

array([ True, False, False], dtype=bool)

In [55]:
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [56]:
pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [57]:
np.array([1, 2, 3]) == np.array([2])

array([False,  True, False], dtype=bool)

In [58]:
np.array([1, 2, 3]) == np.array([1, 2])

  if __name__ == '__main__':


False

In [59]:
# Combining overlapping data sets
df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
                    'B' : [np.nan, 2., 3., np.nan, 6.]})

In [60]:
df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
                    'B' : [np.nan, np.nan, 3., 4., 6., 8.]})

In [61]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [62]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [63]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1,
1,2,2.0
2,3,3.0
3,5,4.0
4,3,6.0
5,7,8.0


In [64]:
# General DataFrame Combine
combiner = lambda x, y: np.where(pd.isnull(x), y, x)

In [65]:
df1.combine(df2, combiner)

Unnamed: 0,A,B
0,1,
1,2,2.0
2,3,3.0
3,5,4.0
4,3,6.0
5,7,8.0


In [66]:
# Descriptive statistics
df

Unnamed: 0,one,three,two
a,0.35063,,0.295866
b,-0.39494,0.044568,-0.943682
c,0.90247,0.148269,-0.865695
d,,-0.16224,0.26923


In [67]:
df.mean(0)

one      0.286054
three    0.010199
two     -0.311071
dtype: float64

In [68]:
df.mean(1)

a    0.323248
b   -0.431351
c    0.061681
d    0.053495
dtype: float64

In [69]:
df.sum(0, skipna=False)

one           NaN
three         NaN
two     -1.244282
dtype: float64

In [70]:
df.sum(axis=1, skipna=True)

a    0.646496
b   -1.294054
c    0.185044
d    0.106990
dtype: float64

In [71]:
ts_stand = (df - df.mean()) / df.std()

In [72]:
ts_stand.std()

one      1
three    1
two      1
dtype: float64

In [73]:
xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)

In [74]:
xs_stand.std(1)

a    1
b    1
c    1
d    1
dtype: float64

In [75]:
df.cumsum()

Unnamed: 0,one,three,two
a,0.35063,,0.295866
b,-0.044309,0.044568,-0.647816
c,0.858161,0.192838,-1.513512
d,,0.030598,-1.244282


In [76]:
np.mean(df['one'])

0.28605363610209505

In [77]:
np.mean(df['one'].values)

nan

In [78]:
series = pd.Series(np.random.randn(500))

In [79]:
series[20:500] = np.nan

In [80]:
series[10:20]  = 5

In [81]:
series.nunique()

11

In [82]:
# Summarizing data: describe
series = pd.Series(np.random.randn(1000))

In [83]:
series[::2] = np.nan

In [84]:
series[::2] = np.nan

In [85]:
frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])

In [86]:
frame.ix[::2] = np.nan

In [87]:
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,-0.081432,0.008728,-0.062403,0.047038,-0.064781
std,0.969924,1.027523,1.025435,1.018291,1.024505
min,-2.650648,-3.052942,-3.043583,-3.448384,-2.784825
25%,-0.744477,-0.619958,-0.731273,-0.685602,-0.787611
50%,-0.079619,0.000487,-0.094522,0.101878,-0.064818
75%,0.546256,0.660236,0.633699,0.742916,0.640715
max,2.776508,2.971246,3.120496,2.52405,3.150353


In [88]:
series.describe(percentiles=[.05, .25, .75, .95])

count    500.000000
mean       0.016607
std        1.035755
min       -2.866843
5%        -1.570856
25%       -0.727958
50%       -0.002712
75%        0.674244
95%        1.701096
max        3.345164
dtype: float64

In [89]:
s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])

In [90]:
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [91]:
frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)})

In [92]:
frame.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [93]:
frame.describe(include=['object'])

Unnamed: 0,a
count,4
unique,2
top,No
freq,2


In [94]:
frame.describe(include=['number'])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [95]:
frame.describe(include='all')

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,No,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


In [96]:
# Index of Min/Max Values
s1 = pd.Series(np.random.randn(5))

In [97]:
s1

0   -0.755452
1   -0.346534
2   -1.698153
3    0.768436
4   -0.278066
dtype: float64

In [98]:
s1.idxmin(), s1.idxmax()

(2, 3)

In [99]:
df1 = pd.DataFrame(np.random.randn(5,3), columns=['A','B','C'])


In [100]:
df1

Unnamed: 0,A,B,C
0,-0.863096,-0.498235,0.069294
1,1.156667,-0.52437,0.68505
2,1.315697,-0.301356,1.231122
3,-0.073582,-1.044886,-1.303257
4,-0.025669,-0.188136,0.105554


In [101]:
df1.idxmin(axis=0)

A    0
B    3
C    3
dtype: int64

In [102]:
df1.idxmax(axis=1)

0    C
1    A
2    A
3    A
4    C
dtype: object

In [103]:
df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba'))

In [104]:
df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [105]:
df3['A'].idxmin()

'd'

In [106]:
# Value counts (histogramming) / Mode
data = np.random.randint(0, 7, size=50)

In [107]:
data

array([4, 3, 6, 2, 1, 1, 4, 0, 3, 5, 6, 4, 3, 4, 0, 3, 4, 2, 3, 6, 2, 1, 4,
       3, 5, 0, 0, 6, 5, 4, 5, 1, 6, 5, 0, 6, 6, 6, 1, 5, 0, 3, 2, 2, 6, 5,
       1, 1, 3, 5])

In [108]:
s = pd.Series(data)

In [109]:
s.value_counts()

6    9
5    8
3    8
4    7
1    7
0    6
2    5
dtype: int64

In [110]:
pd.value_counts(data)

6    9
5    8
3    8
4    7
1    7
0    6
2    5
dtype: int64

In [111]:
s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])

In [112]:
s5.mode()


0    3
1    7
dtype: int64

In [113]:
df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
                    "B": np.random.randint(-10, 15, size=50)})

In [114]:
df5.mode()

Unnamed: 0,A,B
0,0,7


In [115]:
# Discretization and quantiling
arr = np.random.randn(20)

In [116]:
factor = pd.cut(arr, 4)

In [117]:
factor

[(-0.312, 0.682], (-0.312, 0.682], (-1.307, -0.312], (-0.312, 0.682], (-1.307, -0.312], ..., (-0.312, 0.682], (-0.312, 0.682], (-2.305, -1.307], (-0.312, 0.682], (-0.312, 0.682]]
Length: 20
Categories (4, object): [(-2.305, -1.307] < (-1.307, -0.312] < (-0.312, 0.682] < (0.682, 1.677]]

In [118]:
factor = pd.cut(arr, [-5, -1, 0, 1, 5])

In [119]:
factor

[(0, 1], (-1, 0], (-5, -1], (0, 1], (-1, 0], ..., (0, 1], (0, 1], (-5, -1], (0, 1], (0, 1]]
Length: 20
Categories (4, object): [(-5, -1] < (-1, 0] < (0, 1] < (1, 5]]

In [120]:
arr = np.random.randn(30)

In [121]:
factor = pd.qcut(arr, [0, .25, .5, .75, 1])

In [122]:
factor

[(0.5, 1.215], (0.5, 1.215], (-0.1, 0.5], (-0.849, -0.1], (0.5, 1.215], ..., [-2.468, -0.849], (-0.849, -0.1], (-0.1, 0.5], (-0.1, 0.5], [-2.468, -0.849]]
Length: 30
Categories (4, object): [[-2.468, -0.849] < (-0.849, -0.1] < (-0.1, 0.5] < (0.5, 1.215]]

In [123]:
pd.value_counts(factor)

(0.5, 1.215]        8
[-2.468, -0.849]    8
(-0.1, 0.5]         7
(-0.849, -0.1]      7
dtype: int64

In [124]:
arr = np.random.randn(20)

In [125]:
factor = pd.cut(arr, [-np.inf, 0, np.inf])

In [126]:
factor

[(0, inf], (0, inf], (0, inf], (-inf, 0], (0, inf], ..., (-inf, 0], (-inf, 0], (0, inf], (-inf, 0], (-inf, 0]]
Length: 20
Categories (2, object): [(-inf, 0] < (0, inf]]

In [127]:
# Function application
# Row or Column-wise Function Application
df.apply(np.mean)

one      0.286054
three    0.010199
two     -0.311071
dtype: float64

In [128]:
df.apply(np.mean, axis=1)

a    0.323248
b   -0.431351
c    0.061681
d    0.053495
dtype: float64

In [129]:
df.apply(lambda x: x.max() - x.min())

one      1.297410
three    0.310509
two      1.239548
dtype: float64

In [130]:
df.apply(np.cumsum)

Unnamed: 0,one,three,two
a,0.35063,,0.295866
b,-0.044309,0.044568,-0.647816
c,0.858161,0.192838,-1.513512
d,,0.030598,-1.244282


In [131]:
df.apply(np.exp)

Unnamed: 0,one,three,two
a,1.419962,,1.34429
b,0.673721,1.045576,0.389192
c,2.465687,1.159825,0.420759
d,,0.850237,1.308956


In [132]:
tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
                     index=pd.date_range('1/1/2000', periods=1000))

In [133]:
tsdf.apply(lambda x: x.idxmax())

A   2002-03-25
B   2001-08-15
C   2001-04-22
dtype: datetime64[ns]

In [134]:
tsdf

Unnamed: 0,A,B,C
2000-01-01,-0.605017,-0.503466,-0.589260
2000-01-02,0.673715,-0.214184,0.036850
2000-01-03,-0.660706,-0.536584,0.615845
2000-01-04,-0.854516,0.856811,-1.474554
2000-01-05,-0.686327,-1.045775,0.003639
2000-01-06,0.118494,0.921428,0.971279
2000-01-07,0.809830,1.808542,-1.497320
2000-01-08,-0.297301,-0.242663,-1.976904
2000-01-09,-0.810761,0.821544,-0.409226
2000-01-10,1.796416,0.828419,-0.516571


In [135]:
tsdf.apply(pd.Series.interpolate)

Unnamed: 0,A,B,C
2000-01-01,-0.605017,-0.503466,-0.589260
2000-01-02,0.673715,-0.214184,0.036850
2000-01-03,-0.660706,-0.536584,0.615845
2000-01-04,-0.854516,0.856811,-1.474554
2000-01-05,-0.686327,-1.045775,0.003639
2000-01-06,0.118494,0.921428,0.971279
2000-01-07,0.809830,1.808542,-1.497320
2000-01-08,-0.297301,-0.242663,-1.976904
2000-01-09,-0.810761,0.821544,-0.409226
2000-01-10,1.796416,0.828419,-0.516571


In [136]:
# Applying elementwise Python functions
df4 = df
df4

Unnamed: 0,one,three,two
a,0.35063,,0.295866
b,-0.39494,0.044568,-0.943682
c,0.90247,0.148269,-0.865695
d,,-0.16224,0.26923


In [137]:
f = lambda x: len(str(x))

In [138]:
df4['one'].map(f)

a    14
b    15
c    13
d     3
Name: one, dtype: int64

In [139]:
df4.applymap(f)

Unnamed: 0,one,three,two
a,14,3,14
b,15,13,15
c,13,14,15
d,3,15,14


In [140]:
s = pd.Series(['six', 'seven', 'six', 'seven', 'six'],
               index=['a', 'b', 'c', 'd', 'e'])

In [141]:
t = pd.Series({'six' : 6., 'seven' : 7.})

In [142]:
s

a      six
b    seven
c      six
d    seven
e      six
dtype: object

In [143]:
s.map(t)

a    6
b    7
c    6
d    7
e    6
dtype: float64

In [144]:
# Reindexing and altering labels
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [145]:
s

a   -1.728846
b   -0.357358
c   -0.864404
d   -0.107477
e    0.191669
dtype: float64

In [146]:
s.reindex(['e', 'b', 'f', 'd'])

e    0.191669
b   -0.357358
f         NaN
d   -0.107477
dtype: float64

In [147]:
df

Unnamed: 0,one,three,two
a,0.35063,,0.295866
b,-0.39494,0.044568,-0.943682
c,0.90247,0.148269,-0.865695
d,,-0.16224,0.26923


In [148]:
df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one'])

Unnamed: 0,three,two,one
c,0.148269,-0.865695,0.90247
f,,,
b,0.044568,-0.943682,-0.39494


In [149]:
 rs = s.reindex(df.index)

In [150]:
rs

a   -1.728846
b   -0.357358
c   -0.864404
d   -0.107477
dtype: float64

In [151]:
rs.index is df.index

True

In [152]:
# Reindexing to align with another object
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [153]:
df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [154]:
df.reindex_like(df2)

Unnamed: 0,A,B
0,,
1,,
2,,
3,,
4,,
5,,


In [155]:
# Aligning objects with each other with align
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

a    0.842105
b    0.881107
c    0.898795
d   -0.997632
e    0.884702
dtype: float64

In [156]:
s1 = s[:4]
s1

a    0.842105
b    0.881107
c    0.898795
d   -0.997632
dtype: float64

In [157]:
s2 = s[1:]
s2

b    0.881107
c    0.898795
d   -0.997632
e    0.884702
dtype: float64

In [158]:
s1.align(s2)

(a    0.842105
 b    0.881107
 c    0.898795
 d   -0.997632
 e         NaN
 dtype: float64, a         NaN
 b    0.881107
 c    0.898795
 d   -0.997632
 e    0.884702
 dtype: float64)

In [159]:
s1.align(s2, join='inner')

(b    0.881107
 c    0.898795
 d   -0.997632
 dtype: float64, b    0.881107
 c    0.898795
 d   -0.997632
 dtype: float64)

In [160]:
s1.align(s2, join='left')

(a    0.842105
 b    0.881107
 c    0.898795
 d   -0.997632
 dtype: float64, a         NaN
 b    0.881107
 c    0.898795
 d   -0.997632
 dtype: float64)

In [161]:
df.align(df2, join='inner')

(Empty DataFrame
 Columns: []
 Index: [], Empty DataFrame
 Columns: []
 Index: [])

In [162]:
df.align(df2, join='inner', axis=0)

(Empty DataFrame
 Columns: [one, three, two]
 Index: [], Empty DataFrame
 Columns: [A, B]
 Index: [])

In [163]:
df.align(df2.ix[0], axis=1)

(    A   B      one     three       two
 a NaN NaN  0.35063       NaN  0.295866
 b NaN NaN -0.39494  0.044568 -0.943682
 c NaN NaN  0.90247  0.148269 -0.865695
 d NaN NaN      NaN -0.162240  0.269230, A         5
 B       NaN
 one     NaN
 three   NaN
 two     NaN
 Name: 0, dtype: float64)

In [164]:
# Filling while reindexing
rng = pd.date_range('1/3/2000', periods=8)

In [165]:
ts = pd.Series(np.random.randn(8), index=rng)

In [166]:
ts2 = ts[[0, 3, 6]]

In [167]:
ts

2000-01-03    0.694584
2000-01-04   -0.760721
2000-01-05   -0.165817
2000-01-06    1.509914
2000-01-07    0.305908
2000-01-08    0.323975
2000-01-09    0.875136
2000-01-10   -1.933676
Freq: D, dtype: float64

In [168]:
ts2

2000-01-03    0.694584
2000-01-06    1.509914
2000-01-09    0.875136
dtype: float64

In [169]:
ts2.reindex(ts.index)

2000-01-03    0.694584
2000-01-04         NaN
2000-01-05         NaN
2000-01-06    1.509914
2000-01-07         NaN
2000-01-08         NaN
2000-01-09    0.875136
2000-01-10         NaN
Freq: D, dtype: float64

In [170]:
ts2.reindex(ts.index, method='ffill')

2000-01-03    0.694584
2000-01-04    0.694584
2000-01-05    0.694584
2000-01-06    1.509914
2000-01-07    1.509914
2000-01-08    1.509914
2000-01-09    0.875136
2000-01-10    0.875136
Freq: D, dtype: float64

In [171]:
ts2.reindex(ts.index, method='bfill')

2000-01-03    0.694584
2000-01-04    1.509914
2000-01-05    1.509914
2000-01-06    1.509914
2000-01-07    0.875136
2000-01-08    0.875136
2000-01-09    0.875136
2000-01-10         NaN
Freq: D, dtype: float64

In [172]:
ts2.reindex(ts.index, method='nearest')

2000-01-03    0.694584
2000-01-04    0.694584
2000-01-05    1.509914
2000-01-06    1.509914
2000-01-07    1.509914
2000-01-08    0.875136
2000-01-09    0.875136
2000-01-10    0.875136
Freq: D, dtype: float64

In [173]:
ts2.reindex(ts.index).fillna(method='ffill')

2000-01-03    0.694584
2000-01-04    0.694584
2000-01-05    0.694584
2000-01-06    1.509914
2000-01-07    1.509914
2000-01-08    1.509914
2000-01-09    0.875136
2000-01-10    0.875136
Freq: D, dtype: float64

In [174]:
# Limits on filling while reindexing
ts2.reindex(ts.index, method='ffill', limit=1)

2000-01-03    0.694584
2000-01-04    0.694584
2000-01-05         NaN
2000-01-06    1.509914
2000-01-07    1.509914
2000-01-08         NaN
2000-01-09    0.875136
2000-01-10    0.875136
Freq: D, dtype: float64

In [175]:
ts2.reindex(ts.index, method='ffill', tolerance='1 day')

2000-01-03    0.694584
2000-01-04    0.694584
2000-01-05         NaN
2000-01-06    1.509914
2000-01-07    1.509914
2000-01-08         NaN
2000-01-09    0.875136
2000-01-10    0.875136
Freq: D, dtype: float64

In [176]:
# Dropping labels from an axis
df

Unnamed: 0,one,three,two
a,0.35063,,0.295866
b,-0.39494,0.044568,-0.943682
c,0.90247,0.148269,-0.865695
d,,-0.16224,0.26923


In [177]:
df.drop(['a', 'd'], axis=0)

Unnamed: 0,one,three,two
b,-0.39494,0.044568,-0.943682
c,0.90247,0.148269,-0.865695


In [178]:
df.drop(['one'], axis=1)

Unnamed: 0,three,two
a,,0.295866
b,0.044568,-0.943682
c,0.148269,-0.865695
d,-0.16224,0.26923


In [179]:
df.reindex(df.index.difference(['a', 'd']))

Unnamed: 0,one,three,two
b,-0.39494,0.044568,-0.943682
c,0.90247,0.148269,-0.865695


In [180]:
# Renaming / mapping labels
s

a    0.842105
b    0.881107
c    0.898795
d   -0.997632
e    0.884702
dtype: float64

In [181]:
s.rename(str.upper)

A    0.842105
B    0.881107
C    0.898795
D   -0.997632
E    0.884702
dtype: float64

In [182]:
df.rename(columns={'one' : 'foo', 'two' : 'bar'},
           index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'})

Unnamed: 0,foo,three,bar
apple,0.35063,,0.295866
banana,-0.39494,0.044568,-0.943682
c,0.90247,0.148269,-0.865695
durian,,-0.16224,0.26923


In [183]:
# Iteration
df = pd.DataFrame({'col1' : np.random.randn(3), 'col2' : np.random.randn(3)},
                   index=['a', 'b', 'c'])

In [184]:
for col in df:
     print(col)

col1
col2


In [185]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})

In [186]:
for index, row in df.iterrows():
     row['a'] = 10

In [187]:
for item, frame in wp.iteritems():
    print(item)
    print(frame)

Item1
                   A         B         C         D
2000-01-01  0.618553  0.795086  0.774164  0.382710
2000-01-02  0.445502 -0.281738 -1.206385  0.628830
2000-01-03  1.509694  0.798595  0.559829 -0.805349
2000-01-04 -0.647651 -1.077522  0.254855 -0.962241
2000-01-05 -0.379930 -1.086807  1.586222 -0.117334
Item2
                   A         B         C         D
2000-01-01 -0.553504 -0.487924 -1.890738  1.673118
2000-01-02 -1.312015  1.312733  1.215528  1.174602
2000-01-03 -0.006196  0.578754  0.226439  1.093874
2000-01-04 -1.540911 -0.025159  0.646433  1.223006
2000-01-05  0.616540  1.191169  0.756041 -1.176238


In [188]:
# iterrows
for row_index, row in df.iterrows():
    print('%s\n%s' % (row_index, row))

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


In [189]:
df_orig = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])

In [190]:
df_orig.dtypes

int        int64
float    float64
dtype: object

In [191]:
row = next(df_orig.iterrows())[1]

In [192]:
row

int      1.0
float    1.5
Name: 0, dtype: float64

In [193]:
row['int'].dtype

dtype('float64')

In [194]:
df_orig['int'].dtype

dtype('int64')

In [195]:
df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})

In [196]:
print(df2)

   x  y
0  1  4
1  2  5
2  3  6


In [197]:
print(df2.T)

   0  1  2
x  1  2  3
y  4  5  6


In [198]:
df2_t = pd.DataFrame(dict((idx,values) for idx, values in df2.iterrows()))

In [199]:
print(df2_t)

   0  1  2
x  1  2  3
y  4  5  6


In [200]:
# itertuples
for row in df.itertuples():
     print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')


In [201]:
# .dt accessor
s = pd.Series(pd.date_range('20130101 09:10:12', periods=4))

In [202]:
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [203]:
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [204]:
s.dt.second

0    12
1    12
2    12
3    12
dtype: int64

In [205]:
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [206]:
s[s.dt.day==2]

1   2013-01-02 09:10:12
dtype: datetime64[ns]

In [207]:
stz = s.dt.tz_localize('US/Eastern')

In [208]:
stz

0   2013-01-01 09:10:12-05:00
1   2013-01-02 09:10:12-05:00
2   2013-01-03 09:10:12-05:00
3   2013-01-04 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [209]:
stz.dt.tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

In [210]:
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

0   2013-01-01 04:10:12-05:00
1   2013-01-02 04:10:12-05:00
2   2013-01-03 04:10:12-05:00
3   2013-01-04 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [211]:
# Datetime Index
s = pd.Series(pd.date_range('20130101', periods=4))

In [212]:
s

0   2013-01-01
1   2013-01-02
2   2013-01-03
3   2013-01-04
dtype: datetime64[ns]

In [213]:
s.dt.strftime('%Y/%m/%d')

0    2013/01/01
1    2013/01/02
2    2013/01/03
3    2013/01/04
dtype: object

In [214]:
# Period Index
s = pd.Series(pd.period_range('20130101', periods=4))

In [215]:
s

0   2013-01-01
1   2013-01-02
2   2013-01-03
3   2013-01-04
dtype: object

In [216]:
s.dt.strftime('%Y/%m/%d')

0    2013/01/01
1    2013/01/02
2    2013/01/03
3    2013/01/04
dtype: object

In [217]:
# Period
s = pd.Series(pd.period_range('20130101', periods=4, freq='D'))

In [218]:
s

0   2013-01-01
1   2013-01-02
2   2013-01-03
3   2013-01-04
dtype: object

In [219]:
s.dt.year

0    2013
1    2013
2    2013
3    2013
dtype: int64

In [220]:
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [221]:
# Time Delta
s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s'))

In [222]:
s

0   1 days 00:00:05
1   1 days 00:00:06
2   1 days 00:00:07
3   1 days 00:00:08
dtype: timedelta64[ns]

In [223]:
s.dt.days

0    1
1    1
2    1
3    1
dtype: int64

In [224]:
s.dt.seconds

0    5
1    6
2    7
3    8
dtype: int64

In [225]:
s.dt.components

Unnamed: 0,days,hours,minutes,seconds,milliseconds,microseconds,nanoseconds
0,1,0,0,5,0,0,0
1,1,0,0,6,0,0,0
2,1,0,0,7,0,0,0
3,1,0,0,8,0,0,0


In [227]:
# Vectorized string methods
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

In [228]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [229]:
# Sorting
# By Index
unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'],
                          columns=['three', 'two', 'one'])

In [230]:
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,,
b,,,
c,,,
d,,,


In [231]:
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,,,
c,,,
b,,,
a,,,


In [232]:
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,,,
d,,,
c,,,
b,,,


In [233]:
unsorted_df['three'].sort_index()

a   NaN
b   NaN
c   NaN
d   NaN
Name: three, dtype: float64

In [234]:
# By Values
df1 = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]})

In [235]:
df1[['one', 'two', 'three']].sort_values(by=['one','two'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [236]:
s[2] = np.nan

In [237]:
s.sort_values()

0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
2     NaN
5     NaN
dtype: object

In [238]:
s.sort_values(na_position='first')

2     NaN
5     NaN
0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
dtype: object

In [239]:
# searchsorted
ser = pd.Series([1, 2, 3])

In [240]:
ser.searchsorted([0, 3])

array([0, 2])

In [241]:
ser.searchsorted([0, 4])

array([0, 3])

In [242]:
ser.searchsorted([1, 3], side='right')

array([1, 3])

In [243]:
ser.searchsorted([1, 3], side='left')

array([0, 2])

In [244]:
ser = pd.Series([3, 1, 2])

In [245]:
ser.searchsorted([0, 3], sorter=np.argsort(ser))

array([0, 2])

In [246]:
# smallest / largest values
s = pd.Series(np.random.permutation(10))

In [247]:
s

0    9
1    7
2    3
3    8
4    5
5    1
6    2
7    4
8    6
9    0
dtype: int64

In [248]:
s.sort_values()

9    0
5    1
6    2
2    3
7    4
4    5
8    6
1    7
3    8
0    9
dtype: int64

In [249]:
s.nsmallest(3)

9    0
5    1
6    2
dtype: int64

In [250]:
s.nlargest(3)

0    9
3    8
1    7
dtype: int64

In [251]:
df = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
                   'b': list('abdceff'),
                   'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})

In [252]:
df.nlargest(3, 'a')

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,


In [253]:
df.nlargest(5, ['a', 'c'])

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,
2,1,d,4.0
1,-1,b,2.0


In [254]:
df.nsmallest(3, 'a')

Unnamed: 0,a,b,c
0,-2,a,1
1,-1,b,2
6,-1,f,4


In [255]:
df.nsmallest(5, ['a', 'c'])

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0
2,1,d,4.0
4,8,e,


In [256]:
# Sorting by a multi-index column
df1.columns = pd.MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')])

In [257]:
df1.sort_values(by=('a','two'))

Unnamed: 0_level_0,a,a,b
Unnamed: 0_level_1,one,two,three
3,1,2,4
2,1,3,2
1,1,4,3
0,2,5,1


In [258]:
# dtypes
dft = pd.DataFrame(dict( A = np.random.rand(3),
                         B = 1,
                         C = 'foo',
                         D = pd.Timestamp('20010102'),
                         E = pd.Series([1.0]*3).astype('float32'),
                         F = False,
                         G = pd.Series([1]*3,dtype='int8')))

In [259]:
dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.147565,1,foo,2001-01-02,1,False,1
1,0.185941,1,foo,2001-01-02,1,False,1
2,0.985124,1,foo,2001-01-02,1,False,1


In [260]:
dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [261]:
dft['A'].dtype

dtype('float64')

In [262]:
pd.Series([1, 2, 3, 4, 5, 6.])

0    1
1    2
2    3
3    4
4    5
5    6
dtype: float64

In [263]:
pd.Series([1, 2, 3, 6., 'foo'])

0      1
1      2
2      3
3      6
4    foo
dtype: object

In [264]:
dft.get_dtype_counts()

bool              1
datetime64[ns]    1
float32           1
float64           1
int64             1
int8              1
object            1
dtype: int64

In [265]:
df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')

In [266]:
df1

Unnamed: 0,A
0,-0.774622
1,-0.299246
2,-0.875922
3,-0.243025
4,-1.102832
5,-0.74624
6,-1.171694
7,-1.285919


In [267]:
df1.dtypes

A    float32
dtype: object

In [268]:
df2 = pd.DataFrame(dict( A = pd.Series(np.random.randn(8), dtype='float16'),
                         B = pd.Series(np.random.randn(8)),
                         C = pd.Series(np.array(np.random.randn(8), dtype='uint8')) ))

In [269]:
df2

Unnamed: 0,A,B,C
0,-1.273438,-0.432107,0
1,-0.981445,-0.722863,0
2,-0.016708,-0.363594,0
3,2.105469,0.397811,2
4,-1.051758,-0.771426,1
5,0.604492,0.826952,255
6,0.385986,-0.775677,0
7,1.791992,0.954396,0


In [270]:
df2.dtypes

A    float16
B    float64
C      uint8
dtype: object

In [271]:
pd.DataFrame([1, 2], columns=['a']).dtypes

a    int64
dtype: object

In [272]:
pd.DataFrame({'a': [1, 2]}).dtypes

a    int64
dtype: object

In [273]:
pd.DataFrame({'a': 1 }, index=list(range(2))).dtypes

a    int64
dtype: object

In [274]:
frame = pd.DataFrame(np.array([1, 2]))

In [275]:
# upcasting
df3 = df1.reindex_like(df2).fillna(value=0.0) + df2

In [276]:
df3

Unnamed: 0,A,B,C
0,-2.04806,-0.432107,0
1,-1.280692,-0.722863,0
2,-0.89263,-0.363594,0
3,1.862444,0.397811,2
4,-2.15459,-0.771426,1
5,-0.141748,0.826952,255
6,-0.785708,-0.775677,0
7,0.506073,0.954396,0


In [277]:
df3.dtypes

A    float32
B    float64
C    float64
dtype: object

In [278]:
df3.values.dtype

dtype('float64')

In [279]:
# astype
df3

Unnamed: 0,A,B,C
0,-2.04806,-0.432107,0
1,-1.280692,-0.722863,0
2,-0.89263,-0.363594,0
3,1.862444,0.397811,2
4,-2.15459,-0.771426,1
5,-0.141748,0.826952,255
6,-0.785708,-0.775677,0
7,0.506073,0.954396,0


In [280]:
df3.dtypes

A    float32
B    float64
C    float64
dtype: object

In [281]:
df3.astype('float32').dtypes

A    float32
B    float32
C    float32
dtype: object

In [282]:
# object conversion
df3['D'] = '1.'

In [283]:
df3['E'] = '1'

In [284]:
df3['D'] = df3['D'].astype('float16')

In [285]:
df3['E'] = df3['E'].astype('int32')

In [286]:
df3.dtypes

A    float32
B    float64
C    float64
D    float16
E      int32
dtype: object

In [287]:
import datetime

In [288]:
s = pd.Series([datetime.datetime(2001,1,1,0,0),
               'foo', 1.0, 1, pd.Timestamp('20010104'),
               '20010105'], dtype='O')

In [289]:
s

0    2001-01-01 00:00:00
1                    foo
2                      1
3                      1
4    2001-01-04 00:00:00
5               20010105
dtype: object

In [290]:
s.convert_objects(convert_dates='coerce')

  if __name__ == '__main__':


0   2001-01-01
1          NaT
2          NaT
3          NaT
4   2001-01-04
5   2001-01-05
dtype: datetime64[ns]

In [291]:
# gotchas
dfi = df3.astype('int32')

In [292]:
dfi['E'] = 1

In [293]:
dfi

Unnamed: 0,A,B,C,D,E
0,-2,0,0,1,1
1,-1,0,0,1,1
2,0,0,0,1,1
3,1,0,2,1,1
4,-2,0,1,1,1
5,0,0,255,1,1
6,0,0,0,1,1
7,0,0,0,1,1


In [294]:
dfi.dtypes

A    int32
B    int32
C    int32
D    int32
E    int64
dtype: object

In [295]:
casted = dfi[dfi>0]

In [296]:
casted

Unnamed: 0,A,B,C,D,E
0,,,,1,1
1,,,,1,1
2,,,,1,1
3,1.0,,2.0,1,1
4,,,1.0,1,1
5,,,255.0,1,1
6,,,,1,1
7,,,,1,1


In [297]:
casted.dtypes

A    float64
B    float64
C    float64
D      int32
E      int64
dtype: object

In [298]:
dfa = df3.copy()

In [299]:
dfa['A'] = dfa['A'].astype('float32')

In [300]:
casted = dfa[df2>0]

In [301]:
casted

Unnamed: 0,A,B,C,D,E
0,,,,,
1,,,,,
2,,,,,
3,1.862444,0.397811,2.0,,
4,,,1.0,,
5,-0.141748,0.826952,255.0,,
6,-0.785708,,,,
7,0.506073,0.954396,,,


In [302]:
casted.dtypes

A    float32
B    float64
C    float64
D    float16
E    float64
dtype: object

In [303]:
# Selecting columns based on dtype
df = pd.DataFrame({ 'string': list('abc'),
                    'int64': list(range(1, 4)),
                    'uint8': np.arange(3, 6).astype('u1'),
                    'float64': np.arange(4.0, 7.0),
                    'bool1': [True, False, True],
                    'bool2': [False, True, False],
                    'dates': pd.date_range('now', periods=3).values,
                    'category': pd.Series(list("ABC")).astype('category')})

In [304]:
df['tdeltas'] = df.dates.diff()

In [305]:
df['uint64'] = np.arange(3, 6).astype('u8')

In [306]:
df['other_dates'] = pd.date_range('20130101', periods=3).values

In [307]:
df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern')

In [308]:
df

Unnamed: 0,bool1,bool2,category,dates,float64,int64,string,uint8,tdeltas,uint64,other_dates,tz_aware_dates
0,True,False,A,2016-02-16 07:01:23.485752,4,1,a,3,NaT,3,2013-01-01,2013-01-01 00:00:00-05:00
1,False,True,B,2016-02-17 07:01:23.485752,5,2,b,4,1 days,4,2013-01-02,2013-01-02 00:00:00-05:00
2,True,False,C,2016-02-18 07:01:23.485752,6,3,c,5,1 days,5,2013-01-03,2013-01-03 00:00:00-05:00


In [309]:
df.dtypes

bool1                                   bool
bool2                                   bool
category                            category
dates                         datetime64[ns]
float64                              float64
int64                                  int64
string                                object
uint8                                  uint8
tdeltas                      timedelta64[ns]
uint64                                uint64
other_dates                   datetime64[ns]
tz_aware_dates    datetime64[ns, US/Eastern]
dtype: object

In [310]:
df.select_dtypes(include=[bool])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


In [311]:
df.select_dtypes(include=['bool'])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


In [312]:
df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger'])

Unnamed: 0,bool1,bool2,float64,int64,tdeltas
0,True,False,4,1,NaT
1,False,True,5,2,1 days
2,True,False,6,3,1 days


In [313]:
df.select_dtypes(include=['object'])

Unnamed: 0,string
0,a
1,b
2,c


In [None]:
def subdtypes(dtype):
    subs = dtype.__subclasses__()
    if not subs:
        return dtype
    return [dtype, [subdtypes(dt) for dt in subs]]