In [79]:
import numpy as np
import pandas as pd

In [80]:
In [1]: index = pd.date_range('1/1/2000', periods=8)

In [2]: s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [3]: df = pd.DataFrame(np.random.randn(8, 3), index=index,
   ...:                   columns=['A', 'B', 'C'])
   ...: 

In [4]: wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
   ...:               major_axis=pd.date_range('1/1/2000', periods=5),
   ...:               minor_axis=['A', 'B', 'C', 'D'])
   ...: 

In [81]:
long_series = pd.Series(np.random.randn(1000))
long_series.head()

0   -0.582466
1   -0.610784
2   -0.276036
3   -0.271322
4   -0.685328
dtype: float64

In [82]:
long_series.tail()

995    1.162374
996   -1.219760
997   -1.020035
998    0.590491
999    0.409130
dtype: float64

## Attributes and the raw ndarray
- shape: axis dimensions 
- Axis labels:
    - Series: index
    - DataFrame: index(row) and columns
    - Panel: items, major_axis, and minor_axis

In [83]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,0.241812,-0.014659,0.307147
2000-01-02,0.352503,0.843579,1.437852


In [84]:
df.columns = [x.lower() for x in df.columns]
df[:2]

Unnamed: 0,a,b,c
2000-01-01,0.241812,-0.014659,0.307147
2000-01-02,0.352503,0.843579,1.437852


In [85]:
s.values

array([-1.43661173, -1.08717586, -1.26023504,  0.04442268,  0.97388229])

## Accelerated operations
- numexpr
- bottleneck

## Flexible binary operations
### Matching / broadcasting behavior

In [86]:
df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index = ['a', 'b', 'c']),
                  'two' : pd.Series(np.random.randn(4), index = ['a', 'b', 'c', 'd']),
                  'three' : pd.Series(np.random.randn(3), index = ['b', 'c', 'd'])
                  })
df

Unnamed: 0,one,three,two
a,-0.582705,,-0.219643
b,-0.773375,0.266226,-3.057832
c,-0.719738,-0.577806,-0.729143
d,,0.442752,-1.380926


In [87]:
# Select the second row by position; .iloc replaces the .ix indexer removed in pandas 1.0.
row = df.iloc[1]

In [88]:
column = df['two']

In [89]:
df.sub(row, axis='columns')

Unnamed: 0,one,three,two
a,0.19067,,2.838189
b,0.0,0.0,0.0
c,0.053637,-0.844032,2.328689
d,,0.176526,1.676906


In [90]:
df.sub(row, axis=1)

Unnamed: 0,one,three,two
a,0.19067,,2.838189
b,0.0,0.0,0.0
c,0.053637,-0.844032,2.328689
d,,0.176526,1.676906


In [91]:
df.sub(column, axis='index')

Unnamed: 0,one,three,two
a,-0.363062,,0.0
b,2.284457,3.324058,0.0
c,0.009405,0.151337,0.0
d,,1.823678,0.0


In [92]:
df.sub(column, axis=0)

Unnamed: 0,one,three,two
a,-0.363062,,0.0
b,2.284457,3.324058,0.0
c,0.009405,0.151337,0.0
d,,1.823678,0.0


In [93]:
dfmi = df.copy()

In [94]:
In [23]: dfmi.index = pd.MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')],
   ....:                                        names=['first','second'])
   ....: 

In [95]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,one,three,two
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,-0.582705,,-0.219643
1,b,-0.773375,0.266226,-3.057832
1,c,-0.719738,-0.577806,-0.729143
2,a,,0.442752,-1.380926


In [96]:
dfmi.sub(column, axis=0, level='second')

Unnamed: 0_level_0,Unnamed: 1_level_0,one,three,two
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,-0.363062,,0.0
1,b,2.284457,3.324058,0.0
1,c,0.009405,0.151337,0.0
2,a,,0.662395,-1.161283


### Missing data / operations with fill values

In [97]:
df

Unnamed: 0,one,three,two
a,-0.582705,,-0.219643
b,-0.773375,0.266226,-3.057832
c,-0.719738,-0.577806,-0.729143
d,,0.442752,-1.380926


In [98]:
df2 = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index = ['a', 'b', 'c']),
                  'two' : pd.Series(np.random.randn(4), index = ['a', 'b', 'c', 'd']),
                  'three' : pd.Series(np.random.randn(4), index = ['a', 'b', 'c', 'd'])
                  })
df2


Unnamed: 0,one,three,two
a,0.274982,0.259813,0.106765
b,-1.618926,0.206757,-0.010979
c,-1.772539,-0.954129,0.582046
d,,0.21552,-0.90443


In [99]:
df + df2

Unnamed: 0,one,three,two
a,-0.307723,,-0.112878
b,-2.392301,0.472983,-3.068812
c,-2.492277,-1.531935,-0.147097
d,,0.658272,-2.285356


In [100]:
df.add(df2, fill_value=0)

Unnamed: 0,one,three,two
a,-0.307723,0.259813,-0.112878
b,-2.392301,0.472983,-3.068812
c,-2.492277,-1.531935,-0.147097
d,,0.658272,-2.285356


### Flexible Comparisons
- eq
- ne
- lt
- gt
- le
- ge

In [101]:
df.gt(df2)

Unnamed: 0,one,three,two
a,False,False,False
b,True,True,False
c,True,True,False
d,False,True,False


In [102]:
df2.ne(df)

Unnamed: 0,one,three,two
a,True,True,True
b,True,True,True
c,True,True,True
d,True,True,True


### Boolean Reductions
- empty
- any()
- all()
- bool()

In [103]:
(df > 0).all()

one      False
three    False
two      False
dtype: bool

In [104]:
(df > 0).any().any()

True

In [105]:
df.empty

False

In [106]:
pd.Series([True])

0    True
dtype: bool

In [107]:
pd.Series([True]).bool()

True

In [108]:
pd.DataFrame([[True]])

Unnamed: 0,0
0,True


In [109]:
pd.DataFrame([[True]]).bool()

True

### Comparing if objects are equivalent

In [110]:
df + df == df*2

Unnamed: 0,one,three,two
a,True,False,True
b,True,True,True
c,True,True,True
d,False,True,True


In [111]:
(df + df == df*2).all()

one      False
three    False
two       True
dtype: bool

In [112]:
np.nan == np.nan

False

In [113]:
(df+df).equals(df*2)

True

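Note that the Series or DataFrame index needs to be in the same order for `equals()` to return True; for example, `df.equals(df.sort_index(ascending=False))` is False even though both frames hold the same labelled data.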

### Comparing array-like objects

In [114]:
In [51]: pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [115]:
pd.Index(['foo', 'bar', 'baz']) == 'foo'

array([ True, False, False], dtype=bool)

In [116]:
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [117]:
pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [118]:
pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo', 'bar'])

ValueError: Series lengths must match to compare

In [None]:
np.array([1, 2, 3]) == np.array([2])

In [None]:
np.array([1, 2, 3]) == np.array([1, 2])

### Combining overlapping data sets

In [None]:
In [57]: df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
   ....:                     'B' : [np.nan, 2., 3., np.nan, 6.]})
   ....: 

In [58]: df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
   ....:                     'B' : [np.nan, np.nan, 3., 4., 6., 8.]})
   ....: 

In [59]: df1

In [None]:
df2

In [None]:
In [61]: df1.combine_first(df2)

### General DataFrame Combine

In [None]:
In [62]: combiner = lambda x, y: np.where(pd.isnull(x), y, x)

In [63]: df1.combine(df2, combiner)

## Descriptive statistics
pandas provides a large number of methods for computing descriptive statistics and other related operations on Series, DataFrame, and Panel. Most of these are aggregations (hence producing a lower-dimensional result) like sum(), mean(), and quantile(), but some of them, like cumsum() and cumprod(), produce an object of the same size. Generally speaking, these methods take an axis argument, just like ndarray.{sum, std, ...}, but the axis can be specified by name or integer:
- Series: no axis argument needed
- DataFrame: “index” (axis=0, default), “columns” (axis=1)
- Panel: “items” (axis=0), “major” (axis=1, default), “minor” (axis=2)

In [None]:
df

In [None]:
df.mean(0)

In [None]:
df.mean(1)

In [None]:
df.sum(0, skipna=False)

In [None]:
df.sum(axis=1, skipna=True)

In [None]:
ts_stand = (df - df.mean()) / df.std()
ts_stand.std()

In [None]:
In [71]: xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)

In [72]: xs_stand.std(1)

In [None]:
In [73]: df.cumsum()


Function | Description
------|------
count |  Number of non-null observations
sum	| Sum of values
mean |	Mean of values
mad	 | Mean absolute deviation
median	| Arithmetic median of values
min	| Minimum
max	| Maximum
mode |	Mode
abs	| Absolute Value
prod |	Product of values
std	| Bessel-corrected sample standard deviation
var	| Unbiased variance
sem	| Standard error of the mean
skew |	Sample skewness (3rd moment)
kurt |	Sample kurtosis (4th moment)
quantile |	Sample quantile (value at %)
cumsum |	Cumulative sum
cumprod	| Cumulative product
cummax	| Cumulative maximum
cummin	| Cumulative minimum

In [None]:
In [74]: np.mean(df['one'])

In [None]:

In [75]: np.mean(df['one'].values)

In [None]:
In [76]: series = pd.Series(np.random.randn(500))

In [77]: series[20:500] = np.nan

In [78]: series[10:20]  = 5

In [79]: series.nunique()

### Summarizing data: describe

In [None]:
In [80]: series = pd.Series(np.random.randn(1000))

In [81]: series[::2] = np.nan

In [82]: series.describe()

In [None]:
# 1000 x 5 frame of standard-normal draws used to demonstrate describe().
frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])

# Blank out every other row by position so describe() has missing values to skip.
# .iloc replaces the .ix indexer, which was removed in pandas 1.0.
frame.iloc[::2] = np.nan

frame.describe()

In [None]:
In [86]: series.describe(percentiles=[.05, .25, .75, .95])

In [None]:
In [87]: s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])

In [88]: s.describe()

In [None]:
In [89]: frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)})

In [90]: frame.describe()

In [None]:
In [91]: frame.describe(include=['object'])

In [None]:
In [92]: frame.describe(include='all')


### Index of Min/Max Values
- idxmin()
- idxmax()

In [None]:

In [99]: df1.idxmin(axis=0)

In [None]:
In [101]: df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba'))

In [102]: df3

In [None]:
In [103]: df3['A'].idxmin()

### Value counts (histogramming) / Mode

In [None]:
In [104]: data = np.random.randint(0, 7, size=50)
In [106]: s = pd.Series(data)

In [107]: s.value_counts()

In [None]:
In [108]: pd.value_counts(data)

In [None]:
In [109]: s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])

In [110]: s5.mode()

In [None]:
In [111]: df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
   .....:                     "B": np.random.randint(-10, 15, size=50)})
   .....: 

In [112]: df5.mode()

### Discretization and quantiling

In [None]:
In [113]: arr = np.random.randn(20)

In [114]: factor = pd.cut(arr, 4)

In [115]: factor

In [None]:
In [116]: factor = pd.cut(arr, [-5, -1, 0, 1, 5])

In [117]: factor

In [None]:
In [118]: arr = np.random.randn(30)

In [119]: factor = pd.qcut(arr, [0, .25, .5, .75, 1])

In [120]: factor

In [None]:
In [121]: pd.value_counts(factor)

In [None]:
In [122]: arr = np.random.randn(20)

In [123]: factor = pd.cut(arr, [-np.inf, 0, np.inf])

In [124]: factor

## Function application

In [None]:
In [125]: import statsmodels.formula.api as sm

In [126]: bb = pd.read_csv('data/baseball.csv', index_col='id')

In [127]: (bb.query('h > 0')
   .....:    .assign(ln_h = lambda df: np.log(df.h))
   .....:    .pipe((sm.poisson, 'data'), 'hr ~ ln_h + year + g + C(lg)')
   .....:    .fit()
   .....:    .summary()
   .....: )
   .....: 

### Row or Column-wise Function Application

In [None]:
In [128]: df.apply(np.mean)

In [None]:
In [129]: df.apply(np.mean, axis=1)

In [None]:
In [130]: df.apply(lambda x: x.max() - x.min())

In [None]:
In [131]: df.apply(np.cumsum)

In [None]:
In [132]: df.apply(np.exp)

In [None]:
np.exp(df)

In [None]:
In [133]: tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
   .....:                     index=pd.date_range('1/1/2000', periods=1000))
   .....: 

In [134]: tsdf.apply(lambda x: x.idxmax())

In [None]:
def subtract_and_divide(x, sub, divide=1):
    """Shift ``x`` down by ``sub``, then scale the result by ``1 / divide``.

    ``x`` may be anything that supports ``-`` and ``/`` (a scalar, or the
    Series that ``DataFrame.apply`` passes in); ``divide`` defaults to 1,
    which leaves the shifted value unscaled.
    """
    shifted = x - sub
    return shifted / divide

df.apply(subtract_and_divide, args=(5,), divide=3)


In [None]:
tsdf.head()

In [None]:
In [136]: tsdf.apply(pd.Series.interpolate).head()

### Applying elementwise Python functions

In [None]:
# df4 is not defined anywhere earlier in this notebook, so a bare `df4` fails on a
# fresh kernel; work on a copy of df so the cell survives Restart & Run All.
df4 = df.copy()
df4

In [None]:
df

In [None]:
In [138]: f = lambda x: len(str(x))

In [None]:
df['one'].map(f)

In [None]:
df.applymap(f)

In [None]:
In [141]: s = pd.Series(['six', 'seven', 'six', 'seven', 'six'],
   .....:               index=['a', 'b', 'c', 'd', 'e'])
   .....: 

In [142]: t = pd.Series({'six' : 6., 'seven' : 7.})

In [143]: s

In [None]:
s.map(t)

### Applying with a Panel

## Reindexing and altering labels
reindex() is the fundamental data alignment method in pandas. It is used to implement nearly all other features relying on label-alignment functionality. To reindex means to conform the data to match a given set of labels along a particular axis. This accomplishes several things:

- Reorders the existing data to match a new set of labels
- Inserts missing value (NA) markers in label locations where no data for that label existed
- If specified, fill data for missing labels using logic (highly relevant to working with time series data)

In [None]:
In [165]: s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [166]: s

In [None]:
In [167]: s.reindex(['e', 'b', 'f', 'd'])

In [None]:
df

In [None]:
In [169]: df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one'])

In [None]:
In [170]: rs = s.reindex(df.index)

In [171]: rs

### Reindexing to align with another object

In [None]:
#df.reindex_like(df2)

### Aligning objects with each other with align
The align() method is the fastest way to simultaneously align two objects. It supports a join argument (related to joining and merging):

- join='outer': take the union of the indexes (default)
- join='left': use the calling object’s index
- join='right': use the passed object’s index
- join='inner': intersect the indexes

In [None]:
In [176]: s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [177]: s1 = s[:4]

In [178]: s2 = s[1:]

In [179]: s1.align(s2)

In [None]:
In [180]: s1.align(s2, join='inner')

In [None]:
In [181]: s1.align(s2, join='left')

In [None]:
df.align(df2, join='inner')

In [None]:
df.align(df2, join='inner', axis=0)

In [None]:
# Align df's columns with the first row of df2 (a Series indexed by column name).
# .iloc replaces the .ix indexer, which was removed in pandas 1.0.
df.align(df2.iloc[0], axis=1)

### Filling while reindexing
reindex() takes an optional parameter method which is a filling method chosen from the following table:

Method |	Action
---|---
pad / ffill |	Fill values forward
bfill / backfill |	Fill values backward
nearest	 | Fill from the nearest index value

In [None]:
In [185]: rng = pd.date_range('1/3/2000', periods=8)

In [186]: ts = pd.Series(np.random.randn(8), index=rng)

In [187]: ts2 = ts[[0, 3, 6]]

In [188]: ts

In [None]:
ts2

In [None]:
In [190]: ts2.reindex(ts.index)

In [None]:

In [191]: ts2.reindex(ts.index, method='ffill')

In [None]:
In [192]: ts2.reindex(ts.index, method='bfill')

In [None]:
In [193]: ts2.reindex(ts.index, method='nearest')


In [None]:
In [194]: ts2.reindex(ts.index).fillna(method='ffill')


### Limits on filling while reindexing

In [None]:
In [195]: ts2.reindex(ts.index, method='ffill', limit=1)


In [None]:
In [196]: ts2.reindex(ts.index, method='ffill', tolerance='1 day')


### Dropping labels from an axis

In [None]:
df

In [None]:
In [198]: df.drop(['a', 'd'], axis=0)


In [None]:
In [199]: df.drop(['one'], axis=1)


### Renaming / mapping labels

In [None]:
s

In [None]:
In [202]: s.rename(str.upper)


In [None]:
In [203]: df.rename(columns={'one' : 'foo', 'two' : 'bar'},
   .....:           index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'})
   .....: 

In [None]:
In [204]: s.rename("scalar-name")


### Iteration
The behavior of basic iteration over pandas objects depends on the type. When iterating over a Series, it is regarded as array-like, and basic iteration produces the values. Other data structures, like DataFrame and Panel, follow the dict-like convention of iterating over the “keys” of the objects.

In short, basic iteration (for i in object) produces:

- Series: values
- DataFrame: column labels
- Panel: item labels

In [None]:
In [205]: df = pd.DataFrame({'col1' : np.random.randn(3), 'col2' : np.random.randn(3)},
   .....:                   index=['a', 'b', 'c'])
   .....: 

In [206]: for col in df:
   .....:     print(col)
   .....: 

Pandas objects also have the dict-like iteritems() method to iterate over the (key, value) pairs.

To iterate over the rows of a DataFrame, you can use the following methods:

- iterrows(): Iterate over the rows of a DataFrame as (index, Series) pairs. This converts the rows to Series objects, which can change the dtypes and has some performance implications.
- itertuples(): Iterate over the rows of a DataFrame as namedtuples of the values. This is a lot faster than iterrows(), and is in most cases preferable to use to iterate over the values of a DataFrame.

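You should never modify something you are iterating over. Depending on the data types, the iterator may return a copy rather than a view, so writing to it can silently have no effect, as the following example shows.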

In [None]:
In [207]: df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})

In [208]: for index, row in df.iterrows():
   .....:     row['a'] = 10
   .....: 

In [209]: df

### iteritems
Consistent with the dict-like interface, iteritems() iterates through key-value pairs:

- Series: (index, scalar value) pairs
- DataFrame: (column, Series) pairs
- Panel: (item, DataFrame) pairs

In [None]:
In [210]: for item, frame in wp.iteritems():
   .....:     print(item)
   .....:     print(frame)
   .....: 

### iterrows

In [None]:
In [211]: for row_index, row in df.iterrows():
   .....:     print('%s\n%s' % (row_index, row))
   .....: 

In [None]:
In [212]: df_orig = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])

In [213]: df_orig.dtypes

In [None]:

In [214]: row = next(df_orig.iterrows())[1]

In [215]: row

In [None]:
In [216]: row['int'].dtype


In [None]:
In [217]: df_orig['int'].dtype


In [None]:
In [218]: df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})

In [219]: print(df2)

In [None]:

In [220]: print(df2.T)

In [None]:
In [221]: df2_t = pd.DataFrame(dict((idx,values) for idx, values in df2.iterrows()))

In [222]: print(df2_t)

In [None]:
In [223]: for row in df.itertuples():
   .....:     print(row)
   .....: 

### .dt accessor

In [None]:
# datetime
In [224]: s = pd.Series(pd.date_range('20130101 09:10:12', periods=4))

In [225]: s


In [None]:
In [226]: s.dt.hour


In [None]:
In [227]: s.dt.second


In [None]:
In [228]: s.dt.day


In [None]:
In [229]: s[s.dt.day==2]


In [None]:
In [230]: stz = s.dt.tz_localize('US/Eastern')

In [231]: stz

In [None]:
In [233]: s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')


In [None]:
# DatetimeIndex
In [234]: s = pd.Series(pd.date_range('20130101', periods=4))

In [235]: s

In [None]:
In [236]: s.dt.strftime('%Y/%m/%d')


In [None]:
# PeriodIndex
In [237]: s = pd.Series(pd.period_range('20130101', periods=4))

In [238]: s

In [None]:
In [239]: s.dt.strftime('%Y/%m/%d')


In [None]:
# period
In [240]: s = pd.Series(pd.period_range('20130101', periods=4, freq='D'))

In [241]: s

In [None]:
s.dt.year

In [None]:
# timedelta
In [244]: s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s'))

In [245]: s

## Vectorized string methods

In [None]:
In [249]: s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

In [250]: s.str.lower()

## Sorting

### By Index

In [119]:
In [251]: unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'],
   .....:                          columns=['three', 'two', 'one'])
   .....: 

In [120]:
unsorted_df

Unnamed: 0,three,two,one
a,,-0.219643,-0.582705
d,0.442752,-1.380926,
c,-0.577806,-0.729143,-0.719738
b,0.266226,-3.057832,-0.773375


In [121]:
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,-0.219643,-0.582705
b,0.266226,-3.057832,-0.773375
c,-0.577806,-0.729143,-0.719738
d,0.442752,-1.380926,


In [122]:
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,0.442752,-1.380926,
c,-0.577806,-0.729143,-0.719738
b,0.266226,-3.057832,-0.773375
a,,-0.219643,-0.582705


In [123]:
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,-0.582705,,-0.219643
d,,0.442752,-1.380926
c,-0.719738,-0.577806,-0.729143
b,-0.773375,0.266226,-3.057832


In [124]:
unsorted_df['three'].sort_index()

a         NaN
b    0.266226
c   -0.577806
d    0.442752
Name: three, dtype: float64

### By Values

In [125]:
In [256]: df1 = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]})

In [257]: df1.sort_values(by='two')

Unnamed: 0,one,three,two
0,2,5,1
2,1,3,2
1,1,4,3
3,1,2,4


In [126]:
In [258]: df1[['one', 'two', 'three']].sort_values(by=['one','two'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [127]:
In [259]: s[2] = np.nan

In [260]: s.sort_values()

a   -1.436612
b   -1.087176
d    0.044423
e    0.973882
c         NaN
dtype: float64

In [128]:
In [261]: s.sort_values(na_position='first')

c         NaN
a   -1.436612
b   -1.087176
d    0.044423
e    0.973882
dtype: float64

### searchsorted

In [129]:
In [262]: ser = pd.Series([1, 2, 3])

In [263]: ser.searchsorted([0, 3])

array([0, 2], dtype=int64)

In [132]:
In [263]: ser.searchsorted([0, 4])

array([0, 3], dtype=int64)

In [133]:
In [265]: ser.searchsorted([1, 3], side='right')

array([1, 3], dtype=int64)

In [134]:
In [266]: ser.searchsorted([1, 3], side='left')

array([0, 2], dtype=int64)

In [135]:

In [268]: ser.searchsorted([0, 3], sorter=np.argsort(ser))

array([0, 2], dtype=int64)

### smallest / largest values

In [136]:
In [269]: s = pd.Series(np.random.permutation(10))

In [270]: s

0    5
1    3
2    6
3    2
4    1
5    8
6    0
7    4
8    7
9    9
dtype: int32

In [137]:
s.sort_values()

6    0
4    1
3    2
1    3
7    4
0    5
2    6
8    7
5    8
9    9
dtype: int32

In [138]:
s.nsmallest(3)

6    0
4    1
3    2
dtype: int32

In [140]:
s.nlargest(3).nsmallest(1)

8    7
dtype: int32

In [141]:
In [274]: df = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
   .....:                    'b': list('abdceff'),
   .....:                    'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})
   .....: 

In [275]: df.nlargest(3, 'a')

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,


In [142]:

In [276]: df.nlargest(5, ['a', 'c'])

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,
2,1,d,4.0
1,-1,b,2.0


In [143]:
In [277]: df.nsmallest(3, 'a')

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0


In [144]:
In [278]: df.nsmallest(5, ['a', 'c'])


Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0
2,1,d,4.0
4,8,e,


### Sorting by a multi-index column

In [145]:
In [279]: df1.columns = pd.MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')])

In [280]: df1.sort_values(by=('a','two'))

Unnamed: 0_level_0,a,a,b
Unnamed: 0_level_1,one,two,three
3,1,2,4
2,1,3,2
1,1,4,3
0,2,5,1


## Copying
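The copy() method copies the underlying data of a pandas object, though it is seldom needed because most methods return a new object and leave the original untouched. A DataFrame is only altered in place by:
- Inserting, deleting, or modifying a column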
- Assigning to the index or columns attributes
- For homogeneous data, directly modifying the values via the values attribute or advanced indexing

### dtypes

In [146]:
In [281]: dft = pd.DataFrame(dict(A = np.random.rand(3),
   .....:                         B = 1,
   .....:                         C = 'foo',
   .....:                         D = pd.Timestamp('20010102'),
   .....:                         E = pd.Series([1.0]*3).astype('float32'),
   .....:                                     F = False,
   .....:                                     G = pd.Series([1]*3,dtype='int8')))
   .....: 

In [282]: dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.554824,1,foo,2001-01-02,1.0,False,1
1,0.08791,1,foo,2001-01-02,1.0,False,1
2,0.384694,1,foo,2001-01-02,1.0,False,1


In [147]:
dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [148]:
dft['A'].dtype

dtype('float64')

In [149]:
# string data forces an ``object`` dtype
In [286]: pd.Series([1, 2, 3, 6., 'foo'])

0      1
1      2
2      3
3      6
4    foo
dtype: object

In [150]:
# Count columns per dtype. DataFrame.get_dtype_counts() was removed in pandas 1.0;
# casting the dtypes to their string names lets the counts be sorted by name,
# which reproduces the ordering the old method returned.
dft.dtypes.astype(str).value_counts().sort_index()

bool              1
datetime64[ns]    1
float32           1
float64           1
int64             1
int8              1
object            1
dtype: int64

In [151]:
In [288]: df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')

In [289]: df1

Unnamed: 0,A
0,-1.719762
1,-0.646836
2,1.423485
3,-1.146387
4,0.481992
5,0.992624
6,-0.213724
7,0.145913


In [153]:

In [291]: df2 = pd.DataFrame(dict( A = pd.Series(np.random.randn(8), dtype='float16'),
   .....:                         B = pd.Series(np.random.randn(8)),
   .....:                         C = pd.Series(np.array(np.random.randn(8), dtype='uint8')) ))
   .....: 

In [292]: df2.dtypes

A    float16
B    float64
C      uint8
dtype: object

### defaults

In [154]:
In [294]: pd.DataFrame([1, 2], columns=['a']).dtypes

a    int64
dtype: object

In [155]:
In [295]: pd.DataFrame({'a': [1, 2]}).dtypes

a    int64
dtype: object

In [156]:
In [296]: pd.DataFrame({'a': 1 }, index=list(range(2))).dtypes

a    int64
dtype: object

In [157]:
pd.DataFrame(np.array([1, 2])).dtypes

0    int32
dtype: object

### upcasting

In [158]:
In [298]: df3 = df1.reindex_like(df2).fillna(value=0.0) + df2

In [299]: df3.dtypes

A    float32
B    float64
C    float64
dtype: object

### astype

In [159]:
df3.dtypes

A    float32
B    float64
C    float64
dtype: object

In [160]:
df3.astype('float32').dtypes

A    float32
B    float32
C    float32
dtype: object

### object conversion

In [161]:
# Add two object (string) columns that hold numeric text.
df3['D'] = '1.'
df3['E'] = '1'

# DataFrame.convert_objects() was removed in pandas 1.0; pd.to_numeric applied
# column by column parses the string columns ('1.' -> float64, '1' -> int64)
# and leaves the already-numeric columns as they are.
df3.apply(pd.to_numeric).dtypes



A    float32
B    float64
C    float64
D    float64
E      int64
dtype: object

### gotchas

In [163]:
#

### Selecting columns based on dtype

In [164]:
In [328]: df = pd.DataFrame({'string': list('abc'),
   .....:                    'int64': list(range(1, 4)),
   .....:                    'uint8': np.arange(3, 6).astype('u1'),
   .....:                    'float64': np.arange(4.0, 7.0),
   .....:                    'bool1': [True, False, True],
   .....:                    'bool2': [False, True, False],
   .....:                    'dates': pd.date_range('now', periods=3).values,
   .....:                    'category': pd.Series(list("ABC")).astype('category')})
   .....: 


In [165]:
df

Unnamed: 0,bool1,bool2,category,dates,float64,int64,string,uint8
0,True,False,A,2016-03-26 14:49:40.568889,4.0,1,a,3
1,False,True,B,2016-03-27 14:49:40.568889,5.0,2,b,4
2,True,False,C,2016-03-28 14:49:40.568889,6.0,3,c,5


In [166]:
In [329]: df['tdeltas'] = df.dates.diff()

In [330]: df['uint64'] = np.arange(3, 6).astype('u8')

In [331]: df['other_dates'] = pd.date_range('20130101', periods=3).values

In [332]: df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern')

In [333]: df

Unnamed: 0,bool1,bool2,category,dates,float64,int64,string,uint8,tdeltas,uint64,other_dates,tz_aware_dates
0,True,False,A,2016-03-26 14:49:40.568889,4.0,1,a,3,NaT,3,2013-01-01,2013-01-01 00:00:00-05:00
1,False,True,B,2016-03-27 14:49:40.568889,5.0,2,b,4,1 days,4,2013-01-02,2013-01-02 00:00:00-05:00
2,True,False,C,2016-03-28 14:49:40.568889,6.0,3,c,5,1 days,5,2013-01-03,2013-01-03 00:00:00-05:00


In [167]:
df.dtypes

bool1                                   bool
bool2                                   bool
category                            category
dates                         datetime64[ns]
float64                              float64
int64                                  int64
string                                object
uint8                                  uint8
tdeltas                      timedelta64[ns]
uint64                                uint64
other_dates                   datetime64[ns]
tz_aware_dates    datetime64[ns, US/Eastern]
dtype: object

In [168]:
In [335]: df.select_dtypes(include=[bool])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


In [169]:
In [337]: df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger'])


Unnamed: 0,bool1,bool2,float64,int64,tdeltas
0,True,False,4.0,1,NaT
1,False,True,5.0,2,1 days
2,True,False,6.0,3,1 days


In [170]:
In [338]: df.select_dtypes(include=['object'])

Unnamed: 0,string
0,a
1,b
2,c


In [171]:
def subdtypes(dtype):
    """Return the subclass tree rooted at ``dtype`` as a nested list.

    A class with no direct subclasses is returned as itself. Otherwise the
    result is ``[dtype, [tree, ...]]`` with one ``tree`` (built recursively
    by this function) per direct subclass, in the order reported by
    ``dtype.__subclasses__()``.
    """
    subs = dtype.__subclasses__()
    if not subs:
        # Leaf of the hierarchy: no nesting needed.
        return dtype
    return [dtype, [subdtypes(dt) for dt in subs]]

In [172]:
subdtypes(np.generic)

[numpy.generic,
 [[numpy.number,
   [[numpy.integer,
     [[numpy.signedinteger,
       [numpy.int8,
        numpy.int16,
        numpy.int64,
        numpy.int32,
        numpy.timedelta64,
        numpy.int32]],
      [numpy.unsignedinteger,
       [numpy.uint8,
        numpy.uint16,
        numpy.uint32,
        numpy.uint64,
        numpy.uint32]]]],
    [numpy.inexact,
     [[numpy.floating,
       [numpy.float16, numpy.float32, numpy.float64, numpy.float64]],
      [numpy.complexfloating,
       [numpy.complex64, numpy.complex128, numpy.complex128]]]]]],
  [numpy.flexible,
   [[numpy.character, [numpy.bytes_, numpy.str_]],
    [numpy.void, [numpy.record]]]],
  numpy.datetime64,
  numpy.bool_,
  numpy.object_]]