# Pandas - Selection & Indexing

### Contents
 - [Selection by Location](#location)
 - [Selection by Callable](#callable)
 - [Selection by Where](#where)
 - [Indexing](#indexing)
 

In [148]:
from IPython.display import HTML

# Read both custom stylesheets and inject them into the notebook as a single
# <style> element. Each file is opened in a `with` block so its handle is
# closed as soon as the contents have been read.
css_parts = []
for css_path in ('styles/style-table.css', 'styles/style-notebook.css'):
    with open(css_path) as css_file:
        css_parts.append(css_file.read())
css = ''.join(css_parts)
HTML('<style>{}</style>'.format(css))

In [149]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

#pd.set_option('display.notebook_repr_html', False)
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', 150)
#pd.set_option('display.max_seq_items', None)
 
#%config InlineBackend.figure_formats = {'pdf',}
%matplotlib inline

import seaborn as sns
sns.set_context('notebook')
sns.set_style('darkgrid')

<a id="selection" />
----
### Selection

In [150]:
# Seed NumPy's global generator so the random frames built in this notebook
# come out the same on every top-to-bottom run.
np.random.seed(42)

# 8 consecutive daily timestamps as the row index, 4 columns of standard-normal draws.
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,1.957795,1.194402,0.560173,-0.202792
2000-01-02,-0.794205,1.531326,0.346793,1.009707
2000-01-03,-1.525923,0.551424,0.562002,-0.86958
2000-01-04,1.43141,0.991284,-1.727871,-0.712236
2000-01-05,0.064854,-0.637659,-2.556065,-1.411646
2000-01-06,-1.486389,-0.309166,1.057885,-0.267456
2000-01-07,0.30741,1.198607,-0.274807,0.292673
2000-01-08,-0.957134,-1.146282,-0.114233,3.022594


In [151]:
# Basic Selection
# Two spellings that pull column 'A' out of the frame as a Series:
# bracket access first, then attribute access (which rebinds s to the same column).
s = df['A']
s = df.A
s

2000-01-01    1.957795
2000-01-02   -0.794205
2000-01-03   -1.525923
2000-01-04    1.431410
2000-01-05    0.064854
2000-01-06   -1.486389
2000-01-07    0.307410
2000-01-08   -0.957134
Freq: D, Name: A, dtype: float64

In [152]:
# Swap the contents of columns A and B: the right-hand side is read first,
# then its two columns are written into B and A respectively.
df[['B', 'A']] = df[['A', 'B']]
df

Unnamed: 0,A,B,C,D
2000-01-01,1.194402,1.957795,0.560173,-0.202792
2000-01-02,1.531326,-0.794205,0.346793,1.009707
2000-01-03,0.551424,-1.525923,0.562002,-0.86958
2000-01-04,0.991284,1.43141,-1.727871,-0.712236
2000-01-05,-0.637659,0.064854,-2.556065,-1.411646
2000-01-06,-0.309166,-1.486389,1.057885,-0.267456
2000-01-07,1.198607,0.30741,-0.274807,0.292673
2000-01-08,-1.146282,-0.957134,-0.114233,3.022594


In [153]:
# Append a column E that numbers the rows 0 .. n-1
row_count = len(df.index)
df['E'] = [position for position in range(row_count)]
df

Unnamed: 0,A,B,C,D,E
2000-01-01,1.194402,1.957795,0.560173,-0.202792,0
2000-01-02,1.531326,-0.794205,0.346793,1.009707,1
2000-01-03,0.551424,-1.525923,0.562002,-0.86958,2
2000-01-04,0.991284,1.43141,-1.727871,-0.712236,3
2000-01-05,-0.637659,0.064854,-2.556065,-1.411646,4
2000-01-06,-0.309166,-1.486389,1.057885,-0.267456,5
2000-01-07,1.198607,0.30741,-0.274807,0.292673,6
2000-01-08,-1.146282,-0.957134,-0.114233,3.022594,7


<a id="location"/>
----
### Selection by Location

In [154]:
# Slicing using index position
df.iloc[1]

A    1.531326
B   -0.794205
C    0.346793
D    1.009707
E    1.000000
Name: 2000-01-02 00:00:00, dtype: float64

In [155]:
df.iloc[:3]

Unnamed: 0,A,B,C,D,E
2000-01-01,1.194402,1.957795,0.560173,-0.202792,0
2000-01-02,1.531326,-0.794205,0.346793,1.009707,1
2000-01-03,0.551424,-1.525923,0.562002,-0.86958,2


In [156]:
# Select by row and columns
df.iloc[:3, 0:3]

Unnamed: 0,A,B,C
2000-01-01,1.194402,1.957795,0.560173
2000-01-02,1.531326,-0.794205,0.346793
2000-01-03,0.551424,-1.525923,0.562002


In [157]:
# Assignment using selection
dfa = df.copy()
dfa.iloc[:3, 0:3] = 0
dfa

Unnamed: 0,A,B,C,D,E
2000-01-01,0.0,0.0,0.0,-0.202792,0
2000-01-02,0.0,0.0,0.0,1.009707,1
2000-01-03,0.0,0.0,0.0,-0.86958,2
2000-01-04,0.991284,1.43141,-1.727871,-0.712236,3
2000-01-05,-0.637659,0.064854,-2.556065,-1.411646,4
2000-01-06,-0.309166,-1.486389,1.057885,-0.267456,5
2000-01-07,1.198607,0.30741,-0.274807,0.292673,6
2000-01-08,-1.146282,-0.957134,-0.114233,3.022594,7


In [158]:
# Selection by integer list
df.iloc[[1,4],[0,1,2]]

Unnamed: 0,A,B,C
2000-01-02,1.531326,-0.794205,0.346793
2000-01-05,-0.637659,0.064854,-2.556065


In [159]:
# Not specifying columns is the same as selecting all columns
# Select the rows at positions 1 and 2 (the end of an iloc slice is excluded) with ALL columns
df.iloc[1:3]

Unnamed: 0,A,B,C,D,E
2000-01-02,1.531326,-0.794205,0.346793,1.009707,1
2000-01-03,0.551424,-1.525923,0.562002,-0.86958,2


In [160]:
# Select the rows at positions 1 and 2 (end of the slice excluded); the explicit ':' means ALL columns
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D,E
2000-01-02,1.531326,-0.794205,0.346793,1.009707,1
2000-01-03,0.551424,-1.525923,0.562002,-0.86958,2


In [161]:
# Select all rows for the columns at positions 1 and 2 (B and C); position 3 is excluded
df.iloc[:,1:3]

Unnamed: 0,B,C
2000-01-01,1.957795,0.560173
2000-01-02,-0.794205,0.346793
2000-01-03,-1.525923,0.562002
2000-01-04,1.43141,-1.727871
2000-01-05,0.064854,-2.556065
2000-01-06,-1.486389,1.057885
2000-01-07,0.30741,-0.274807
2000-01-08,-0.957134,-0.114233


In [162]:
# Getting a specific value. same as iat
# Both lines read the scalar at row position 1, column position 1;
# only the last expression of the cell (the iat lookup) is displayed.
df.iloc[1,1]
df.iat[1,1]

-0.79420520196443001

<a id="callable" />
----
### Selection by Callable

In [163]:
# 6x4 frame of standard-normal draws, rows labelled a-f and columns A-D
row_labels = ['a', 'b', 'c', 'd', 'e', 'f']
column_labels = ['A', 'B', 'C', 'D']
df1 = pd.DataFrame(np.random.randn(len(row_labels), len(column_labels)),
                   index=row_labels, columns=column_labels)
df1

Unnamed: 0,A,B,C,D
a,1.607272,0.14257,0.260287,-0.25074
b,0.189205,-1.500316,-0.059333,-0.100386
c,-1.800183,-1.015642,-0.597815,-0.424448
d,0.231119,0.598735,-0.570128,-0.553477
e,-1.014603,0.598347,-0.079898,-0.283085
f,0.685932,-1.255316,0.622536,0.895137


In [164]:
# Select rows where A value is > 0
df1[df1.A > 0]

Unnamed: 0,A,B,C,D
a,1.607272,0.14257,0.260287,-0.25074
b,0.189205,-1.500316,-0.059333,-0.100386
d,0.231119,0.598735,-0.570128,-0.553477
f,0.685932,-1.255316,0.622536,0.895137


In [165]:
# Select rows where A value is > 0
df1.loc[df1.A > 0]

Unnamed: 0,A,B,C,D
a,1.607272,0.14257,0.260287,-0.25074
b,0.189205,-1.500316,-0.059333,-0.100386
d,0.231119,0.598735,-0.570128,-0.553477
f,0.685932,-1.255316,0.622536,0.895137


In [166]:
# Select rows where A > 0 and select only columns A & B
df1.loc[df1.A > 0, ['A','B']]

Unnamed: 0,A,B
a,1.607272,0.14257
b,0.189205,-1.500316
d,0.231119,0.598735
f,0.685932,-1.255316


In [167]:
# Select rows where A > 0 and select only columns A & B
# Written as chained indexing: the rows are filtered first, then the columns
# are picked from that intermediate frame. Fine for reading; for assignment
# use the single .loc[rows, cols] form instead.
df1[df1.A > 0][['A','B']]

Unnamed: 0,A,B
a,1.607272,0.14257
b,0.189205,-1.500316
d,0.231119,0.598735
f,0.685932,-1.255316


In [171]:
# Small mixed-type frame: two string columns plus one column of random floats
a_labels = 'one one two three two one six'.split()
b_labels = list('xyyxyxx')
df2 = pd.DataFrame({
    'a': a_labels,
    'b': b_labels,
    'c': np.random.randn(len(a_labels)),
})
df2

Unnamed: 0,a,b,c
0,one,x,0.835523
1,one,y,0.126119
2,two,y,-2.402926
3,three,x,-0.045849
4,two,y,0.751223
5,one,x,0.746446
6,six,x,-0.189913


In [175]:
# Build a boolean mask by calling a Python function on each value of column
# 'a', then keep only the rows where the mask is True.
criterion = df2['a'].map(lambda label: label.startswith('t'))
df2[criterion]

Unnamed: 0,a,b,c
2,two,y,-2.402926
3,three,x,-0.045849
4,two,y,0.751223


In [178]:
# same as above but slower
df2[[x.startswith('t') for x in df2.a]]

Unnamed: 0,a,b,c
2,two,y,-2.402926
3,three,x,-0.045849
4,two,y,0.751223


In [179]:
df2[criterion & (df2['b'] == 'x')]

Unnamed: 0,a,b,c
3,three,x,-0.045849


In [180]:
df2.loc[criterion & (df2['b'] == 'x'),'b':'c']

Unnamed: 0,b,c
3,x,-0.045849


In [168]:
# Slicing using label
df.loc['2000-01-01']

A    1.194402
B    1.957795
C    0.560173
D   -0.202792
E    0.000000
Name: 2000-01-01 00:00:00, dtype: float64

### Selection with "is in"

In [182]:
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [188]:
# Keep the elements whose VALUE is one of the listed values
# (s.isin(...) on its own returns the boolean mask; 6 is not in s, so it matches nothing)
s[s.isin([2,4,6])]

2    2
0    4
dtype: int64

In [186]:
# Keep the elements whose INDEX label is one of the listed values
s[s.index.isin([2, 4, 6])]

4    0
2    2
dtype: int64

In [189]:
# Frame with one numeric and two string columns, used for the isin() examples
f_columns = {}
f_columns['vals'] = list(range(1, 5))
f_columns['ids'] = list('abfn')
f_columns['ids2'] = list('ancn')
f = pd.DataFrame(f_columns)
f

Unnamed: 0,ids,ids2,vals
0,a,a,1
1,b,n,2
2,f,c,3
3,n,n,4


In [190]:
# With a list, every cell of every column is tested against the same values
values = ['a', 'b', 1, 3]
f.isin(values)

Unnamed: 0,ids,ids2,vals
0,True,True,True
1,True,False,False
2,False,False,True
3,False,False,False


In [192]:
# With a dict, each key names a column and its list holds the values to match
# in that column only; a column not named in the dict (ids2) comes back all False
values = {'ids': ['a', 'b'], 'vals': [1, 3]}
f.isin(values)

Unnamed: 0,ids,ids2,vals
0,True,False,True
1,True,False,False
2,False,False,True
3,False,False,False


In [169]:
df.loc['2000-01-01': '2000-01-05']

Unnamed: 0,A,B,C,D,E
2000-01-01,1.194402,1.957795,0.560173,-0.202792,0
2000-01-02,1.531326,-0.794205,0.346793,1.009707,1
2000-01-03,0.551424,-1.525923,0.562002,-0.86958,2
2000-01-04,0.991284,1.43141,-1.727871,-0.712236,3
2000-01-05,-0.637659,0.064854,-2.556065,-1.411646,4


<a id="where" />
----
### where( )
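- Boolean indexing that keeps the original shape of the data: entries that fail the condition become NaN instead of being dropped (for a Series this differs from plain boolean indexing, which drops them).
- An optional second argument supplies replacement values for the entries that fail the condition.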

In [194]:
# Fresh example data for the where() section: a small integer Series and a
# date-indexed frame of random floats (s, dates and df are all rebound here).
s = pd.Series(list(range(6)))

dates = pd.date_range(start='2015-1-1', periods=8)
df = pd.DataFrame(
    np.random.randn(len(dates), 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2015-01-01,-0.394545,0.17854,0.832353,-1.130516
2015-01-02,0.485004,0.118978,-0.902221,0.111592
2015-01-03,0.949849,-1.194467,1.152051,-1.20408
2015-01-04,-0.616025,-1.35144,-0.183004,-2.520606
2015-01-05,0.625008,-0.192718,-0.665486,0.168708
2015-01-06,0.873265,1.180235,-0.139416,2.289179
2015-01-07,0.820911,-0.390134,-0.711251,-0.641265
2015-01-08,0.006094,-1.093164,0.459006,-0.830649


In [200]:
df[df >0]

Unnamed: 0,A,B,C,D
2015-01-01,,0.17854,0.832353,
2015-01-02,0.485004,0.118978,,0.111592
2015-01-03,0.949849,,1.152051,
2015-01-04,,,,
2015-01-05,0.625008,,,0.168708
2015-01-06,0.873265,1.180235,,2.289179
2015-01-07,0.820911,,,
2015-01-08,0.006094,,0.459006,


#### Same as ...

In [201]:
df.where(df > 0)

Unnamed: 0,A,B,C,D
2015-01-01,,0.17854,0.832353,
2015-01-02,0.485004,0.118978,,0.111592
2015-01-03,0.949849,,1.152051,
2015-01-04,,,,
2015-01-05,0.625008,,,0.168708
2015-01-06,0.873265,1.180235,,2.289179
2015-01-07,0.820911,,,
2015-01-08,0.006094,,0.459006,


In [204]:
# where takes an optional other argument for replacement of values where the condition is False
df.where(df > 0, -df)

Unnamed: 0,A,B,C,D
2015-01-01,0.394545,0.17854,0.832353,1.130516
2015-01-02,0.485004,0.118978,0.902221,0.111592
2015-01-03,0.949849,1.194467,1.152051,1.20408
2015-01-04,0.616025,1.35144,0.183004,2.520606
2015-01-05,0.625008,0.192718,0.665486,0.168708
2015-01-06,0.873265,1.180235,0.139416,2.289179
2015-01-07,0.820911,0.390134,0.711251,0.641265
2015-01-08,0.006094,1.093164,0.459006,0.830649


In [208]:
# inplace=True makes where() modify df_orig directly instead of returning a
# new frame; df_orig is a copy, so df itself is left unchanged by this cell.
# NOTE(review): the execution counts suggest the saved output was produced
# after the `df[df > 0] = 0` cell had already run, which would explain the
# -0.0 entries. Re-run the notebook top to bottom to confirm.
df_orig = df.copy()
df_orig.where(df > 0, -df, inplace=True);
df_orig

Unnamed: 0,A,B,C,D
2015-01-01,0.394545,-0.0,-0.0,1.130516
2015-01-02,-0.0,-0.0,0.902221,-0.0
2015-01-03,-0.0,1.194467,-0.0,1.20408
2015-01-04,0.616025,1.35144,0.183004,2.520606
2015-01-05,-0.0,0.192718,0.665486,-0.0
2015-01-06,-0.0,-0.0,0.139416,-0.0
2015-01-07,-0.0,0.390134,0.711251,0.641265
2015-01-08,-0.0,1.093164,-0.0,0.830649


In [207]:
# You may wish to set values based on some boolean criteria
df[df > 0] = 0
df

Unnamed: 0,A,B,C,D
2015-01-01,-0.394545,0.0,0.0,-1.130516
2015-01-02,0.0,0.0,-0.902221,0.0
2015-01-03,0.0,-1.194467,0.0,-1.20408
2015-01-04,-0.616025,-1.35144,-0.183004,-2.520606
2015-01-05,0.0,-0.192718,-0.665486,0.0
2015-01-06,0.0,0.0,-0.139416,0.0
2015-01-07,0.0,-0.390134,-0.711251,-0.641265
2015-01-08,0.0,-1.093164,0.0,-0.830649


In [210]:
# Combine two element-wise comparisons with & (each wrapped in parentheses)
# to keep only the rows where A < B < C
df1[(df1.A < df1.B) & (df1.B < df1.C)]

Unnamed: 0,A,B,C,D
c,-1.800183,-1.015642,-0.597815,-0.424448


#### But for Series there is a difference

In [202]:
s[s<4]

0    0
1    1
2    2
3    3
dtype: int64

In [203]:
s.where(s<4)

0    0.0
1    1.0
2    2.0
3    3.0
4    NaN
5    NaN
dtype: float64

In [None]:
df2 = df.copy()
df2

In [None]:
df2 < 0

In [None]:
# Where a value is not > 0, take the replacement from the first row (df2.iloc[0]).
# NOTE(review): axis=1 presumably lines that row up with the columns - confirm against the where() docs
df2.where(df2>0,df2.iloc[0],axis=1)

In [None]:
# Item table: one row per item, labelled aa .. ee
item_data = {
    'item': list('abcde'),
    'iclass': ['A3', 'A3', 'B4', 8, 1],   # mixed str / int values
    'price': [8, 12, 21, 44, 27],
}
df1 = pd.DataFrame(item_data, index=('aa', 'bb', 'cc', 'dd', 'ee'))

# Second table pairing an item class with a group name
class_data = {
    'iclass': ['A3', 'B4', 8, 'D6'],
    'grp': ['WIP', 'RM', 'FG', 'None'],
}
df2 = pd.DataFrame(class_data)

# Reorder the columns and add a new column 'foo' filled with 'unknown'
# (the result is only displayed; it is not assigned back to df1)
wanted_columns = ['iclass', 'item', 'price', 'foo']
df1.reindex(columns=wanted_columns, fill_value='unknown')

In [None]:
# Label based indexing
df1.loc['aa':'cc']

In [None]:
df1.loc['cc':'ee', 'item']

In [None]:
# Position based indexing
df2.iloc[1:3]

In [None]:
df2.iloc[1:3, 0]

In [None]:
#cross section
df2.iloc[2]

In [None]:
# .ix was removed in pandas 1.0. With a string-labelled index it treated the
# integer slice :2 as positional, so the explicit equivalent is iloc for the
# rows plus the integer position of the 'iclass' column.
df1.iloc[:2, df1.columns.get_loc('iclass')]

In [None]:
# Boolean row mask and column label in one .loc call (replaces the removed
# .ix indexer and avoids chained indexing)
df1.loc[df1.iclass == 'A3', 'item']

In [None]:
# Row at integer position 3. .ix fell back to a positional lookup here because
# the index holds strings; .ix itself was removed in pandas 1.0.
df1.iloc[3]

In [None]:
# 3x4 frame holding the floats 0.0 .. 11.0 in row-major order, columns a-d
df3_values = np.arange(12, dtype=float).reshape(3, 4)
df3 = pd.DataFrame(df3_values, columns=['a', 'b', 'c', 'd'])
df3

In [None]:
# df4 is a 4x5 frame with columns a-e. add() lines the two frames up on row
# and column labels; fill_value=10 stands in for a value that is missing from
# one of the two frames before the addition.
df4=pd.DataFrame(np.arange(20.).reshape((4,5)), columns=list('abcde'))
df3.add(df4, fill_value=10)

In [None]:
# Deleting
# drop([2]) returns a new frame without the row labelled 2; the result is not
# assigned, so df4 keeps all of its rows
df4.drop([2])

### Multi Indexing

In [None]:
# Build a DataFrame that has a MultiIndex on the rows AND on the columns

# Row index: every (Operator, Facility, Shift) combination -> 4 * 3 * 2 = 24 rows
operators = ['A', 'B', 'C', 'D']
facilities = ['XX', 'YY', 'ZZ']
shifts = [1, 2]
idx = pd.MultiIndex.from_product(
    [operators, facilities, shifts],
    names=['Operator', 'Facility', 'Shift'],
)

# Column index: one (Machine type, MachineID) pair per column -> 3 columns
arrays = (
    np.array(['OB11', 'OB11', 'HH90']),
    np.array(['M1', 'M2', 'M3']),
)
idx2 = pd.MultiIndex.from_arrays(arrays, names=['Machine type', 'MachineID'])

# Random integers in [0, 50), sized to match the two indexes
df = pd.DataFrame(
    np.random.randint(0, 50, (len(idx), len(idx2))),
    index=idx,
    columns=idx2,
)
df

### Slicing

In [None]:
# Index must be sorted
# Sort the columns, then the rows, before slicing by label. Rebinding df
# instead of using inplace=True keeps the two sorts in one chained expression.
df = df.sort_index(axis=1).sort_index(axis=0)

# pd.IndexSlice gives slice syntax per index level. NOTE: this rebinds idx,
# which held the row MultiIndex when the frame was built.
idx = pd.IndexSlice
# Rows: Operator B or D, any Facility, Shift 2.
# Columns: any Machine type, MachineID M1 or M3.
df.loc[idx[['B','D'], :, 2],idx[:,['M1','M3']]]