_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 5 - Getting Started with pandas
### Part 2 - Essential Functionality

In [1]:
import pandas as pd
import numpy as np

Reindexing

In [2]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2)

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


In [3]:
# reindexing with interpolation - filling in between

obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
obj4 = obj3.reindex(range(6), method='ffill')
print(obj4)

0      blue
2    purple
4    yellow
dtype: object
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object


In [4]:
# reindexing on dataframe

frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California'])
print(frame)

frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0


In [5]:
# reindexing on dataframe by columns
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [6]:
# alt reindexing by loc (deprecated)
frame.loc[['a', 'b', 'c', 'd'], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


Dropping Entries from an Axis

In [7]:
# Dropping Entries from a Series

obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
print(obj)
print(obj.drop('c'))
print(obj.drop(['d', 'c']))

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
e    4.0
dtype: float64


In [8]:
# Dropping Entries from a DataFrame
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
print(data)
print(data.drop(['Colorado', 'Ohio']))
print(data.drop('two', axis=1))
print(data.drop(['two', 'four'], axis='columns'))

# inplace drop
data.drop('New York', inplace=True)
print(data)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15
          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11


Indexing, Selection, and Filtering

In [9]:
# Series indexed by value or proper index

obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj)
print(obj['b'], obj[1])
print(obj[2:4])

# notice the difference: here 'end' is included!
print(obj['b':'d'])

print(obj[['b', 'a', 'd']])
print(obj[[1, 3]])
print(obj[obj < 2])

#selected elements could be set:
obj['b':'c'] = 5
print(obj)

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
1.0 1.0
c    2.0
d    3.0
dtype: float64
b    1.0
c    2.0
d    3.0
dtype: float64
b    1.0
a    0.0
d    3.0
dtype: float64
b    1.0
d    3.0
dtype: float64
a    0.0
b    1.0
dtype: float64
a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64


In [10]:
print(data)
print('Getting a column:')
print(data['two'])
print('Getting more columns')
print(data[['three', 'one']])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
Getting a column:
Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32
Getting more columns
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8


In [11]:
print('Getting a few rows:')
print(data[:2])

Getting a few rows:
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7


In [12]:
# selecting rows by boolean check
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


In [13]:
# scalar comparison
print(data < 5)
print('---')
data[data < 5] = 0
print(data)

            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
---
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11


In [14]:
# loc to select a single row and multiple columns

data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [15]:
#iloc to select a single row and multiple columns
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [16]:
# iloc to get a row
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [17]:
# iloc to get rows-cols values
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [18]:
# slicing, all rows up to Utah - col two
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [19]:
# all rows, cols up to third, then bool check on three
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10


### Integer Indexes

In [20]:
ser = pd.Series(np.arange(3.))
print(ser)
# ser[-1] # KeyError!

0    0.0
1    1.0
2    2.0
dtype: float64


In [21]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
print(ser2)
print(ser2['a'], ser2[1], ser2[-1]) # -1 here is not ambiguous

a    0.0
b    1.0
c    2.0
dtype: float64
0.0 1.0 2.0


In [22]:
# loc and iloc are more precise
print(ser[:1])
print(ser.loc[:1])
print(ser.iloc[:1])

0    0.0
dtype: float64
0    0.0
1    1.0
dtype: float64
0    0.0
dtype: float64


### Arithmetic and Data Alignment

In [23]:
# Series

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
print(s1)
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print(s2)
print('Adding up two series leads to something like automatic outer join - w/ NaN')
print(s1+s2)

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64
Adding up two series leads to something like automatic outer join - w/ NaN
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64


In [24]:
# DataFrame
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)),
                   columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
print(df1)
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                   columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df2)
print('Same behavior on df, index and columns are union, NaN for missing value on join')
print(df1+df2)

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
Same behavior on df, index and columns are union, NaN for missing value on join
            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


In [25]:
df1 = pd.DataFrame({'A': [1, 2]})
print(df1)
df2 = pd.DataFrame({'B': [3, 4]})
print(df2)
print('No common elements, leads to a NaN table')
print(df1 - df2)

   A
0  1
1  2
   B
0  3
1  4
No common elements, leads to a NaN table
    A   B
0 NaN NaN
1 NaN NaN


Arithmetic methods with fill values

In [26]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
print(df1)
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
print(df2)
print('Plain add leads to NaN')
print(df1+df2)
print('Using add method makes available the fill_value argument')
print(df1.add(df2, fill_value=0))

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
Plain add leads to NaN
      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN
Using add method makes available the fill_value argument
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0


methods for arithmetic - starting with "r" for flipping arguments

In [27]:
print(1 / df1)
print(df1.rdiv(1))

          a         b         c         d
0       inf  1.000000  0.500000  0.333333
1  0.250000  0.200000  0.166667  0.142857
2  0.125000  0.111111  0.100000  0.090909
          a         b         c         d
0       inf  1.000000  0.500000  0.333333
1  0.250000  0.200000  0.166667  0.142857
2  0.125000  0.111111  0.100000  0.090909


Operations between DataFrame and Series

In [28]:
# difference between a two-dimensional array and one of its rows - broadcasting
arr = np.arange(12.).reshape((3, 4))
print(arr)
print(arr - arr[0])

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]
[[0. 0. 0. 0.]
 [4. 4. 4. 4.]
 [8. 8. 8. 8.]]


In [29]:
# same when working with a series and a dataframe
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
print(frame)
print(frame - series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
          b    d    e
Utah    0.0  0.0  0.0
Ohio    3.0  3.0  3.0
Texas   6.0  6.0  6.0
Oregon  9.0  9.0  9.0


In [30]:
s2 = pd.Series(range(3), index=['b', 'e', 'f'])
print(s2)
print(frame+s2)

b    0
e    1
f    2
dtype: int64
          b   d     e   f
Utah    0.0 NaN   3.0 NaN
Ohio    3.0 NaN   6.0 NaN
Texas   6.0 NaN   9.0 NaN
Oregon  9.0 NaN  12.0 NaN


Broadcast over columns

In [31]:
s3 = frame['d']
print(s3)
print(frame.sub(s3, axis='index'))
print(frame.sub(s3, axis=0))

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64
          b    d    e
Utah   -1.0  0.0  1.0
Ohio   -1.0  0.0  1.0
Texas  -1.0  0.0  1.0
Oregon -1.0  0.0  1.0
          b    d    e
Utah   -1.0  0.0  1.0
Ohio   -1.0  0.0  1.0
Texas  -1.0  0.0  1.0
Oregon -1.0  0.0  1.0


## Function Application and Mapping

In [32]:
# universal function on pandas objects

frame = pd.DataFrame(np.random.randn(4, 3),
                     columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print(np.abs(frame))

               b         d         e
Utah   -0.045957  1.378435  0.376536
Ohio   -2.063078 -0.273716 -0.112475
Texas   0.212767  0.015502  0.164772
Oregon  0.188643 -0.595074  0.077918
               b         d         e
Utah    0.045957  1.378435  0.376536
Ohio    2.063078  0.273716  0.112475
Texas   0.212767  0.015502  0.164772
Oregon  0.188643  0.595074  0.077918


In [33]:
# apply a function to each column or row
delta = lambda x: x.max() - x.min()

print(frame.apply(delta))
print('---')
print(frame.apply(delta, axis='columns'))

b    2.275845
d    1.973509
e    0.489011
dtype: float64
---
Utah      1.424393
Ohio      1.950603
Texas     0.197265
Oregon    0.783717
dtype: float64


In [34]:
# applied function could return a series
delta = lambda x: pd.Series([x.min(), x.max()], index=['min', 'max'])
print(frame.apply(delta))

            b         d         e
min -2.063078 -0.595074 -0.112475
max  0.212767  1.378435  0.376536


In [35]:
# use applymap to format data
formatter = lambda x: '{:.2f}'.format(x)
print(frame.applymap(formatter))

            b      d      e
Utah    -0.05   1.38   0.38
Ohio    -2.06  -0.27  -0.11
Texas    0.21   0.02   0.16
Oregon   0.19  -0.60   0.08


In [37]:
# on Series, map() does the trick
frame['e'].map(formatter)

Utah       0.38
Ohio      -0.11
Texas      0.16
Oregon     0.08
Name: e, dtype: object

## Sorting and ranking

In [38]:
# sort index for a series

obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
print(obj)
print(obj.sort_index())

d    0
a    1
b    2
c    3
dtype: int64
a    1
b    2
c    3
d    0
dtype: int64


In [39]:
# sort dataframe index or column names
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
print(frame)
print(frame.sort_index())
print(frame.sort_index(axis=1))
print(frame.sort_index(axis=1, ascending=False))

       d  a  b  c
three  0  1  2  3
one    4  5  6  7
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
       a  b  c  d
three  1  2  3  0
one    5  6  7  4
       d  c  b  a
three  0  3  2  1
one    4  7  6  5


In [40]:
# sort by values
obj = pd.Series([4, 7, -3, 2])
print(obj)
print(obj.sort_values())
print('Missing values by default at the end')
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
print(obj)
print(obj.sort_values())
print(obj.sort_values(na_position='first'))

0    4
1    7
2   -3
3    2
dtype: int64
2   -3
3    2
0    4
1    7
dtype: int64
Missing values by default at the end
0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64
1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64


In [41]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))
print(frame.sort_values(by=['a', 'b']))

   a  b
0  0  4
1  1  7
2  0 -3
3  1  2
   a  b
2  0 -3
3  1  2
0  0  4
1  1  7
   a  b
2  0 -3
0  0  4
3  1  2
1  1  7


In [42]:
# ranking
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj)
print(obj.rank())
print(obj.rank(method='first'))
print(obj.rank(method='max'))
print(obj.rank(ascending=False, method='max'))

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
0    7.0
1    1.0
2    7.0
3    5.0
4    3.0
5    2.0
6    5.0
dtype: float64
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64


In [43]:
frame = pd.DataFrame(
    {'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis='columns'))
print(frame.rank())

   a    b    c
0  0  4.3 -2.0
1  1  7.0  5.0
2  0 -3.0  8.0
3  1  2.0 -2.5
     a    b    c
0  2.0  3.0  1.0
1  1.0  3.0  2.0
2  2.0  1.0  3.0
3  2.0  3.0  1.0
     a    b    c
0  1.5  3.0  2.0
1  3.5  4.0  3.0
2  1.5  1.0  4.0
3  3.5  2.0  1.0


## Axis Indexes with Duplicate Labels

In [44]:
# a Series could have duplicate indices
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj)
print(obj.index.is_unique)
print(obj['a'])
print(obj['c'])

a    0
a    1
b    2
b    3
c    4
dtype: int64
False
a    0
a    1
dtype: int64
4


In [None]:
# same for dataframe
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
print(df)
print(df.loc['b'])

## Summarizing and Computing Descriptive Statistics

In [45]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [46]:
# DataFrame.sum() returns a Series containing column sums
df.sum()

one    9.25
two   -5.80
dtype: float64

In [52]:
# sum on columns
print(df.sum(axis='columns'))
print(df.sum(axis=1))

# default is index
print(df.sum(axis=0))
print(df.sum(axis='index'))

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
one    9.25
two   -5.80
dtype: float64
one    9.25
two   -5.80
dtype: float64


In [53]:
# By default, NA values are excluded unless the entire slice is NA
print(df.mean(axis='columns'))
print(df.mean(axis='columns', skipna=False))

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64


In [56]:
print(df.idxmax())
print(df.idxmin())

one    b
two    d
dtype: object
one    d
two    b
dtype: object


In [63]:
# comulative sum
print(df)
print(df.cumsum())
print(df.cumsum(axis='columns'))

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8
    one   two
a  1.40   NaN
b  7.10  2.60
c   NaN   NaN
d  0.75 -0.55


In [67]:
print(df.describe())
print('---')
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
print(obj)
print(obj.describe())

            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000
---
0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object
count     16
unique     3
top        a
freq       8
dtype: object


## Unique Values, Value Counts, and Membership

In [80]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
print(uniques)
uniques.sort()
print(uniques)
print(obj.value_counts())
print(pd.value_counts(obj.values, sort=False))
print(pd.value_counts(obj.values))

['c' 'a' 'd' 'b']
['a' 'b' 'c' 'd']
a    3
c    3
b    2
d    1
dtype: int64
c    3
d    1
a    3
b    2
dtype: int64
a    3
c    3
b    2
d    1
dtype: int64


In [84]:
print(obj)
mask = obj.isin(['b', 'c'])
print(mask)
print(obj[mask])

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool
0    c
5    b
6    b
7    c
8    c
dtype: object


In [85]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [90]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4], 
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
print(data)

result = data.apply(pd.value_counts).fillna(0)
print(result)

   Qu1  Qu2  Qu3
0    1    2    1
1    3    3    5
2    4    1    2
3    3    2    4
4    4    3    4
   Qu1  Qu2  Qu3
1  1.0  1.0  1.0
2  0.0  2.0  1.0
3  2.0  2.0  0.0
4  2.0  0.0  2.0
5  0.0  0.0  1.0
