## CHAPTER 5
# Getting Started with pandas
---
## Arithmetic and Functions

In [3]:
# Enable inline figures. %pylab additionally populates the interactive
# namespace with names from numpy and matplotlib.
%pylab inline

# Explicit, namespaced imports; the cells in this notebook use the np / pd / plt aliases.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Select matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Two float Series whose labels only partly overlap:
# 'a', 'c', 'e' are shared, 'd' is only in s1, 'f' and 'g' are only in s2.
s1 = pd.Series(
    data=[7.3, -2.5, 3.4, 1.5],
    index=list('acde'),
)
s2 = pd.Series(
    data=[-2.1, 3.6, -1.5, 4, 3.1],
    index=list('acefg'),
)
display(s1, s2)

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [4]:
# Label-aligned addition (method form of `s1 + s2`): labels present in only
# one of the two Series come out as NaN.
s1.add(s2)

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [4]:
# Two frames whose row labels and column labels only partly overlap.
df1 = pd.DataFrame(
    np.arange(9.0).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=list('bcd'),
)
df2 = pd.DataFrame(
    np.arange(12.0).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)
display(df1, df2)

# Addition aligns on both rows and columns; a cell is filled only when the
# (row, column) pair exists in both frames, everything else is NaN.
df1 + df2

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [6]:
# Flexible arithmetic method (siblings: sub, div, mul). With fill_value=0 a
# cell missing from just one frame is treated as 0; cells missing from both
# frames still come out as NaN.
df1.add(other=df2, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


In [5]:
display(df1)
# Reindex df1 onto the union of both frames' row and column labels, filling
# every newly introduced cell with -1.
# Index.union() is used instead of `index | index`: the `|` operator on Index
# objects was deprecated as a set operation and later became an element-wise
# logical OR, which does not work on string labels.
df1.reindex(
    index=df1.index.union(df2.index),
    columns=df1.columns.union(df2.columns),
    fill_value=-1,
)
# Alternative that keeps first-seen order instead of sorting the labels:
#   df1.index.append(df2.index).unique()

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,-1.0
Ohio,0.0,1.0,2.0,-1.0
Oregon,-1.0,-1.0,-1.0,-1.0
Texas,3.0,4.0,5.0,-1.0
Utah,-1.0,-1.0,-1.0,-1.0


In [8]:
# 4x3 integer frame with default 0..n labels on both axes.
df = pd.DataFrame(np.arange(12).reshape((4, 3)))
display(df)

# Only the last expression of the cell is rendered; the others are kept as
# reference examples of how the right-hand operand is matched against the frame.
df - [1, 2, 3]                          # 3 values: one per column, applied to every row
# df - [1, 2, 3, 4]                     # Error!! 4 values cannot be matched to 3 columns
df.sub([1, 2, 3, 4], axis='index')      # 4 values: one per row, applied to every column
df - df[0]                              # Series operand: its index is aligned with df's column labels
df - df.iloc[:1]                        # one-row frame operand: aligned on both rows and columns
df - df.mean()                          # subtract each column's mean (centres the columns)

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


Unnamed: 0,0,1,2
0,-4.5,-4.5,-4.5
1,-1.5,-1.5,-1.5
2,1.5,1.5,1.5
3,4.5,4.5,4.5


In [7]:
# Draw the demo values from a locally seeded generator so the frame (and every
# output derived from it) is identical after Restart & Run All. The global
# NumPy random state is left untouched.
frame = pd.DataFrame(np.random.RandomState(0).randn(4, 3), columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'])
display(frame)
# Element-wise absolute value; returns a new frame and leaves `frame` as is.
frame.abs()

Unnamed: 0,b,d,e
Utah,1.530059,0.911503,1.345126
Ohio,1.046738,-0.020159,1.242184
Texas,-0.244314,0.356477,-0.269382
Oregon,0.690358,-1.400469,-0.034224


Unnamed: 0,b,d,e
Utah,1.530059,0.911503,1.345126
Ohio,1.046738,0.020159,1.242184
Texas,0.244314,0.356477,0.269382
Oregon,0.690358,1.400469,0.034224


In [9]:
# Only the final expression of the cell is rendered; the earlier lines are
# kept as quick reference examples.
np.abs(frame)                                               # we can use numpy function
frame.min()                                                 # reduce down each column
frame.max(axis='columns')                                   # reduce across each row
frame.apply(lambda col: col.max() - col.min())              # per-column range
frame.apply(lambda col: col - col.mean())                   # centre every column
frame.apply(lambda row: row - row.mean(), axis='columns')   # centre every row
# Returning a labelled Series from the applied function produces one output
# row per statistic (min / max / mean) for every column.
frame.apply(
    lambda col: pd.Series([col.min(), col.max(), col.mean()],
                          index=['min', 'max', 'mean'])
)

Unnamed: 0,b,d,e
min,-0.244314,-1.400469,-0.269382
max,1.530059,0.911503,1.345126
mean,0.75571,-0.038162,0.570926


In [6]:
# Element-wise formatting of every cell to a two-decimal string (result is not
# rendered because it is not the last expression of the cell).
# NOTE(review): recent pandas releases deprecate DataFrame.applymap in favour
# of DataFrame.map - confirm against the pandas version this notebook targets.
frame.applymap(lambda value: '%0.2f' % value)
# A Series offers the element-wise counterpart under the name `map`.
frame['b'].map(np.abs)

Unnamed: 0,b,d,e
Utah,1.320338,-0.286899,0.383818
Ohio,-0.449831,0.051982,0.285775
Texas,0.115367,0.976372,-1.660707
Oregon,0.065302,0.08647,0.229033


Utah      1.320338
Ohio      0.449831
Texas     0.115367
Oregon    0.065302
Name: b, dtype: float64

In [31]:
# Only the last expression is rendered; the first two are reference examples.
frame.sort_index()                                        # order rows by their labels
frame.sort_values(by='b')                                 # order rows by the values in column 'b'
# Order the columns by the values found in the 'Oregon' row, largest first.
frame.sort_values(by='Oregon', axis=1, ascending=False)

Unnamed: 0,e,b,d
Utah,-0.227872,1.132299,0.629507
Ohio,1.745679,0.413487,-0.826987
Texas,-0.343479,0.335947,2.383627
Oregon,1.027537,-0.351751,-1.663161


In [33]:
# Small integer Series with the default 0..3 labels.
ser = pd.Series([-4, 3, 1, -2])
display(ser)
# Ascending sort by value; each value keeps its original label.
ser.sort_values(ascending=True)

0   -4
1    3
2    1
3   -2
dtype: int64

0   -4
3   -2
2    1
1    3
dtype: int64

In [10]:
# Rebind `frame` to a small two-column example for multi-key sorting.
frame = pd.DataFrame({
    'a': [0, 1, 0, 1, 0],
    'b': [1, 2, 3, 4, 5],
})
display(frame)
# Sort by 'a' first, then by 'b' within equal 'a' values ('by' option).
frame.sort_values(by=['a', 'b'])
# frame.rank(method='first') # sorted ranking. default method is 'average'

Unnamed: 0,a,b
0,0,1
1,1,2
2,0,3
3,1,4
4,0,5


Unnamed: 0,a,b
0,0,1
2,0,3
4,0,5
1,1,2
3,1,4
