# Pandas

![gif](imgs/P005.gif)

## Import

In [1]:
import pandas as pd
import numpy as np

## NaN data formatting

### series

In [2]:
# Example Series of strings with two missing entries (np.nan) at positions 3 and 5.
s = pd.Series(['a', 'b', 'c', np.nan, 'e', np.nan])
s

0      a
1      b
2      c
3    NaN
4      e
5    NaN
dtype: object

### np.nan

In [3]:
# Boolean mask: True wherever the entry is missing (the np.nan positions).
s.isnull()

0    False
1    False
2    False
3     True
4    False
5     True
dtype: bool

In [4]:
# isna() produces the same mask as isnull().
s.isna()

0    False
1    False
2    False
3     True
4    False
5     True
dtype: bool

### None

In [5]:
# Assigning to the new label 6 appends an entry; in this object Series it is
# displayed as None rather than NaN. Note: this modifies s in place.
s[6] = None
s

0       a
1       b
2       c
3     NaN
4       e
5     NaN
6    None
dtype: object

In [6]:
# None is reported as missing as well (label 6 is True).
s.isnull()

0    False
1    False
2    False
3     True
4    False
5     True
6     True
dtype: bool

In [7]:
# Same result with isna(): both NaN and None count as missing.
s.isna()

0    False
1    False
2    False
3     True
4    False
5     True
6     True
dtype: bool

### dropna

In [8]:
# Drop every missing entry (NaN and None); the surviving entries keep their labels.
s.dropna()

0    a
1    b
2    c
4    e
dtype: object

In [9]:
# Same result as dropna(), written as boolean-mask selection of the non-missing entries.
s[s.notnull()]

0    a
1    b
2    c
4    e
dtype: object

### fillna

In [10]:
# Replace every missing entry (NaN and None) with the given value.
s.fillna('good')

0       a
1       b
2       c
3    good
4       e
5    good
6    good
dtype: object

### dataframe

In [11]:
# 4x3 example frame: row 0 is complete, rows 1 and 3 are partly missing,
# and row 2 is entirely NaN.
df = pd.DataFrame(
    [
        [1, 5, 3],
        [1, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 7, 4],
    ],
    columns=['a', 'n', 'd'],
)
df

Unnamed: 0,a,n,d
0,1.0,5.0,3.0
1,1.0,,
2,,,
3,,7.0,4.0


In [12]:
# With the default settings, any row containing at least one NaN is dropped,
# so only the fully populated row 0 remains.
df.dropna()

Unnamed: 0,a,n,d
0,1.0,5.0,3.0


In [13]:
# how='all' drops only the rows in which every value is NaN (row 2 here).
df.dropna(how='all')

Unnamed: 0,a,n,d
0,1.0,5.0,3.0
1,1.0,,
3,,7.0,4.0


In [14]:
# axis=1 applies the rule to columns: every column contains a NaN,
# so all of them are dropped and only the row index is left.
df.dropna(axis=1)

0
1
2
3


In [15]:
# No column is entirely NaN, so nothing is dropped.
df.dropna(axis=1, how='all')

Unnamed: 0,a,n,d
0,1.0,5.0,3.0
1,1.0,,
2,,,
3,,7.0,4.0


### another dataframe

In [16]:
# 7x3 frame of standard-normal samples used by the examples that follow.
# The global NumPy generator is seeded so that Restart & Run All yields the
# same values every time (re-run the notebook to refresh the stored outputs).
np.random.seed(0)
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,0.13021,-0.628638,-0.00485
1,-0.880094,-0.116086,-0.452721
2,0.045085,-0.996998,-0.531331
3,-1.315667,-1.596334,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


In [17]:
# Positionally blank out the first four rows of column 1 and the first two
# rows of column 2. Note: this modifies df in place.
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.13021,,
1,-0.880094,,
2,0.045085,,-0.531331
3,-1.315667,,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


In [18]:
# thresh=2 keeps only the rows with at least two non-NaN values
# (rows 0 and 1 have just one, so they are dropped).
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.045085,,-0.531331
3,-1.315667,,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


In [19]:
# With three columns, thresh=3 keeps only the fully populated rows.
df.dropna(thresh=3)

Unnamed: 0,0,1,2
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


## Filling in NA data

In [20]:
# Starting point for the fill examples: NaNs in columns 1 and 2.
df

Unnamed: 0,0,1,2
0,0.13021,,
1,-0.880094,,
2,0.045085,,-0.531331
3,-1.315667,,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


In [21]:
# Replace every NaN with the constant 0.
df.fillna(0)

Unnamed: 0,0,1,2
0,0.13021,0.0,0.0
1,-0.880094,0.0,0.0
2,0.045085,0.0,-0.531331
3,-1.315667,0.0,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


In [22]:
# Per-column fill values: keys are column labels, values are the replacements
# (column 1 -> 0.4, column 2 -> 0.8).
d = {1:0.4, 2:0.8}

df.fillna(d)

Unnamed: 0,0,1,2
0,0.13021,0.4,0.8
1,-0.880094,0.4,0.8
2,0.045085,0.4,-0.531331
3,-1.315667,0.4,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


### methods

In [23]:
# The fillna() calls above left df unchanged: the NaNs are still present.
df

Unnamed: 0,0,1,2
0,0.13021,,
1,-0.880094,,
2,0.045085,,-0.531331
3,-1.315667,,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


In [24]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() is used instead of fillna(method='bfill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
df.bfill()

Unnamed: 0,0,1,2
0,0.13021,0.370977,-0.531331
1,-0.880094,0.370977,-0.531331
2,0.045085,0.370977,-0.531331
3,-1.315667,0.370977,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


In [25]:
# Backward fill, but fill at most 2 consecutive NaNs per gap: in column 1 only
# the two NaNs closest to the first valid value are filled.
# DataFrame.bfill() is used instead of fillna(method='bfill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
df.bfill(limit=2)

Unnamed: 0,0,1,2
0,0.13021,,-0.531331
1,-0.880094,,-0.531331
2,0.045085,0.370977,-0.531331
3,-1.315667,0.370977,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


### mean

In [26]:
# The fill calls above left df unchanged; it still contains the NaNs.
df

Unnamed: 0,0,1,2
0,0.13021,,
1,-0.880094,,
2,0.045085,,-0.531331
3,-1.315667,,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


In [27]:
# df.mean() yields one mean per column (computed over the non-NaN values);
# passing it to fillna fills each column's NaNs with that column's mean.
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.13021,0.239377,-0.23355
1,-0.880094,0.239377,-0.23355
2,0.045085,0.239377,-0.531331
3,-1.315667,0.239377,0.484287
4,-0.459284,0.370977,-1.041848
5,-1.405799,-0.531334,0.652833
6,0.647806,0.878489,-0.731691


## Multi-indexing

### series

In [28]:
# Series with a two-level index built from two parallel lists:
# outer level 'a'..'d', inner level 1..3 (not every combination is present).
# The global NumPy generator is seeded so that Restart & Run All yields the
# same values every time (re-run the notebook to refresh the stored outputs).
np.random.seed(0)
s = pd.Series(np.random.randn(10), index=[list('aaabbbccdd'), [1,2,3,1,2,3,1,2,2,3]])
s

a  1    0.458635
   2   -1.386975
   3   -0.376142
b  1    0.308387
   2    1.273407
   3    3.117362
c  1   -0.600147
   2   -1.857547
d  2   -1.629218
   3    0.338274
dtype: float64

In [29]:
# The two lists passed as `index` became a two-level MultiIndex.
s.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [30]:
# Selecting an outer-level label returns its rows, indexed by the inner level only.
s['a']

1    0.458635
2   -1.386975
3   -0.376142
dtype: float64

In [31]:
# Label slice on the outer level; both endpoints ('a' and 'c') are included.
s['a':'c']

a  1    0.458635
   2   -1.386975
   3   -0.376142
b  1    0.308387
   2    1.273407
   3    3.117362
c  1   -0.600147
   2   -1.857547
dtype: float64

In [32]:
# A list of outer-level labels selects just those groups.
s.loc[['a', 'd']]

a  1    0.458635
   2   -1.386975
   3   -0.376142
d  2   -1.629218
   3    0.338274
dtype: float64

In [33]:
# Select the entries whose inner-level label is 2; the result is indexed by the outer level.
s[:, 2]

a   -1.386975
b    1.273407
c   -1.857547
d   -1.629218
dtype: float64

In [34]:
# Pivot the inner index level into columns; (outer, inner) pairs that do not
# exist in s show up as NaN.
s.unstack()

Unnamed: 0,1,2,3
a,0.458635,-1.386975,-0.376142
b,0.308387,1.273407,3.117362
c,-0.600147,-1.857547,
d,,-1.629218,0.338274


In [35]:
# stack() is the inverse of unstack(); in the output shown here the NaN cells
# are dropped again, giving back the original Series.
s.unstack().stack()

a  1    0.458635
   2   -1.386975
   3   -0.376142
b  1    0.308387
   2    1.273407
   3    3.117362
c  1   -0.600147
   2   -1.857547
d  2   -1.629218
   3    0.338274
dtype: float64

### dataframe

In [36]:
# 4x3 integer frame with a two-level row index and two-level column labels.
df = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=[['a', 'a', 'b', 'b'], ['1', '2', '1', '2']],
    columns=[['O', 'O', 'L'], ['G', 'R', 'G']],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,O,O,L
Unnamed: 0_level_1,Unnamed: 1_level_1,G,R,G
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [37]:
# Name the row-index levels and the column levels so later calls can refer to
# them by name. Note: this modifies df in place.
df.index.names = ['key1', 'key2']
df.columns.names = ['state', 'color']
df

Unnamed: 0_level_0,state,O,O,L
Unnamed: 0_level_1,color,G,R,G
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


## Reordering levels and sorting

In [38]:
# Current frame, with named index levels (key1, key2) and column levels (state, color).
df

Unnamed: 0_level_0,state,O,O,L
Unnamed: 0_level_1,color,G,R,G
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [39]:
# Swap the two row-index levels; the rows stay in the same order, only the
# level order changes.
df.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,O,O,L
Unnamed: 0_level_1,color,G,R,G
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [40]:
# axis=1 swaps the column levels instead: 'color' becomes the outer level.
df.swaplevel('color', 'state', axis=1)

Unnamed: 0_level_0,color,G,R,G
Unnamed: 0_level_1,state,O,O,L
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [41]:
# Swap the row levels, then sort the rows by the resulting (key2, key1) index.
# `axis` is passed by keyword: sort_index() no longer accepts positional
# arguments in pandas 2.x, so sort_index(0) raises a TypeError there.
df.swaplevel('key1', 'key2').sort_index(axis=0)

Unnamed: 0_level_0,state,O,O,L
Unnamed: 0_level_1,color,G,R,G
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


## Summary statistics by level

In [42]:
# The swaplevel()/sort_index() calls above left df unchanged; it keeps its
# original level order.
df

Unnamed: 0_level_0,state,O,O,L
Unnamed: 0_level_1,color,G,R,G
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [43]:
# Sum the rows that share the same 'key2' label.
# groupby(level=...).sum() replaces sum(level=...): the `level` argument of
# the reduction methods was deprecated and then removed in pandas 2.0.
df.groupby(level='key2').sum()

state,O,O,L
color,G,R,G
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [44]:
# Sum the columns that share the same 'color' label, row by row.
# sum(level=..., axis=1) was removed in pandas 2.0. The frame is transposed so
# the column levels can be grouped as an index level, then transposed back;
# this avoids groupby(axis=1), which is deprecated in recent pandas releases.
df.T.groupby(level='color').sum().T

Unnamed: 0_level_0,color,G,R
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10
