# Pandas

![gif](imgs/P004.gif)

## Import

In [1]:
import pandas as pd
import numpy as np

## Indexing an axis with duplicate labels

### series

In [2]:
# Five consecutive integers on an index that deliberately repeats the
# labels 'a' and 'b'.
s = pd.Series(np.arange(5), index=['a', 'a', 'b', 'b', 'c'])
s

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [3]:
# False when any index label occurs more than once.
s.index.is_unique

False

In [4]:
# A label that occurs more than once selects every matching row, so the
# result is itself a Series.
s.loc['a']

a    0
a    1
dtype: int32

In [5]:
# Confirm the container type returned for a repeated label.
type(s.loc['a'])

pandas.core.series.Series

In [6]:
# A label that occurs exactly once yields the bare scalar instead.
s.loc['c']

4

In [7]:
# Confirm the scalar type returned for a unique label.
type(s.loc['c'])

numpy.int32

### dataframe

In [8]:
# 4 x 3 frame of standard-normal draws whose row labels 'a' and 'b' each
# appear twice, so label-based selection can return several rows.
# A seeded generator keeps the values identical on every Restart & Run All
# (re-run the notebook to refresh the stored output).
df = pd.DataFrame(
    np.random.default_rng(0).standard_normal((4, 3)),
    index=list('aabb'),
)
df

Unnamed: 0,0,1,2
a,0.713172,1.442436,0.053976
a,0.207821,0.167678,-1.005934
b,-0.340509,-2.353696,-1.422887
b,1.138666,0.514238,1.589574


In [9]:
# Both rows labelled 'a' come back, with every column kept.
df.loc['a', :]

Unnamed: 0,0,1,2
a,0.713172,1.442436,0.053976
a,0.207821,0.167678,-1.005934


## Reductions and summary statistics

### dataframe

In [10]:
# Two columns on the index a/b/c, each containing exactly one missing value,
# so the reductions below can show how NaN is treated.
df = pd.DataFrame(
    {
        'one': [1, 2, np.nan],
        'two': [8, np.nan, 6],
    },
    index=['a', 'b', 'c'],
)
df

Unnamed: 0,one,two
a,1.0,8.0
b,2.0,
c,,6.0


### sum

In [11]:
# Column totals (reduce down the rows); NaN entries are skipped.
df.sum(axis=0)

one     3.0
two    14.0
dtype: float64

In [12]:
# Row totals (reduce across the columns); NaN entries are skipped.
df.sum(axis='columns')

a    9.0
b    2.0
c    6.0
dtype: float64

### mean

In [13]:
# Mean of each column over its non-missing values.
df.mean(axis=0)

one    1.5
two    7.0
dtype: float64

In [14]:
# Mean of each row over its non-missing values.
df.mean(axis='columns')

a    4.5
b    2.0
c    6.0
dtype: float64

In [15]:
# With skipna=False a single NaN in a row makes that row's mean NaN.
df.mean(axis='columns', skipna=False)

a    4.5
b    NaN
c    NaN
dtype: float64

### idxmin, idxmax

In [16]:
# For each column, the row label holding its smallest value.
df.idxmin(axis=0)

one    a
two    c
dtype: object

In [17]:
# For each row, the column name holding its smallest value.
df.idxmin(axis='columns')

a    one
b    one
c    two
dtype: object

In [18]:
# For each column, the row label holding its largest value.
df.idxmax(axis=0)

one    b
two    a
dtype: object

In [19]:
# For each row, the column name holding its largest value.
df.idxmax(axis='columns')

a    two
b    one
c    two
dtype: object

### describe

In [20]:
# Per-column summary: count, mean, std, min, quartiles and max.
# Missing values are left out, which is why each count below is 2.
df.describe()

Unnamed: 0,one,two
count,2.0,2.0
mean,1.5,7.0
std,0.707107,1.414214
min,1.0,6.0
25%,1.25,6.5
50%,1.5,7.0
75%,1.75,7.5
max,2.0,8.0


In [21]:
# Sixteen strings: the pattern a, a, b, c repeated four times.
s = pd.Series(['a', 'a', 'b', 'c'] * 4)
s

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [22]:
# On string data describe() reports the number of entries, the number of
# distinct values, the most frequent value and how often it occurs.
s.describe()

count     16
unique     3
top        a
freq       8
dtype: object

## Correlation and covariance

### prep

In [23]:
# One label per year, '1/1/2000' through '1/1/2009', kept as plain strings
# (not parsed dates) for use as a DataFrame index.
index_list = ['1/1/{}'.format(year) for year in range(2000, 2010)]
index_list

['1/1/2000',
 '1/1/2001',
 '1/1/2002',
 '1/1/2003',
 '1/1/2004',
 '1/1/2005',
 '1/1/2006',
 '1/1/2007',
 '1/1/2008',
 '1/1/2009']

In [24]:
# 10 x 7 frame of standard-normal draws: columns A-G, rows labelled by
# index_list.  The fixed seed makes every statistic computed from this frame
# repeatable on a fresh kernel (re-run the notebook to refresh the stored
# outputs).
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((10, 7)),
    columns=list('ABCDEFG'),
    index=index_list,
)
df

Unnamed: 0,A,B,C,D,E,F,G
1/1/2000,-0.580317,-1.172205,1.824581,0.256637,0.168333,0.308234,-1.113815
1/1/2001,0.641601,-0.51893,0.241959,0.467028,0.652808,0.440662,0.580837
1/1/2002,0.362031,-0.122528,-0.401799,0.006861,0.390792,0.824773,-2.03341
1/1/2003,-0.762745,-0.212196,0.813066,-0.227799,-1.314191,-0.133755,-1.660449
1/1/2004,-2.102579,0.458301,-0.589592,-0.952404,-1.450447,-0.314001,-0.379856
1/1/2005,0.382282,-0.065583,-1.183745,0.106859,-0.075776,-0.674442,0.124049
1/1/2006,0.066749,-0.027022,0.745331,-0.087171,0.199351,-0.638848,-0.062634
1/1/2007,-1.373227,1.007323,0.105828,-0.723159,-1.302814,0.186852,0.55902
1/1/2008,0.145906,0.3259,0.807613,-0.659431,0.600862,-2.630107,-0.142168
1/1/2009,0.69645,-0.051224,-0.830457,-0.005017,-0.446735,-1.79336,-1.706787


### between two columns

In [25]:
# Correlation coefficient between columns A and B (Pearson by default).
df['A'].corr(df['B'])

-0.4103399529765193

In [26]:
# Covariance between columns A and B.
df['A'].cov(df['B'])

-0.22139354383300155

### all column pairs

In [27]:
# Correlation matrix for every pair of columns; 'pearson' is the default
# method, written out here.
df.corr(method='pearson')

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,-0.41034,-0.136363,0.729695,0.78699,-0.240763,-0.130299
B,-0.41034,1.0,-0.443092,-0.814726,-0.503886,-0.268544,0.357076
C,-0.136363,-0.443092,1.0,0.115077,0.212391,0.090765,-0.017176
D,0.729695,-0.814726,0.115077,1.0,0.615506,0.321317,-0.158027
E,0.78699,-0.503886,0.212391,0.615506,1.0,-0.120415,0.077034
F,-0.240763,-0.268544,0.090765,0.321317,-0.120415,1.0,-0.054853
G,-0.130299,0.357076,-0.017176,-0.158027,0.077034,-0.054853,1.0


In [28]:
# Covariance matrix for every pair of columns; the diagonal holds each
# column's variance.
df.cov()

Unnamed: 0,A,B,C,D,E,F,G
A,0.865311,-0.221394,-0.116816,0.311012,0.60156,-0.237962,-0.117614
B,-0.221394,0.336411,-0.236672,-0.216519,-0.240154,-0.165494,0.200968
C,-0.116816,-0.236672,0.848078,0.048558,0.160723,0.088811,-0.015348
D,0.311012,-0.216519,0.048558,0.209942,0.231742,0.156428,-0.07026
E,0.60156,-0.240154,0.160723,0.231742,0.675223,-0.105132,0.061424
F,-0.237962,-0.165494,0.088811,0.156428,-0.105132,1.128919,-0.056554
G,-0.117614,0.200968,-0.015348,-0.07026,0.061424,-0.056554,0.941586


### corrwith

In [29]:
# Correlation of every column with column A (A against itself gives 1).
df.corrwith(df['A'])

A    1.000000
B   -0.410340
C   -0.136363
D    0.729695
E    0.786990
F   -0.240763
G   -0.130299
dtype: float64

## Unique values, value counts and membership

### series

In [30]:
# Nine single-letter strings with repeats, on the default integer index.
s = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
s

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [31]:
# Distinct values, listed in order of first appearance.
s.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [32]:
# Number of occurrences of each distinct value, most frequent first.
s.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [33]:
# Same counts through the Series method: the top-level pd.value_counts
# function is deprecated in recent pandas releases.  sort=True (the default,
# spelled out here) orders the result from most to least frequent.
s.value_counts(sort=True)

c    3
a    3
b    2
d    1
dtype: int64

In [34]:
# Boolean mask: True where the element is 'b' or 'c'.
s.isin(['b', 'c'])

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

### histogram

In [35]:
# Show the current df again before counting its values per column.
df

Unnamed: 0,A,B,C,D,E,F,G
1/1/2000,-0.580317,-1.172205,1.824581,0.256637,0.168333,0.308234,-1.113815
1/1/2001,0.641601,-0.51893,0.241959,0.467028,0.652808,0.440662,0.580837
1/1/2002,0.362031,-0.122528,-0.401799,0.006861,0.390792,0.824773,-2.03341
1/1/2003,-0.762745,-0.212196,0.813066,-0.227799,-1.314191,-0.133755,-1.660449
1/1/2004,-2.102579,0.458301,-0.589592,-0.952404,-1.450447,-0.314001,-0.379856
1/1/2005,0.382282,-0.065583,-1.183745,0.106859,-0.075776,-0.674442,0.124049
1/1/2006,0.066749,-0.027022,0.745331,-0.087171,0.199351,-0.638848,-0.062634
1/1/2007,-1.373227,1.007323,0.105828,-0.723159,-1.302814,0.186852,0.55902
1/1/2008,0.145906,0.3259,0.807613,-0.659431,0.600862,-2.630107,-0.142168
1/1/2009,0.69645,-0.051224,-0.830457,-0.005017,-0.446735,-1.79336,-1.706787


In [36]:
# Count how often each value occurs in every column, then line the per-column
# counts up on the union of all values; a value absent from a column comes
# back as NaN and is replaced by 0.  pd.Series.value_counts is used because
# the top-level pd.value_counts function is deprecated in recent pandas.
df.apply(pd.Series.value_counts).fillna(0).head(15)

Unnamed: 0,A,B,C,D,E,F,G
-2.630107,0.0,0.0,0.0,0.0,0.0,1.0,0.0
-2.102579,1.0,0.0,0.0,0.0,0.0,0.0,0.0
-2.03341,0.0,0.0,0.0,0.0,0.0,0.0,1.0
-1.79336,0.0,0.0,0.0,0.0,0.0,1.0,0.0
-1.706787,0.0,0.0,0.0,0.0,0.0,0.0,1.0
-1.660449,0.0,0.0,0.0,0.0,0.0,0.0,1.0
-1.450447,0.0,0.0,0.0,0.0,1.0,0.0,0.0
-1.373227,1.0,0.0,0.0,0.0,0.0,0.0,0.0
-1.314191,0.0,0.0,0.0,0.0,1.0,0.0,0.0
-1.302814,0.0,0.0,0.0,0.0,1.0,0.0,0.0
