# 5 Pandas

## 5.1 Pandas Data Structures

### 5.1.1 Series

In [8]:
import pandas as pd
import numpy as np

In [9]:
# A Series built from a plain list gets the default integer index 0..n-1
ser1 = pd.Series(data=[4, 7, -5, 3])
ser1

0    4
1    7
2   -5
3    3
dtype: int64

In [10]:
# A Series built from a dict: the keys become the index labels
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
ser2 = pd.Series(data=sdata)
ser2

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [11]:
# the dict keys of sdata serve as the index labels of ser2
ser2.index

Index(['Ohio', 'Oregon', 'Texas', 'Utah'], dtype='object')

### 5.1.2 DataFrame

In [12]:
# simple DataFrame: each dict key becomes a column, each list its values
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame1 = pd.DataFrame(data=data)
frame1

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [13]:
# nested dict of dicts: outer keys become columns, inner keys become row labels;
# a row label missing from one inner dict (2000 for Nevada) yields NaN there
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame2 = pd.DataFrame(data=pop)
frame2

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [14]:
# DataFrame from a 2-D ndarray, with explicit row and column labels
frame3 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('abc'),
    columns=['Ohio', 'Texas', 'California'],
)
frame3

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [15]:
# add and modify columns
# Rebuild frame1 from `data` with an explicit column order and row labels.
# 'debt' is not a key of `data`, so that column starts out with missing values.
frame1 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])

# Overwrite the placeholder column with one float per row (0.0 .. 5.0).
# Only the last expression of a cell is displayed, so the frame is shown
# once, after the column has been filled.
frame1['debt'] = np.arange(6.)
frame1

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [16]:
# transpose DataFrame: row labels become columns and column names become rows
frame1.T

Unnamed: 0,one,two,three,four,five,six
year,2000,2001,2002,2001,2002,2003
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9,3.2
debt,0,1,2,3,4,5


## 5.2 Essential Functionality

### 5.2.1 Reindex

In [17]:
# reindex with .reindex
# ser3 has gaps in its index: labels 1 and 4 are absent
ser3 = pd.Series([4.5, 7.3, 6.5, 4.6], index=[0, 2, 3, 5])
print(ser3)

# conform to labels 0..5; method='ffill' gives each new label the value of
# the closest preceding label
ser4 = ser3.reindex(index=[0, 1, 2, 3, 4, 5], method='ffill')
ser4

0    4.5
2    7.3
3    6.5
5    4.6
dtype: float64


0    4.5
1    4.5
2    7.3
3    6.5
4    6.5
5    4.6
dtype: float64

### 5.2.2 Dropping Entries from an Axis

In [18]:
# drop a column of the DataFrame (axis=1 selects columns);
# the result is a new object, frame1 itself keeps its 'year' column
frame1.drop('year',axis=1)

Unnamed: 0,state,pop,debt
one,Ohio,1.5,0.0
two,Ohio,1.7,1.0
three,Ohio,3.6,2.0
four,Nevada,2.4,3.0
five,Nevada,2.9,4.0
six,Nevada,3.2,5.0


### 5.2.3 Indexing, Selection and Filtering

In [19]:
# select several columns at once by passing a list of column names
frame1[['state','pop']]

Unnamed: 0,state,pop
one,Ohio,1.5
two,Ohio,1.7
three,Ohio,3.6
four,Nevada,2.4
five,Nevada,2.9
six,Nevada,3.2


In [20]:
# slicing with [:3] selects rows (here the first three), not columns
frame1[:3]

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0


In [21]:
# modify values of DataFrame with a boolean mask
# NOTE: the assignment changes frame3 in place, so any earlier display of
# frame3 no longer matches its current contents
frame3[frame3.lt(3)] = 0
frame3

Unnamed: 0,Ohio,Texas,California
a,0,0,0
b,3,4,5
c,6,7,8


### 5.2.4 Arithmetic and Data Alignment

In [22]:
# construct two DataFrames of different shapes for the arithmetic examples:
# frame4 is 3x4 (columns a-d), frame5 is 4x5 (columns a-e)
frame4 = pd.DataFrame(
    np.arange(12.).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)
frame5 = pd.DataFrame(
    np.arange(20.).reshape(4, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)
print(frame4)
print(frame5)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   6.0   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [23]:
# labels present in only one of the two frames (column 'e', row 3) yield NaN
frame4+frame5

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [24]:
# fill_value=0 stands in for the value missing from one frame, so column 'e'
# and row 3 carry frame5's values instead of NaN
frame4.add(frame5,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


### 5.2.5 Function Application and Mapping

In [25]:
# differences per column
def f(x):
    """Return the spread (maximum minus minimum) of the values in x."""
    return x.max() - x.min()

# with the default axis, apply calls f once per column
frame4.apply(f)

a    8.0
b    8.0
c    8.0
d    8.0
dtype: float64

In [26]:
# differences per row: axis='columns' makes apply call f once per row
frame4.apply(f,axis='columns')

0    3.0
1    3.0
2    3.0
dtype: float64

In [27]:
# format every element as a string with two decimal places
def to_2dp(x):
    """Render x as a string with two digits after the decimal point.

    Deliberately not named `format`: binding that name at cell level would
    shadow the built-in format() for every cell run afterwards.
    """
    return '%.2f' % x

# applymap calls the function on each individual element of the frame
# NOTE(review): pandas >= 2.1 deprecates applymap in favor of DataFrame.map
frame4.applymap(to_2dp)

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


## 5.3 Summarizing and Computing Descriptive Statistics

In [30]:
# describe method: count, mean, std, min, quartiles and max for each column
frame4.describe()

Unnamed: 0,a,b,c,d
count,3.0,3.0,3.0,3.0
mean,4.0,5.0,6.0,7.0
std,4.0,4.0,4.0,4.0
min,0.0,1.0,2.0,3.0
25%,2.0,3.0,4.0,5.0
50%,4.0,5.0,6.0,7.0
75%,6.0,7.0,8.0,9.0
max,8.0,9.0,10.0,11.0


In [33]:
# correlation matrix between the columns of frame5
# (every entry is 1.0 here because each column equals another column plus a constant)
frame5.corr()

Unnamed: 0,a,b,c,d,e
a,1.0,1.0,1.0,1.0,1.0
b,1.0,1.0,1.0,1.0,1.0
c,1.0,1.0,1.0,1.0,1.0
d,1.0,1.0,1.0,1.0,1.0
e,1.0,1.0,1.0,1.0,1.0


In [34]:
# covariance matrix between the columns of frame5
frame5.cov()

Unnamed: 0,a,b,c,d,e
a,41.666667,41.666667,41.666667,41.666667,41.666667
b,41.666667,41.666667,41.666667,41.666667,41.666667
c,41.666667,41.666667,41.666667,41.666667,41.666667
d,41.666667,41.666667,41.666667,41.666667,41.666667
e,41.666667,41.666667,41.666667,41.666667,41.666667
