<div class="licence">
<span>Licence CC BY-NC-ND</span>
<span>Thierry Parmentelat</span>
</div>

In [None]:
# `plan` is not a pandas module
# NOTE(review): presumably a course-local helper — what plan_extras("pandas")
# actually does is not visible from this notebook; confirm
from plan import plan_extras; plan_extras("pandas")

# pandas

from `Python for data analysis`, Wes McKinney

# objectives

* data structures with labeled axes
  * automatic or explicit data alignment
* integrated time series functionality
* same data structure for time series or non-time series
* arithmetic operations and reductions (on a whole column)
* flexible handling of missing data
* merge and other relation operations as found in popular db systems, e.g. sql-based

In [None]:
import pandas as pd

In [None]:
from pandas import Series, DataFrame

# `Series`

a `Series` corresponds roughly to a column (values) in an Excel spreadsheet, with names (the index) attached to its rows

In [None]:
# with no explicit index, rows get the default integer labels 0, 1, 2, ...
ser = Series(data=[4, 7, -5, 3])
ser

In [None]:
# the data alone, without the index
ser.values

In [None]:
# the row labels alone; converted to a list to see them one by one
list(ser.index)

In [None]:
# same values, this time with an explicit label attached to each row
ser2 = Series(data=[4, 7, -5, 3], index=list('dbac'))
ser2

In [None]:
# look up a single value by its label
ser2['a']

In [None]:
# assigning through a label overwrites the value stored under that label
ser2['d'] = 6
ser2

In [None]:
# indexing with a list of labels extracts several rows at once,
# in the order given in the list
ser2[ ['c', 'a', 'b'] ]

In [None]:
# numpy-style operations: `ser2 > 0` is a boolean mask,
# used here to keep only the rows with a positive value
ser2[ser2 > 0]

In [None]:
# element-wise arithmetic, numpy-style
ser2 * 2

In [None]:
# looking for keys (row labels): `in` searches the index, not the values
'b' in ser2

In [None]:
# 'e' is not one of the labels given when ser2 was created
'e' in ser2

In [None]:
# a plain python dict can seed a Series:
# its keys become the index, its values the data
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)

ser3 = Series(data=sdata)
ser3

In [None]:
# when both a dict and an index are given, the index decides what is kept:
# - dict keys absent from the index are discarded (here 'Utah')
# - index labels absent from the dict hold undefined data (here 'California')
states = 'California Ohio Oregon Texas'.split()

ser4 = Series(data=sdata, index=states)
ser4

In [None]:
# boolean mask telling, row by row, whether the value is missing
ser4.isnull()

### combining 2 `Series` (here, addition)

a notion called **data alignment**

In [None]:
# first operand: the Series built from the dict alone
ser3

In [None]:
# second operand: same dict, but with an explicit list of states as index
ser4

In [None]:
# values are paired by label, not by position;
# a label that is missing (or undefined) on either side yields NaN
ser3 + ser4

### the `name` attribute

In [None]:
# both the Series itself and its index can carry a name
ser4.name = 'population'
ser4.index.name = 'state'

In [None]:
# display again, now that the names are set
ser4

### altering the index in place

In [None]:
# starting point: the Series as it stands before touching its index
ser

In [None]:
# replace all the row labels at once by assigning to the `index` attribute;
# the new list must have as many entries as the Series has rows
ser.index = ['Bob', 'Steve', 'Jeff', 'Ryan']

In [None]:
# display again to check the row labels
ser

# `DataFrame`

a data frame corresponds roughly to a full spreadsheet, where:

* cells can be accessed by row or by column
* 2 dimensions mostly symmetrical

In [None]:
# column-oriented input: one key per column, each holding that column's values
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)

In [None]:
# providing data as a dict: each key becomes a column
frame = DataFrame(data)
frame

In [None]:
# ordering the columns: `columns` fixes the order in which they appear
DataFrame(data, columns=['year', 'state', 'pop'])

In [None]:
# 'debt' is not a key of `data`: naming it in `columns` still makes room for it;
# rows are named through `index`, just like with a Series
frame2 = DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'state', 'pop', 'debt'],
)

In [None]:
# display the frame: named rows, and a 'debt' column with no data yet
frame2

### retrieving columns

one can retrieve each column as a `Series` object; note that its `name` attribute is set properly.

In [None]:
# accessing a column by its name, dict-style;
# this returns a Series
frame2['state']

In [None]:
# ditto through an attribute
# (only possible when the column name is a valid python identifier)
frame2.year

In [None]:
# all series share the same index
# NOTE(review): `is` tests object identity, not equality; whether pandas hands
# out the very same Index object for each column is an implementation detail —
# confirm the result on the targeted pandas version
frame2['state'].index is frame2['pop'].index

### retrieving rows

this also returns a `Series` !

In [None]:
# `.loc` selects a row by its label
frame2.loc['three']

In [None]:
# check the type of the object returned for a row
type(frame2.loc['three'])

### modifying columns by assignment

In [None]:
import numpy as np

In [None]:
# this uses numpy broadcasting:
# the scalar is applied to every row of the column
frame2['debt'] = 16.5
frame2

In [None]:
# from a numpy array (one value per row): 5 integers drawn in [0, 10);
# the generator is seeded so that the displayed numbers are the same on every run
frame2['debt'] = np.random.default_rng(seed=0).integers(0, 10, size=5)
frame2

In [None]:
# ditto from a Series: values are matched on the row labels;
# rows of frame2 that newdebt does not mention ('one', 'three')
# will be marked as NaN
newdebt = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = newdebt
frame2

### creating a new column

In [None]:
# just assign as if the column already existed;
# the right-hand side is a boolean Series, True where state is 'Ohio'
frame2['eastern'] = frame2.state == 'Ohio'
frame2

In [None]:
# deleting a column, with the same syntax as for a dict key
del frame2['eastern']
frame2

### columns are shared data

In [None]:
# keep a reference to one column of the frame
pop = frame2['pop']
pop

In [None]:
# write through the extracted column, then look at the frame
# NOTE(review): whether this write shows up in frame2 depends on the pandas
# version: with Copy-on-Write enabled, `pop` behaves as a copy and frame2 is
# left unchanged (older versions may also emit a SettingWithCopyWarning) —
# confirm on the targeted pandas version
pop['three'] = 5
frame2

### creating from a dict of dicts

In [None]:
# two levels of dicts: state name on the outside, year on the inside
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)

In [None]:
# outer keys become the columns, inner keys the row labels
frame3 = DataFrame(pop)
frame3

### transposing

In [None]:
# like with numpy: `.T` swaps rows and columns
frame3.T

### specifying the index

In [None]:
# like with Series, one can set `index`:
# labels with no data (2003) give undefined rows,
# and data whose label is not listed (2000) is discarded
DataFrame(pop, index=[2001, 2002, 2003])

### `values` returns an ndarray

In [None]:
# the frame's data as a 2-dimensional numpy array (no row or column labels)
frame3.values

### numpy type

In [None]:
# here we have a homogeneous array:
# every column of frame3 holds floats, so a single numeric dtype fits
frame3.values.dtype

In [None]:
# when several types are mixed (numbers and strings in frame2):
# numpy falls back to its wildcard `object` type
frame2.values

In [None]:
# dtype of the array built from the mixed-type frame
frame2.values.dtype

# `Index` objects

In [None]:
# keep a handle on the Series' index, to inspect it on its own
ser = Series(data=range(3), index=list('abc'))
index = ser.index
index

In [None]:
# slicing works on an Index like on a sequence
index[1:]

In [None]:
# cannot write through the index: item assignment is expected to raise
# a TypeError, which is caught and displayed so that the notebook keeps running
try:
    index[0] = 'x'
except TypeError as e:
    print(" OOPS", e)

In [None]:
# sharing of indexes: first build a standalone Index object
# (note that this rebinds the name `index`)
index = pd.Index(np.arange(3))

In [None]:
# hand the prebuilt Index over when creating the Series
ser11 = Series([1.5, -2.5, 0], index=index)

In [None]:
# is the Series' index the very object that was passed in?
# NOTE(review): `is` tests object identity; whether pandas keeps the same
# Index object or wraps it in a new one is an implementation detail —
# confirm the result on the targeted pandas version
ser11.index is index

### index like a fixed size set

In [None]:
# reminder of what frame3 looks like
frame3

In [None]:
# membership test among the column labels
'Ohio' in frame3.columns

In [None]:
# membership test among the row labels
2000 in frame3.index

# essential functionality

## reindexing

In [None]:
# a small Series whose labels are not in alphabetical order
ser21 = Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))
ser21

In [None]:
# create a new Series, laid out along the given labels;
# 'e' has no counterpart in ser21 and is marked as NaN
ser22 = ser21.reindex(['a', 'b', 'c', 'd', 'e'])
ser22

In [None]:
# `fill_value` is used instead of NaN for labels that have no data
ser21.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

In [None]:
# display the original Series again, after the reindex calls above
ser21

In [None]:
# compare against ser22 (the reindexed Series, not the unrelated ser2):
# is its data array the very same object as ser21's?
ser21.values is ser22.values