# [Prof. Dalvan Griebler](mailto:dalvan.griebler@pucrs.br)

## Programação Orientada a Dados (POD) - Turma 10 (POD_98H04-06)

**Atualizado**: 29/10/2021

**Descrição**: Material de apoio às aulas sobre Python para POD

**Copyright &copy;**: Este documento está sob a licença da Creative Commons [BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode)

# Introdução à Biblioteca Pandas do Python

## Programação Funcional vs Manipulação de Dados com Pandas

In [1]:
# imperative style
def fat1(n):
    """Return the factorial of ``n`` computed with an explicit loop.

    An accumulator is multiplied by ``n`` while ``n`` is greater than 1, with
    ``n`` decremented on every pass; any ``n`` not greater than 1 yields 1.
    """
    produto = 1
    while n > 1:
        produto, n = produto * n, n - 1
    return produto
# functional style
def fat2(n):
    """Return the factorial of ``n`` recursively; any ``n`` below 2 yields 1."""
    return 1 if n < 2 else n * fat2(n - 1)

# modo re-uso de funções
from operator import mul
from functools import reduce
def fat3(n):
    """Return the factorial of ``n`` by folding ``mul`` over ``1..n``.

    The initial value 1 keeps the fold well defined when the range is empty
    (``n < 1``): the result is then 1, the same answer ``fat1`` and ``fat2``
    give. Without it ``reduce`` raises ``TypeError`` on an empty sequence.
    """
    return reduce(mul, range(1, n + 1), 1)

# Run the three implementations on the same input and print one result per line.
for rotulo, funcao in (("fat1(6)", fat1), ("fat2(6)", fat2), ("fat3(6)", fat3)):
    print(rotulo, funcao(6))

fat1(6) 720
fat2(6) 720
fat3(6) 720


## Estrutura de Dados

In [2]:
# !conda install pandas
# !pip install numpy
import pandas as pd

### Series

In [3]:
# Build a Series from a plain list. No index is passed, so the rows get the
# default RangeIndex 0..3 shown in the output.
obj = pd.Series([4,-1, 2, 5])
print(obj)
print(type(obj))

# .values exposes the stored data without the labels
print(obj.values)

# .index exposes the row labels
print(obj.index)

0    4
1   -1
2    2
3    5
dtype: int64
<class 'pandas.core.series.Series'>
[ 4 -1  2  5]
RangeIndex(start=0, stop=4, step=1)


In [4]:
# Four values labelled explicitly through index=; the labels are strings, so
# the index is a plain Index of dtype object instead of a RangeIndex.
obj = pd.Series([4,-1, 2, 5], index=['d','a','c', 'b'])

print(obj)
print(obj.index)

d    4
a   -1
c    2
b    5
dtype: int64
Index(['d', 'a', 'c', 'b'], dtype='object')


In [5]:
# Read a single element by its label
print(obj['a'])
# Overwrite the element stored under label 'a' (this mutates obj in place)
obj['a']=6
# A list of labels selects several elements and returns a Series
print(obj[['a','c']])

-1
a    6
c    2
dtype: int64


In [6]:
print(obj)
# Boolean mask: keeps only the elements greater than 4, labels included
print(obj[obj>4])

d    4
a    6
c    2
b    5
dtype: int64
a    6
b    5
dtype: int64


In [7]:
# Arithmetic with a scalar is applied element by element; the labels are kept
print(obj*2)

d     8
a    12
c     4
b    10
dtype: int64


In [8]:
print(obj)
# `in` on a Series tests membership among the index labels
print('a' in obj)
print('e' in obj)
# To test membership among the stored values, go through .values
print(2 in obj.values)

d    4
a    6
c    2
b    5
dtype: int64
True
False
True


In [9]:
sdata = {'RS': 30405, "SC": 87463, "SP": 232434, "RJ": 382746}

# Building a Series from a dict: keys become the index, values become the data
obj3 = pd.Series(sdata)
print(obj3)

estados = ['RS', "SP", "RJ", "SC", "PR"]

# An explicit index picks and orders the entries by key. 'PR' is not a key of
# sdata, so it comes out as NaN and the output dtype is float64.
obj4 = pd.Series(sdata, index=estados)
print(obj4)

RS     30405
SC     87463
SP    232434
RJ    382746
dtype: int64
RS     30405.0
SP    232434.0
RJ    382746.0
SC     87463.0
PR         NaN
dtype: float64


In [10]:
# Element-wise missing-value test: True only for the NaN entry ('PR')
print(pd.isnull(obj4))
# Logical complement of isnull
print(pd.notnull(obj4))
# Method form; prints the same result as pd.isnull(obj4)
print(obj4.isnull())

RS    False
SP    False
RJ    False
SC    False
PR     True
dtype: bool
RS     True
SP     True
RJ     True
SC     True
PR    False
dtype: bool
RS    False
SP    False
RJ    False
SC    False
PR     True
dtype: bool


In [11]:
# Addition matches elements by label, not by position. 'PR' exists only in
# obj4 (as NaN), so its sum is NaN; the result comes out ordered by label.
print(obj3+obj4)

PR         NaN
RJ    765492.0
RS     60810.0
SC    174926.0
SP    464868.0
dtype: float64


In [12]:
# Both the Series and its index carry an optional name, shown when printing
obj4.name = 'popul'
obj4.index.name = 'estados'
print(obj4)

estados
RS     30405.0
SP    232434.0
RJ    382746.0
SC     87463.0
PR         NaN
Name: popul, dtype: float64


In [13]:
obj5 = pd.Series([2,5,90,45])
print(obj5)
print(obj5.index)
# Assigning to .index relabels the rows in place
obj5.index = ['Dalvan', "Carlos", "Vitor", "Luciano"]
print(obj5)
print(obj5.index)
# Relabelling does not move data: the values stay in their positions, so after
# this assignment 'Carlos' labels the first value (2) and 'Dalvan' the second (5)
obj5.index = ["Carlos", 'Dalvan', "Vitor", "Luciano"]
print(obj5)
print(obj5.index)

0     2
1     5
2    90
3    45
dtype: int64
RangeIndex(start=0, stop=4, step=1)
Dalvan      2
Carlos      5
Vitor      90
Luciano    45
dtype: int64
Index(['Dalvan', 'Carlos', 'Vitor', 'Luciano'], dtype='object')
Carlos      2
Dalvan      5
Vitor      90
Luciano    45
dtype: int64
Index(['Carlos', 'Dalvan', 'Vitor', 'Luciano'], dtype='object')


### DataFrame

In [14]:
# Column-oriented input: each dict key becomes a column and each list supplies
# that column's values, one per row.
dicionario = {'estados': ['RS', 'RS', "RS", "SP", 'SP', "SP"],
              'ano': [2000, 2001, 2002, 2000, 2001, 2002],
              'popul': [1.5, 1.7, 3.6, 2.4, 2.6, 3.2]
             }

# No index is passed, so the rows are labelled 0..5 in the rendered table
df1 = pd.DataFrame(dicionario)
print(type(df1))
display(df1)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,estados,ano,popul
0,RS,2000,1.5
1,RS,2001,1.7
2,RS,2002,3.6
3,SP,2000,2.4
4,SP,2001,2.6
5,SP,2002,3.2


In [15]:
# head(3) keeps only the first three rows
display(df1.head(3))

Unnamed: 0,estados,ano,popul
0,RS,2000,1.5
1,RS,2001,1.7
2,RS,2002,3.6


In [16]:
# columns= fixes the column order of the resulting frame
df1 = pd.DataFrame(dicionario, columns=['ano', 'estados', 'popul'])
display(df1)

Unnamed: 0,ano,estados,popul
0,2000,RS,1.5
1,2001,RS,1.7
2,2002,RS,3.6
3,2000,SP,2.4
4,2001,SP,2.6
5,2002,SP,3.2


In [17]:
# index= supplies one label per row, replacing the default 0..5 numbering
df1 = pd.DataFrame(dicionario, columns=['ano', 'estados', 'popul'],
                   index=['um','dois','tres', 'quatro', 'cinco', 'seis']
                  )
display(df1)
# The column labels are themselves stored in an Index object
display(df1.columns)

Unnamed: 0,ano,estados,popul
um,2000,RS,1.5
dois,2001,RS,1.7
tres,2002,RS,3.6
quatro,2000,SP,2.4
cinco,2001,SP,2.6
seis,2002,SP,3.2


Index(['ano', 'estados', 'popul'], dtype='object')

In [18]:
# Selecting one column with [] returns a Series that shares the frame's row labels
print(df1['estados'])
print(type(df1['estados']))

um        RS
dois      RS
tres      RS
quatro    SP
cinco     SP
seis      SP
Name: estados, dtype: object
<class 'pandas.core.series.Series'>


In [19]:
# Attribute access is an alternative spelling for selecting the 'ano' column
print(df1.ano)

um        2000
dois      2001
tres      2002
quatro    2000
cinco     2001
seis      2002
Name: ano, dtype: int64


In [20]:
# .loc selects a row by its label; the row comes back as a Series whose labels
# are the column names (dtype object in the output: the row mixes numbers and text)
print(df1.loc['um'])
print(type(df1.loc['um']))

ano        2000
estados      RS
popul       1.5
Name: um, dtype: object
<class 'pandas.core.series.Series'>


In [21]:
# .iloc selects by integer position; position 0 is the row labelled 'um'
print(df1.iloc[0])
print(type(df1.iloc[0]))

ano        2000
estados      RS
popul       1.5
Name: um, dtype: object
<class 'pandas.core.series.Series'>


In [22]:
# NOTE(review): numpy is imported here, mid-notebook; every later cell that
# uses `np` depends on this cell having been executed first.
import numpy as np
# Assigning to a new column name creates the column; np.arange(6.) supplies the
# floats 0.0 .. 5.0, one per row
df1['mortes']=np.arange(6.)
display(df1)

Unnamed: 0,ano,estados,popul,mortes
um,2000,RS,1.5,0.0
dois,2001,RS,1.7,1.0
tres,2002,RS,3.6,2.0
quatro,2000,SP,2.4,3.0
cinco,2001,SP,2.6,4.0
seis,2002,SP,3.2,5.0


In [23]:
# A Series that covers only three of the six row labels
val = pd.Series([20, 567, 300], index=['dois', 'quatro', 'cinco'])

# Assigning a Series to a column aligns it on the row labels: rows with no
# matching label are left empty (NaN), and the previous column content is replaced
df1['mortes'] = val
display(df1)

Unnamed: 0,ano,estados,popul,mortes
um,2000,RS,1.5,
dois,2001,RS,1.7,20.0
tres,2002,RS,3.6,
quatro,2000,SP,2.4,567.0
cinco,2001,SP,2.6,300.0
seis,2002,SP,3.2,


In [24]:
# New boolean column computed from an element-wise comparison on 'estados'
df1['inverno'] = df1.estados == "RS"
display(df1)

Unnamed: 0,ano,estados,popul,mortes,inverno
um,2000,RS,1.5,,True
dois,2001,RS,1.7,20.0,True
tres,2002,RS,3.6,,True
quatro,2000,SP,2.4,567.0,False
cinco,2001,SP,2.6,300.0,False
seis,2002,SP,3.2,,False


In [25]:
# `del` removes the column from df1 in place.
# NOTE(review): not re-runnable on its own — a second execution fails because
# the 'inverno' column no longer exists.
del df1['inverno']
df1

Unnamed: 0,ano,estados,popul,mortes
um,2000,RS,1.5,
dois,2001,RS,1.7,20.0
tres,2002,RS,3.6,
quatro,2000,SP,2.4,567.0
cinco,2001,SP,2.6,300.0
seis,2002,SP,3.2,


### Objetos Index

In [38]:
obj=pd.Series(range(3), index=['a','b','c'])
print(obj)
print(obj.index)
# Slicing an Index yields another Index
print(obj.index[:2])

# An Index can also be built on its own and handed to a constructor
labels=pd.Index(np.arange(3))
print(labels)

# NOTE(review): random values with no seed set in this cell — the numbers
# printed below will differ on every run.
obj1=pd.Series(np.random.randn(3), index=labels)

print(obj1)
# Identity check, displayed as the cell result; the recorded run shows True,
# i.e. the Series kept the very same Index object it was given
obj1.index is labels

a    0
b    1
c    2
dtype: int64
Index(['a', 'b', 'c'], dtype='object')
Index(['a', 'b'], dtype='object')
Int64Index([0, 1, 2], dtype='int64')
0    0.173682
1    0.566855
2    1.778700
dtype: float64


True

In [42]:
display(df1)
# Both axes of a DataFrame are labelled by Index objects: columns and rows
print(df1.columns)
print(df1.index)

Unnamed: 0,ano,estados,popul,mortes
um,2000,RS,1.5,
dois,2001,RS,1.7,20.0
tres,2002,RS,3.6,
quatro,2000,SP,2.4,567.0
cinco,2001,SP,2.6,300.0
seis,2002,SP,3.2,


Index(['ano', 'estados', 'popul', 'mortes'], dtype='object')
Index(['um', 'dois', 'tres', 'quatro', 'cinco', 'seis'], dtype='object')


## Funcionalidades

### Reindexação

In [45]:
# Series whose labels are deliberately out of alphabetical order
rotulos = ['d', 'c', 'a', 'b', 'e']
obj = pd.Series(range(5, 10), index=rotulos)
print(obj)

# reindex returns a new Series arranged in the requested label order; every
# value stays attached to its own label
ordem_alfabetica = sorted(obj.index)
obj1 = obj.reindex(ordem_alfabetica)
print(obj1)


d    5
c    6
a    7
b    8
e    9
dtype: int64
a    7
b    8
c    6
d    5
e    9
dtype: int64


In [48]:
# Labels 0, 2 and 4 only: positions 1, 3 and 5 are gaps
obj3=pd.Series(['Amarelo', 'Azul', 'Verde'], index=[0,2,4])
print(obj3)
# method='ffill' fills each new label with the value of the preceding existing label
print(obj3.reindex(range(6), method='ffill'))

0    Amarelo
2       Azul
4      Verde
dtype: object
0    Amarelo
1    Amarelo
2       Azul
3       Azul
4      Verde
5      Verde
dtype: object


In [55]:
# 3x3 frame holding 0..8; note there is no row labelled 'b'
df2 = pd.DataFrame(np.arange(9).reshape((3,3)),
                  index=['a','c','d'],
                  columns=['RS','RJ', 'SP'])

print(df2)
# Reindexing the rows: the new label 'b' becomes a row of NaN (values turn into floats)
display(df2.reindex(['a','b','c','d']))
# With ffill the new row 'b' copies the preceding row 'a' and the integers are kept
display(df2.reindex(['a','b','c','d'], method='ffill'))


# columns= reindexes along the column axis instead; here it only reorders
display(df2.reindex(columns=['SP','RJ','RS']))

   RS  RJ  SP
a   0   1   2
c   3   4   5
d   6   7   8


Unnamed: 0,RS,RJ,SP
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


Unnamed: 0,RS,RJ,SP
a,0,1,2
b,0,1,2
c,3,4,5
d,6,7,8


Unnamed: 0,SP,RJ,RS
a,2,1,0
c,5,4,3
d,8,7,6


### Descartando entradas de um eixo

In [65]:
# Start from a frame that contains an all-NaN row 'b'
df3=df2.reindex(['a','b','c','d'])

display(df3)
# drop returns a new frame without the given row label; df3 itself is unchanged
# (the later calls below still see row 'b')
new_df3=df3.drop('b')
display(new_df3)

# axis='columns' drops a column instead of a row
new_df3=df3.drop('RS', axis='columns')
display(new_df3)

# A list of labels drops several rows at once
new_df3=df3.drop(['a','b'])
display(new_df3)


Unnamed: 0,RS,RJ,SP
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


Unnamed: 0,RS,RJ,SP
a,0.0,1.0,2.0
c,3.0,4.0,5.0
d,6.0,7.0,8.0


Unnamed: 0,RJ,SP
a,1.0,2.0
b,,
c,4.0,5.0
d,7.0,8.0


Unnamed: 0,RS,RJ,SP
c,3.0,4.0,5.0
d,6.0,7.0,8.0


### Indexação, Seleção e Fatiamento

In [72]:
obj = pd.Series(np.arange(4), index=['a','b','c','d'])
print(obj)

# Label-based access to a single element
print(obj['b'])
# Position-based access goes through .iloc: using obj[1] on a Series whose
# index is not integer-typed is deprecated in recent pandas releases
print(obj.iloc[1])

# An integer slice inside [] is positional and excludes the end position
print(obj[2:4])

# A list of labels selects (and orders) several elements
print(obj[['b','a']])

# A list of positions, again through .iloc
print(obj.iloc[[1,0]])

# Boolean mask
print(obj[obj<3])

# A label slice includes BOTH endpoints ('b' through 'd')
print(obj['b':'d'])

# Assigning through a label slice overwrites every selected element in place
obj['b':'d'] = 20
print(obj)

a    0
b    1
c    2
d    3
dtype: int64
1
1
c    2
d    3
dtype: int64
b    1
a    0
dtype: int64
b    1
a    0
dtype: int64
a    0
b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64
a     0
b    20
c    20
d    20
dtype: int64


In [86]:
display(df1)

# A single column label returns a Series
print(df1['ano'])

# A list of column labels returns a DataFrame with just those columns
display(df1[['ano','mortes']])

# A slice inside [] selects ROWS (here the first two), not columns
display(df1[:2])

# A boolean Series inside [] filters the rows where the condition holds
display(df1[df1['popul'] < 3])

# The mask itself: one boolean per row
display(df1.popul < 3)

Unnamed: 0,ano,estados,popul,mortes
um,2000,RS,1.5,
dois,2001,RS,1.7,20.0
tres,2002,RS,3.6,
quatro,2000,SP,2.4,567.0
cinco,2001,SP,2.6,300.0
seis,2002,SP,3.2,


um        2000
dois      2001
tres      2002
quatro    2000
cinco     2001
seis      2002
Name: ano, dtype: int64


Unnamed: 0,ano,mortes
um,2000,
dois,2001,20.0
tres,2002,
quatro,2000,567.0
cinco,2001,300.0
seis,2002,


Unnamed: 0,ano,estados,popul,mortes
um,2000,RS,1.5,
dois,2001,RS,1.7,20.0


Unnamed: 0,ano,estados,popul,mortes
um,2000,RS,1.5,
dois,2001,RS,1.7,20.0
quatro,2000,SP,2.4,567.0
cinco,2001,SP,2.6,300.0


um         True
dois       True
tres      False
quatro     True
cinco      True
seis      False
Name: popul, dtype: bool

### Índices Inteiros

### Iteração

In [92]:
display(df1)

# iterrows yields one (row label, row) pair per row; each row is a Series
# labelled by the column names
for idx, linha in df1.iterrows():
    print(idx)
    print(type(linha))    
    print(linha)

Unnamed: 0,ano,estados,popul,mortes
um,2000,RS,1.5,
dois,2001,RS,1.7,20.0
tres,2002,RS,3.6,
quatro,2000,SP,2.4,567.0
cinco,2001,SP,2.6,300.0
seis,2002,SP,3.2,


um
<class 'pandas.core.series.Series'>
ano        2000
estados      RS
popul       1.5
mortes      NaN
Name: um, dtype: object
dois
<class 'pandas.core.series.Series'>
ano        2001
estados      RS
popul       1.7
mortes     20.0
Name: dois, dtype: object
tres
<class 'pandas.core.series.Series'>
ano        2002
estados      RS
popul       3.6
mortes      NaN
Name: tres, dtype: object
quatro
<class 'pandas.core.series.Series'>
ano         2000
estados       SP
popul        2.4
mortes     567.0
Name: quatro, dtype: object
cinco
<class 'pandas.core.series.Series'>
ano         2001
estados       SP
popul        2.6
mortes     300.0
Name: cinco, dtype: object
seis
<class 'pandas.core.series.Series'>
ano        2002
estados      SP
popul       3.2
mortes      NaN
Name: seis, dtype: object


In [93]:
# items yields one (column label, column) pair per column; each column is a
# Series labelled by the row index and keeps its own dtype
for col, ser in df1.items():
    print(col)
    print(type(ser))    
    print(ser)

ano
<class 'pandas.core.series.Series'>
um        2000
dois      2001
tres      2002
quatro    2000
cinco     2001
seis      2002
Name: ano, dtype: int64
estados
<class 'pandas.core.series.Series'>
um        RS
dois      RS
tres      RS
quatro    SP
cinco     SP
seis      SP
Name: estados, dtype: object
popul
<class 'pandas.core.series.Series'>
um        1.5
dois      1.7
tres      3.6
quatro    2.4
cinco     2.6
seis      3.2
Name: popul, dtype: float64
mortes
<class 'pandas.core.series.Series'>
um          NaN
dois       20.0
tres        NaN
quatro    567.0
cinco     300.0
seis        NaN
Name: mortes, dtype: float64


### Aritmética e alinhamento de dados

In [95]:
# Two Series with partially overlapping labels.
# NOTE(review): random values with no seed set in this cell — the numbers
# differ on every run.
s1 = pd.Series(np.random.randn(4), index=list('abcd'))
s2 = pd.Series(np.random.randn(5), index=list('aefdb'))

print(s1)
print(s2)
# Addition aligns on labels: the result covers every label from both operands,
# and labels present in only one of them (c, e, f) come out as NaN
print(s1+s2)

a   -1.148658
b   -1.280267
c   -1.483432
d   -0.770908
dtype: float64
a   -1.572050
e   -0.128872
f    0.826799
d    0.106972
b    0.755843
dtype: float64
a   -2.720708
b   -0.524425
c         NaN
d   -0.663936
e         NaN
f         NaN
dtype: float64


In [98]:
# NOTE(review): df1 and df2 are rebound here, replacing the frames that earlier
# cells built under the same names.
df1 = pd.DataFrame(np.arange(9).reshape((3,3)), columns=list('bcd'),
                  index=['RS', "SP", "RJ"])
df2 = pd.DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'),
                  index=['PR', 'RS', "SP", "RJ"])

display(df1)
display(df2)

# Addition aligns on row AND column labels; any cell missing from either
# operand is NaN, so only columns b and d of rows RJ/RS/SP hold numbers.
# NOTE(review): the saved output shows a fourth table that matches the
# fill_value=0 result of the following cell — it looks like a leftover from an
# earlier run, since this cell has only three display calls.
display(df1+df2)



Unnamed: 0,b,c,d
RS,0,1,2
SP,3,4,5
RJ,6,7,8


Unnamed: 0,b,d,e
PR,0,1,2
RS,3,4,5
SP,6,7,8
RJ,9,10,11


Unnamed: 0,b,c,d,e
PR,,,,
RJ,15.0,,18.0,
RS,3.0,,6.0,
SP,9.0,,12.0,


Unnamed: 0,b,c,d,e
PR,0.0,,1.0,2.0
RJ,15.0,7.0,18.0,11.0
RS,3.0,1.0,6.0,5.0
SP,9.0,4.0,12.0,8.0


### Métodos aritméticos com valores para preenchimento

In [99]:
# Method form of +: with fill_value=0 a cell missing from ONE operand counts
# as 0; a cell missing from both (row PR, column c) is still NaN
display(df1.add(df2, fill_value=0))

Unnamed: 0,b,c,d,e
PR,0.0,,1.0,2.0
RJ,15.0,7.0,18.0,11.0
RS,3.0,1.0,6.0,5.0
SP,9.0,4.0,12.0,8.0


### Operações entre DataFrame e Series

In [106]:
df10 = df1.add(df2, fill_value=0)
display(df10)

# Third row by position, which is the row labelled 'RS'; as a Series it is
# labelled by the column names
s10 = df10.iloc[2]
print(s10)

# DataFrame + Series: the Series labels are matched against the column labels
# and the Series is added to every row
print(df10+s10)

Unnamed: 0,b,c,d,e
PR,0.0,,1.0,2.0
RJ,15.0,7.0,18.0,11.0
RS,3.0,1.0,6.0,5.0
SP,9.0,4.0,12.0,8.0


b    3.0
c    1.0
d    6.0
e    5.0
Name: RS, dtype: float64
       b    c     d     e
PR   3.0  NaN   7.0   7.0
RJ  18.0  8.0  24.0  16.0
RS   6.0  2.0  12.0  10.0
SP  12.0  5.0  18.0  13.0


### Aplicação de funções e mapeamento

In [109]:
# NOTE(review): random values with no seed set in this cell — the numbers
# differ on every run.
dfN = pd.DataFrame(np.random.randn(4,3), columns=list('bde'),
                  index=['RS', 'SP', "RJ", "SC"])

display(dfN)

# NumPy element-wise functions work directly on a DataFrame and keep its labels;
# as the last expression, the result is rendered as the cell output
np.abs(dfN)

Unnamed: 0,b,d,e
RS,-1.490712,0.020717,0.583004
SP,0.107376,-0.076047,0.479054
RJ,-0.948971,0.419479,0.018982
SC,-0.59316,0.631921,0.225851


Unnamed: 0,b,d,e
RS,1.490712,0.020717,0.583004
SP,0.107376,0.076047,0.479054
RJ,0.948971,0.419479,0.018982
SC,0.59316,0.631921,0.225851


In [111]:
def f(x):
    """Return the spread (maximum minus minimum) of the values in ``x``."""
    return x.max() - x.min()

# By default apply calls f once per column: the result is labelled b, d, e
dfN.apply(f)

b    1.598088
d    0.707969
e    0.564022
dtype: float64

In [112]:
def f(x):
    """Return the spread (maximum minus minimum) of the values in ``x``."""
    return x.max() - x.min()

# With axis='columns' f is called once per row: the result is labelled by the
# row index (RS, SP, RJ, SC)
dfN.apply(f, axis='columns')

RS    2.073716
SP    0.555101
RJ    1.368450
SC    1.225082
dtype: float64

### Ordenação e classificação

### Índices de eixos com rótulos duplicados

## Sumarização dos Dados e Estatísticas Descritivas

## Carregamento e Armazenamento de Dados

## 