# [Prof. Dalvan Griebler](mailto:dalvan.griebler@pucrs.br)

## Programação Orientada a Dados (POD) - Turma 10 (POD_98H04-06)

**Atualizado**: 29/10/2021

**Descrição**: Material de apoio às aulas sobre Python para POD

**Copyright &copy;**: Este documento está sob a licença da Creative Commons [BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode)

# Introdução à Biblioteca Pandas do Python

## Programação Funcional vs Manipulação de Dados com Pandas

In [1]:
# imperative style
def fat1(n):
    """Return n! computed with an explicit loop; any n <= 1 yields 1."""
    product = 1
    while n > 1:
        # Multiply by the current n, then step n down by one.
        product, n = product * n, n - 1
    return product
# functional (recursive) style
def fat2(n):
    """Return n! by recursion; the base case covers every n < 2."""
    return 1 if n < 2 else n * fat2(n - 1)

# function-reuse style
from operator import mul
from functools import reduce
def fat3(n):
    """Return n! by folding multiplication over the integers 1..n.

    The explicit initial value 1 makes the empty product well defined, so
    fat3(0) returns 1 (as fat1 and fat2 do) instead of raising TypeError
    from reduce() on an empty range.
    """
    return reduce(mul, range(1, n + 1), 1)

# Print 6! from each of the three implementations, one per line.
for caption, fn in (("fat1(6)", fat1), ("fat2(6)", fat2), ("fat3(6)", fat3)):
    print(caption, fn(6))

fat1(6) 720
fat2(6) 720
fat3(6) 720


## Estrutura de Dados

In [2]:
import pandas as pd

### Series

In [3]:
# A Series built from a plain list; no index is given here.
obj = pd.Series([4, 7, -5, 3])

# Show the Series, its type, the underlying values and the index, one per line.
print(obj, type(obj), obj.values, obj.index, sep='\n')

0    4
1    7
2   -5
3    3
dtype: int64
<class 'pandas.core.series.Series'>
[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)


In [4]:
# Same values, now labelled with an explicit string index.
obj = pd.Series([4, 7, -5, 3], index=list('dcab'))
print(obj, obj.index, sep='\n')

d    4
c    7
a   -5
b    3
dtype: int64
Index(['d', 'c', 'a', 'b'], dtype='object')


In [5]:
# Read, overwrite and re-read the entry labelled 'a'.
print(obj.loc['a'])
obj.loc['a'] = 10
print(obj.loc['a'])

# A list of labels selects several entries in the requested order.
print(obj.loc[['a', 'c', 'b']])

-5
10
a    10
c     7
b     3
dtype: int64


In [6]:
# Boolean mask: keep only the entries greater than 4.
print(obj.loc[obj > 4])

# Arithmetic is applied to every element.
print(obj.mul(4))

c     7
a    10
dtype: int64
d    16
c    28
a    40
b    12
dtype: int64


In [7]:
# `in` tests membership against the index labels.
print('b' in obj, 'e' in obj, sep='\n')

True
False


In [8]:
# Building a Series from a dict: the keys become the index labels.
habitantes = dict(RS=35000, SP=40000, RJ=70000, PR=40060)
obj2 = pd.Series(habitantes)
print(obj2, obj2['RS'], sep='\n')

# An explicit index picks and orders the entries; 'SC' is not in the dict,
# so its value is NaN.
estados = ['RS', 'RJ', 'PR', 'SP', 'SC']
obj3 = pd.Series(habitantes, index=estados)
print(obj3)

RS    35000
SP    40000
RJ    70000
PR    40060
dtype: int64
35000
RS    35000.0
RJ    70000.0
PR    40060.0
SP    40000.0
SC        NaN
dtype: float64


In [9]:
# Missing-value masks: the module-level functions and the Series method.
print(pd.isnull(obj3), pd.notnull(obj3), obj3.isnull(), sep='\n')

RS    False
RJ    False
PR    False
SP    False
SC     True
dtype: bool
RS     True
RJ     True
PR     True
SP     True
SC    False
dtype: bool
RS    False
RJ    False
PR    False
SP    False
SC     True
dtype: bool


In [10]:
# Show both operands, then their sum; addition matches entries by label.
print(obj3, obj2, obj2.add(obj3), sep='\n')

RS    35000.0
RJ    70000.0
PR    40060.0
SP    40000.0
SC        NaN
dtype: float64
RS    35000
SP    40000
RJ    70000
PR    40060
dtype: int64
PR     80120.0
RJ    140000.0
RS     70000.0
SC         NaN
SP     80000.0
dtype: float64


In [11]:
# Give the Series and its index a name, then print it.
obj3.name = 'popul'
obj3.index.name = 'estado'
print(obj3)
# Careful: assigning a new index only relabels the positions in place;
# the values are not re-associated with their original labels.
obj3.index = ['RJ', 'RS', 'PR', 'SP', 'SC']
print(obj3)

estado
RS    35000.0
RJ    70000.0
PR    40060.0
SP    40000.0
SC        NaN
Name: popul, dtype: float64
RJ    35000.0
RS    70000.0
PR    40060.0
SP    40000.0
SC        NaN
Name: popul, dtype: float64


### DataFrame

In [12]:
# Column-oriented input: each key is a column name, each list holds that column's values.
data = dict(
    estado=['RS', 'RJ', 'RS', 'RJ', 'RJ', 'RJ'],
    ano=[2000, 2000, 2001, 2001, 2002, 2003],
    popul=[1.5, 1.7, 3.6, 4.4, 5.5, 6.7],
)

df = pd.DataFrame(data)

print(df)

  estado   ano  popul
0     RS  2000    1.5
1     RJ  2000    1.7
2     RS  2001    3.6
3     RJ  2001    4.4
4     RJ  2002    5.5
5     RJ  2003    6.7


In [13]:
# head() returns the first rows (5 when no count is given).
print(df.head(n=5))

  estado   ano  popul
0     RS  2000    1.5
1     RJ  2000    1.7
2     RS  2001    3.6
3     RJ  2001    4.4
4     RJ  2002    5.5


In [14]:
# `columns` fixes the order of the columns in the resulting DataFrame.
column_order = ['ano', 'estado', 'popul']
df = pd.DataFrame(data, columns=column_order)
display(df)

Unnamed: 0,ano,estado,popul
0,2000,RS,1.5
1,2000,RJ,1.7
2,2001,RS,3.6
3,2001,RJ,4.4
4,2002,RJ,5.5
5,2003,RJ,6.7


In [15]:
# 'mortes' is requested as a column but is not a key of `data`;
# the rows are labelled with number words instead of 0..5.
wanted_columns = ['ano', 'estado', 'popul', 'mortes']
number_words = ['um', 'dois', 'tres', 'quatro', 'cinco', 'seis']
df = pd.DataFrame(data, columns=wanted_columns, index=number_words)
display(df)

Unnamed: 0,ano,estado,popul,mortes
um,2000,RS,1.5,
dois,2000,RJ,1.7,
tres,2001,RS,3.6,
quatro,2001,RJ,4.4,
cinco,2002,RJ,5.5,
seis,2003,RJ,6.7,


In [16]:
# Selecting one column gives a Series; bracket and attribute access return the same data.
ser = df['popul']
print(type(ser), df['popul'], df.popul, sep='\n')

<class 'pandas.core.series.Series'>
um        1.5
dois      1.7
tres      3.6
quatro    4.4
cinco     5.5
seis      6.7
Name: popul, dtype: float64
um        1.5
dois      1.7
tres      3.6
quatro    4.4
cinco     5.5
seis      6.7
Name: popul, dtype: float64


In [17]:
display(df)
# The same row fetched by label (.loc) and by integer position (.iloc).
print(df.loc['quatro'], df.iloc[3], sep='\n')

Unnamed: 0,ano,estado,popul,mortes
um,2000,RS,1.5,
dois,2000,RJ,1.7,
tres,2001,RS,3.6,
quatro,2001,RJ,4.4,
cinco,2002,RJ,5.5,
seis,2003,RJ,6.7,


ano       2001
estado      RJ
popul      4.4
mortes     NaN
Name: quatro, dtype: object
ano       2001
estado      RJ
popul      4.4
mortes     NaN
Name: quatro, dtype: object


In [18]:
# Assigning a scalar to a column broadcasts it to every row.
df['mortes'] = 0.2
display(df)

Unnamed: 0,ano,estado,popul,mortes
um,2000,RS,1.5,0.2
dois,2000,RJ,1.7,0.2
tres,2001,RS,3.6,0.2
quatro,2001,RJ,4.4,0.2
cinco,2002,RJ,5.5,0.2
seis,2003,RJ,6.7,0.2


In [19]:
# Assigning a Series to a column aligns on the index labels;
# rows whose label is missing from `val` end up empty (NaN).
val = pd.Series([0.5, 0.4, 0.6], index=['um', 'tres', 'cinco'])
df['mortes'] = val
display(df)

Unnamed: 0,ano,estado,popul,mortes
um,2000,RS,1.5,0.5
dois,2000,RJ,1.7,
tres,2001,RS,3.6,0.4
quatro,2001,RJ,4.4,
cinco,2002,RJ,5.5,0.6
seis,2003,RJ,6.7,


In [20]:
# Work on a copy: a plain `df1 = df` only binds a second name to the same
# DataFrame, so adding a column through df1 would silently modify df as well.
df1 = df.copy()
# Boolean column: True where the state is 'RS'.
df1['inverno'] = df1.estado == 'RS'
display(df1)

Unnamed: 0,ano,estado,popul,mortes,inverno
um,2000,RS,1.5,0.5,True
dois,2000,RJ,1.7,,False
tres,2001,RS,3.6,0.4,True
quatro,2001,RJ,4.4,,False
cinco,2002,RJ,5.5,0.6,False
seis,2003,RJ,6.7,,False


In [21]:
# `del` removes the column from the DataFrame in place.
del df1['inverno']
display(df1)

Unnamed: 0,ano,estado,popul,mortes
um,2000,RS,1.5,0.5
dois,2000,RJ,1.7,
tres,2001,RS,3.6,0.4
quatro,2001,RJ,4.4,
cinco,2002,RJ,5.5,0.6
seis,2003,RJ,6.7,


In [22]:
# Nested dict: outer keys become columns, inner keys become row labels.
pop = {
    'RS': {2001: 1.5, 2002: 2.9},
    'RJ': {2000: 2.5, 2001: 3.5, 2002: 5.7},
}

print(pop)
# Build the frame and name its row index 'ano' in one expression.
df2 = pd.DataFrame(pop).rename_axis('ano')
display(df2)

{'RS': {2001: 1.5, 2002: 2.9}, 'RJ': {2000: 2.5, 2001: 3.5, 2002: 5.7}}


Unnamed: 0_level_0,RS,RJ
ano,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,1.5,3.5
2002,2.9,5.7
2000,,2.5


In [23]:
# Swap rows and columns.
df2.transpose()

ano,2001,2002,2000
RS,1.5,2.9,
RJ,3.5,5.7,2.5


In [24]:
# The frame's data as a plain NumPy array.
df2.to_numpy()

array([[1.5, 3.5],
       [2.9, 5.7],
       [nan, 2.5]])

### Objetos Index

In [25]:
# Keep a reference to the Series' Index object; it can be sliced like a sequence.
obj = pd.Series(range(3), index=list('abc'))
index = obj.index
print(obj, index, index[:2], sep='\n')

a    0
b    1
c    2
dtype: int64
Index(['a', 'b', 'c'], dtype='object')
Index(['a', 'b'], dtype='object')


In [26]:
import numpy as np
# Alternative numeric index: label = pd.Index(np.arange(3))
label = pd.Index(['Andressa', 'Rafaela', 'Florensa'])
print(label)

# Build a Series on that Index, then check whether the Series holds the
# very same Index object.
obj = pd.Series(data=range(3), index=label)
print(obj, obj.index is label, sep='\n')

Index(['Andressa', 'Rafaela', 'Florensa'], dtype='object')
Andressa    0
Rafaela     1
Florensa    2
dtype: int64
True


In [27]:
display(df2)
# Column labels, then membership tests against the columns and the row index.
print(df2.columns, 'RS' in df2.columns, 2002 in df2.index, sep='\n')

Unnamed: 0_level_0,RS,RJ
ano,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,1.5,3.5
2002,2.9,5.7
2000,,2.5


Index(['RS', 'RJ'], dtype='object')
True
True


## Funcionalidades

### Reindexação

In [28]:
obj3 = pd.Series(range(3), index=list('abc'))
print(obj3)
# reindex() returns a new Series arranged in the requested label order.
obj4 = obj3.reindex(index=list('cba'))
print(obj4)

a    0
b    1
c    2
dtype: int64
c    2
b    1
a    0
dtype: int64


In [29]:
df3 = df2
display(df3)
# Put the rows in chronological order.
df4 = df3.reindex(index=[2000, 2001, 2002])
# Index.reindex returns a tuple: (the new Index, an array of positions).
idx = df4.columns.reindex(['RJ', 'RS'])
print(idx)
# Use only the new Index (first tuple element) to reorder the columns.
reordered_columns = idx[0]
df5 = pd.DataFrame(df4, columns=reordered_columns)
display(df5)

Unnamed: 0_level_0,RS,RJ
ano,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,1.5,3.5
2002,2.9,5.7
2000,,2.5


(Index(['RJ', 'RS'], dtype='object'), array([1, 0]))


Unnamed: 0_level_0,RJ,RS
ano,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,2.5,
2001,3.5,1.5
2002,5.7,2.9


### Descartando entradas de um eixo

In [30]:
obj6 = pd.Series(range(5), index=list('abcde'))

print(obj6)
# drop() returns a new Series without the given labels
# (a single label also works: obj6.drop('c')).
new_obj6 = obj6.drop(labels=['c', 'e'])

print(new_obj6)

a    0
b    1
c    2
d    3
e    4
dtype: int64
a    0
b    1
d    3
dtype: int64


In [31]:
# 4x4 grid of the integers 0..15, rows labelled by state, columns by number words.
df_obj = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['RS', 'PR', 'RJ', 'SP'],
    columns=['um', 'dois', 'tres', 'quatro'],
)

df_obj

Unnamed: 0,um,dois,tres,quatro
RS,0,1,2,3
PR,4,5,6,7
RJ,8,9,10,11
SP,12,13,14,15


In [32]:
# Drop rows by label; a new DataFrame is returned.
df_obj.drop(index=['PR', 'RJ'])

Unnamed: 0,um,dois,tres,quatro
RS,0,1,2,3
SP,12,13,14,15


In [33]:
# Drop a column by label; a new DataFrame is returned.
df_obj.drop(columns=['dois'])

Unnamed: 0,um,tres,quatro
RS,0,2,3
PR,4,6,7
RJ,8,10,11
SP,12,14,15


In [34]:
# Drop several columns at once; a new DataFrame is returned.
df_obj.drop(columns=['dois', 'quatro'])

Unnamed: 0,um,tres
RS,0,2
PR,4,6
RJ,8,10
SP,12,14


In [35]:
# Show the frame before and after the drop. With inplace=True, drop()
# modifies df_obj itself instead of returning a new DataFrame.
display(df_obj)
df_obj.drop(['dois','quatro'], axis='columns', inplace=True)
display(df_obj)

Unnamed: 0,um,dois,tres,quatro
RS,0,1,2,3
PR,4,5,6,7
RJ,8,9,10,11
SP,12,13,14,15


Unnamed: 0,um,tres
RS,0,2
PR,4,6
RJ,8,10
SP,12,14


### Indexação, Seleção e Fatiamento

In [36]:
# Float Series with string labels. Each label-based lookup below is paired
# with its positional equivalent. Positions go through .iloc: using bare
# integers in ser[...] on a non-integer index is deprecated since pandas 2.1.
ser = pd.Series(np.arange(5.), index=['a','b', 'c','d', 'e'])

print(ser)

# Single element: by label, then by position.
print(ser['b'])
print(ser.iloc[1])


# Slice: a label slice includes its end label, a positional slice excludes its end.
print(ser['b':'d'])
print(ser.iloc[2:4])


# Several elements: list of labels, then list of positions.
print(ser[['b', 'e']])
print(ser.iloc[[1,4]])



a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
1.0
1.0
b    1.0
c    2.0
d    3.0
dtype: float64
c    2.0
d    3.0
dtype: float64
b    1.0
e    4.0
dtype: float64
b    1.0
e    4.0
dtype: float64


In [37]:
print(ser)
# Label slices include both endpoints, so 'b', 'c' and 'd' are all overwritten.
ser.loc['b':'d'] = 5
print(ser)

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    5.0
c    5.0
d    5.0
e    4.0
dtype: float64


In [38]:
# Rebuild the 4x4 frame of the integers 0..15.
df_obj = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['RS', 'PR', 'RJ', 'SP'],
    columns=['um', 'dois', 'tres', 'quatro'],
)

# The whole frame, one column (a Series), and a list of columns (a DataFrame).
display(df_obj, df_obj['dois'], df_obj[['dois', 'tres']])


Unnamed: 0,um,dois,tres,quatro
RS,0,1,2,3
PR,4,5,6,7
RJ,8,9,10,11
SP,12,13,14,15


RS     1
PR     5
RJ     9
SP    13
Name: dois, dtype: int64

Unnamed: 0,dois,tres
RS,1,2
PR,5,6
RJ,9,10
SP,13,14


In [39]:
# Keep the rows whose 'dois' value is greater than 5.
dois_above_five = df_obj['dois'] > 5
display(df_obj.loc[dois_above_five])

Unnamed: 0,um,dois,tres,quatro
RJ,8,9,10,11
SP,12,13,14,15


In [40]:
# Element-wise comparison gives a boolean frame of the same shape.
display(df_obj.lt(4))

Unnamed: 0,um,dois,tres,quatro
RS,True,True,True,True
PR,False,False,False,False
RJ,False,False,False,False
SP,False,False,False,False


In [41]:
# Boolean-mask assignment: every cell where the condition holds is overwritten with 0.
df_obj[df_obj < 4] = 0
display(df_obj)

Unnamed: 0,um,dois,tres,quatro
RS,0,0,0,0
PR,4,5,6,7
RJ,8,9,10,11
SP,12,13,14,15


In [42]:
# The frame, then the same row fetched by label and by position.
display(df_obj, df_obj.loc['RS'], df_obj.iloc[0])


Unnamed: 0,um,dois,tres,quatro
RS,0,0,0,0
PR,4,5,6,7
RJ,8,9,10,11
SP,12,13,14,15


um        0
dois      0
tres      0
quatro    0
Name: RS, dtype: int64

um        0
dois      0
tres      0
quatro    0
Name: RS, dtype: int64

In [43]:
# Two rows selected by a list of labels, then by a list of positions.
display(df_obj.loc[['RS', 'RJ']], df_obj.iloc[[0, 2]])

Unnamed: 0,um,dois,tres,quatro
RS,0,0,0,0
RJ,8,9,10,11


Unnamed: 0,um,dois,tres,quatro
RS,0,0,0,0
RJ,8,9,10,11


In [44]:
# Row slices: by position, then by label.
display(df_obj.iloc[:2], df_obj.loc[:'PR'])

# Row and column slices together: by position, then by label.
display(df_obj.iloc[:2, :3], df_obj.loc[:'PR', :'tres'])

Unnamed: 0,um,dois,tres,quatro
RS,0,0,0,0
PR,4,5,6,7


Unnamed: 0,um,dois,tres,quatro
RS,0,0,0,0
PR,4,5,6,7


Unnamed: 0,um,dois,tres
RS,0,0,0
PR,4,5,6


Unnamed: 0,um,dois,tres
RS,0,0,0
PR,4,5,6


### Índices Inteiros

### Iteração

In [45]:
# iterrows() yields (row label, row as a Series) pairs.
for idx, row in df_obj.iterrows():
    print(f"{idx} -- {row}")

RS -- um        0
dois      0
tres      0
quatro    0
Name: RS, dtype: int64
PR -- um        4
dois      5
tres      6
quatro    7
Name: PR, dtype: int64
RJ -- um         8
dois       9
tres      10
quatro    11
Name: RJ, dtype: int64
SP -- um        12
dois      13
tres      14
quatro    15
Name: SP, dtype: int64


In [46]:
# items() yields (column label, column as a Series) pairs.
for idx, col in df_obj.items():
    print(f"{idx} -- {col}")

um -- RS     0
PR     4
RJ     8
SP    12
Name: um, dtype: int64
dois -- RS     0
PR     5
RJ     9
SP    13
Name: dois, dtype: int64
tres -- RS     0
PR     6
RJ    10
SP    14
Name: tres, dtype: int64
quatro -- RS     0
PR     7
RJ    11
SP    15
Name: quatro, dtype: int64


### Aritmética e alinhamento de dados

In [49]:
# ser2 repeats the label 'b' and has 'g' where ser1 has 'c' and 'd'.
ser1 = pd.Series(np.arange(5.), index=list('abcde'))
ser2 = pd.Series(np.arange(5.), index=list('abbge'))

# Addition matches entries by label.
print(ser1, ser2, ser1 + ser2, sep='\n')

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
b    2.0
g    3.0
e    4.0
dtype: float64
a    0.0
b    2.0
b    3.0
c    NaN
d    NaN
e    8.0
g    NaN
dtype: float64


In [50]:
state_labels = ['RS', 'PR', 'RJ', 'SP']

# df_obj1 uses the column label 'tres' twice; df_obj2 has four distinct labels.
df_obj1 = pd.DataFrame(np.arange(16).reshape(4, 4), index=state_labels,
                       columns=['um', 'tres', 'tres', 'quatro'])
df_obj2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=state_labels,
                       columns=['um', 'dois', 'tres', 'quatro'])

# Both operands, then their label-aligned sum.
display(df_obj1, df_obj2, df_obj1 + df_obj2)

Unnamed: 0,um,tres,tres.1,quatro
RS,0,1,2,3
PR,4,5,6,7
RJ,8,9,10,11
SP,12,13,14,15


Unnamed: 0,um,dois,tres,quatro
RS,0,1,2,3
PR,4,5,6,7
RJ,8,9,10,11
SP,12,13,14,15


Unnamed: 0,dois,quatro,tres,tres.1,um
RS,,6,3,4,0
PR,,14,11,12,8
RJ,,22,19,20,16
SP,,30,27,28,24


### Métodos aritméticos com valores para preenchimento

In [55]:
# Plain addition, written with the add() method.
df_obj3 = df_obj1.add(df_obj2)
display(df_obj3)

# Same addition, but a value missing on one side is replaced by fill_value first.
display(df_obj1.add(df_obj2, fill_value=1))

Unnamed: 0,dois,quatro,tres,tres.1,um
RS,,6,3,4,0
PR,,14,11,12,8
RJ,,22,19,20,16
SP,,30,27,28,24


Unnamed: 0,dois,quatro,tres,tres.1,um
RS,2.0,6,3,4,0
PR,6.0,14,11,12,8
RJ,10.0,22,19,20,16
SP,14.0,30,27,28,24


In [59]:
# Two spellings of the element-wise reciprocal: the operator and the rdiv() method.
reciprocal = 1 / df_obj1
display(reciprocal)
df_obj1.rdiv(1)

Unnamed: 0,um,tres,tres.1,quatro
RS,inf,1.0,0.5,0.333333
PR,0.25,0.2,0.166667,0.142857
RJ,0.125,0.111111,0.1,0.090909
SP,0.083333,0.076923,0.071429,0.066667


Unnamed: 0,um,tres,tres.1,quatro
RS,inf,1.0,0.5,0.333333
PR,0.25,0.2,0.166667,0.142857
RJ,0.125,0.111111,0.1,0.090909
SP,0.083333,0.076923,0.071429,0.066667


### Operações entre DataFrame e Series

In [67]:
# 3x4 array; adding one of its rows to the array adds that row to every row.
arr = np.arange(12).reshape(3, 4)
print(arr, arr[1], arr[:, 1], arr + arr[1], sep='\n')

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[4 5 6 7]
[1 5 9]
[[ 4  6  8 10]
 [ 8 10 12 14]
 [12 14 16 18]]


In [71]:
# The frame, its second row, and the frame with that row added to it.
second_row = df_obj1.iloc[1]
display(df_obj1, second_row, df_obj1 + second_row)

Unnamed: 0,um,tres,tres.1,quatro
RS,0,1,2,3
PR,4,5,6,7
RJ,8,9,10,11
SP,12,13,14,15


um        4
tres      5
tres      6
quatro    7
Name: PR, dtype: int64

Unnamed: 0,um,tres,tres.1,quatro
RS,4,6,8,10
PR,8,10,12,14
RJ,12,14,16,18
SP,16,18,20,22


In [77]:
# display(df_obj2+df_obj1.iloc[1])
state_labels = ['RS', 'PR', 'RJ', 'SP']

# The two frames differ in one column label: 'cinco' versus 'dois'.
df_obj1 = pd.DataFrame(np.arange(16).reshape(4, 4), index=state_labels,
                       columns=['um', 'cinco', 'tres', 'quatro'])
df_obj2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=state_labels,
                       columns=['um', 'dois', 'tres', 'quatro'])

# Add the second row of df_obj1 to df_obj2.
display(df_obj2.add(df_obj1.iloc[1]))


Unnamed: 0,cinco,dois,quatro,tres,um
RS,,,10.0,8.0,4.0
PR,,,14.0,12.0,8.0
RJ,,,18.0,16.0,12.0
SP,,,22.0,20.0,16.0


### Aplicação de funções e mapeamento

In [80]:
# Seeded generator: the random values, and every result derived from them
# in later cells, are the same on each run of the notebook.
rng = np.random.default_rng(42)
df_rand = pd.DataFrame(rng.standard_normal((4, 3)), columns=list('abc'),
                       index=['RS', 'SP', 'RJ', 'PR'])

display(df_rand)

# NumPy element-wise functions can be applied to a DataFrame directly.
np.abs(df_rand)

Unnamed: 0,a,b,c
RS,-0.121118,-0.983577,0.235521
SP,-0.210889,-0.666407,0.423299
RJ,0.304726,-1.301619,-0.753064
PR,-0.363746,-0.246874,-1.912553


Unnamed: 0,a,b,c
RS,0.121118,0.983577,0.235521
SP,0.210889,0.666407,0.423299
RJ,0.304726,1.301619,0.753064
PR,0.363746,0.246874,1.912553


In [85]:
# Absolute values, computed by applying np.abs through apply().
df_rand_abs = df_rand.apply(np.abs)


def f(x):
    """Return the spread of x: its maximum minus its minimum."""
    return x.max() - x.min()


# With no axis given, apply() calls f once per column.
df_rand_abs.apply(f)

a    0.242628
b    1.054745
c    1.677032
dtype: float64

In [88]:
# axis=1 (same as 'columns') calls f once per row.
df_rand_abs.apply(f, axis=1)

RS    0.862458
SP    0.455518
RJ    0.996893
PR    1.665679
dtype: float64

In [90]:
def f2(x):
    """Return the minimum and maximum of x as a Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return pd.Series({'min': lowest, 'max': highest})

# f2 returns a Series, so the result has one row per label that f2 produces.
df_rand_abs.apply(f2, axis=0)

Unnamed: 0,a,b,c
min,0.121118,0.246874,0.235521
max,0.363746,1.301619,1.912553


In [92]:
def form(e):
    """Format a number as a string with two decimal places."""
    return '%.2f' % e


# Format every cell by mapping `form` over each column. DataFrame.applymap is
# deprecated since pandas 2.1; Series.map is available in old and new versions.
df_rand_abs.apply(lambda column: column.map(form))

Unnamed: 0,a,b,c
RS,0.12,0.98,0.24
SP,0.21,0.67,0.42
RJ,0.3,1.3,0.75
PR,0.36,0.25,1.91


In [94]:
# Series.map applies `form` to each element of column 'b'.
df_rand_abs.loc[:, 'b'].map(form)

RS    0.98
SP    0.67
RJ    1.30
PR    0.25
Name: b, dtype: object

### Ordenação e classificação

In [98]:
# Original frame, rows sorted by label, then columns sorted by label in descending order.
display(
    df_rand_abs,
    df_rand_abs.sort_index(),
    df_rand_abs.sort_index(axis='columns', ascending=False),
)

Unnamed: 0,a,b,c
RS,0.121118,0.983577,0.235521
SP,0.210889,0.666407,0.423299
RJ,0.304726,1.301619,0.753064
PR,0.363746,0.246874,1.912553


Unnamed: 0,a,b,c
PR,0.363746,0.246874,1.912553
RJ,0.304726,1.301619,0.753064
RS,0.121118,0.983577,0.235521
SP,0.210889,0.666407,0.423299


Unnamed: 0,c,b,a
RS,0.235521,0.983577,0.121118
SP,0.423299,0.666407,0.210889
RJ,0.753064,1.301619,0.304726
PR,1.912553,0.246874,0.363746


In [101]:
# Original frame, rows sorted by the values in column 'b',
# then columns sorted by the values in row 'RS' in descending order.
display(
    df_rand_abs,
    df_rand_abs.sort_values(by='b'),
    df_rand_abs.sort_values(by='RS', axis='columns', ascending=False),
)

Unnamed: 0,a,b,c
RS,0.121118,0.983577,0.235521
SP,0.210889,0.666407,0.423299
RJ,0.304726,1.301619,0.753064
PR,0.363746,0.246874,1.912553


Unnamed: 0,a,b,c
PR,0.363746,0.246874,1.912553
SP,0.210889,0.666407,0.423299
RS,0.121118,0.983577,0.235521
RJ,0.304726,1.301619,0.753064


Unnamed: 0,b,c,a
RS,0.983577,0.235521,0.121118
SP,0.666407,0.423299,0.210889
RJ,1.301619,0.753064,0.304726
PR,0.246874,1.912553,0.363746


### Índices de eixos com rótulos duplicados

In [102]:
# The label 'b' appears twice in this index.
ser2 = pd.Series(np.arange(5.), index=list('abbge'))

# is_unique reports whether every index label occurs only once.
print(ser2.index.is_unique)

False


## Sumarização dos Dados e Estatísticas Descritivas

In [104]:
# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() would add a stray "None" to the output.
df_rand_abs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, RS to PR
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       4 non-null      float64
 1   b       4 non-null      float64
 2   c       4 non-null      float64
dtypes: float64(3)
memory usage: 300.0+ bytes


None

In [107]:
# The frame followed by its per-column descriptive statistics.
display(df_rand_abs, df_rand_abs.describe())

Unnamed: 0,a,b,c
RS,0.121118,0.983577,0.235521
SP,0.210889,0.666407,0.423299
RJ,0.304726,1.301619,0.753064
PR,0.363746,0.246874,1.912553


Unnamed: 0,a,b,c
count,4.0,4.0,4.0
mean,0.25012,0.799619,0.831109
std,0.106573,0.450598,0.75203
min,0.121118,0.246874,0.235521
25%,0.188446,0.561524,0.376355
50%,0.257808,0.824992,0.588181
75%,0.319481,1.063087,1.042936
max,0.363746,1.301619,1.912553


In [115]:
# Sums down each column, then across each row.
display(df_rand_abs.sum(), df_rand_abs.sum(axis='columns'))
# Column sums with missing values skipped (the default), then with skipna=False.
display(df_obj3.sum(), df_obj3.sum(skipna=False))


a    1.000480
b    3.198476
c    3.324437
dtype: float64

RS    1.340216
SP    1.300595
RJ    2.359409
PR    2.523173
dtype: float64

dois       0.0
quatro    72.0
tres      60.0
tres      64.0
um        48.0
dtype: float64

dois       NaN
quatro    72.0
tres      60.0
tres      64.0
um        48.0
dtype: float64

In [120]:
# Label of the maximum in each column, then in each row.
display(df_obj3.idxmax(), df_obj3.idxmax(axis='columns'))

dois      NaN
quatro     SP
tres       SP
tres       SP
um         SP
dtype: object

RS    quatro
PR    quatro
RJ    quatro
SP    quatro
dtype: object

## Valores Únicos, Contadores de Valores e Pertencimento

In [126]:
# The frame, then the distinct values of column 'b' and of row 'RS'.
display(
    df_rand_abs,
    df_rand_abs['b'].unique(),
    df_rand_abs.loc['RS'].unique(),
)



Unnamed: 0,a,b,c
RS,0.121118,0.983577,0.235521
SP,0.210889,0.666407,0.423299
RJ,0.304726,1.301619,0.753064
PR,0.363746,0.246874,1.912553


array([0.98357675, 0.66640687, 1.30161892, 0.24687395])

array([0.12111838, 0.98357675, 0.2355213 ])

In [128]:
# Occurrence counts of each value in column 'b', then in row 'RS'.
display(
    df_rand_abs['b'].value_counts(),
    df_rand_abs.loc['RS'].value_counts(),
)

0.983577    1
0.666407    1
1.301619    1
0.246874    1
Name: b, dtype: int64

0.121118    1
0.983577    1
0.235521    1
Name: RS, dtype: int64

In [130]:
# Count value occurrences per column; values absent from a column get 0.
# Series.value_counts is used because the top-level pd.value_counts function
# is deprecated since pandas 2.1.
display(df_rand_abs.apply(pd.Series.value_counts).fillna(0))

Unnamed: 0,a,b,c
0.121118,1.0,0.0,0.0
0.210889,1.0,0.0,0.0
0.235521,0.0,0.0,1.0
0.246874,0.0,1.0,0.0
0.304726,1.0,0.0,0.0
0.363746,1.0,0.0,0.0
0.423299,0.0,0.0,1.0
0.666407,0.0,1.0,0.0
0.753064,0.0,0.0,1.0
0.983577,0.0,1.0,0.0


## Carregamento e Armazenamento de Dados

In [135]:
# Load the whole CSV file (a comma is read_csv's default separator).
df = pd.read_csv('brasi2000-2021.csv')
df

Unnamed: 0,ID,Rodada,Data,Horário,Dia,Mandante,Visitante,Vencedor,Arena,Mandante Placar,Visitante Placar,Estado Mandante,Estado Visitante,Estado Vencedor
0,1,1,2000-07-29,16h00,Sábado,Fluminense,Bahia,Fluminense,Maracanã,2,0,RJ,BA,RJ
1,2,1,2000-07-29,16h00,Sábado,Vasco,Sport,Sport,São Januário,0,2,RJ,PE,PE
2,3,1,2000-07-29,16h00,Sábado,Vitória,Palmeiras,Vitória,Barradão,4,1,ES,SP,ES
3,4,1,2000-07-30,17h00,Domingo,Botafogo-RJ,Atlético-MG,-,Caio Martins,0,0,RJ,MG,-
4,5,1,2000-07-30,18h30,Domingo,Juventude,Flamengo,-,Alfredo Jaconi,1,1,RS,RJ,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8333,8334,2,2021-06-06,16:00,Domingo,américa-mg,corinthians,corinthians,Independência,0,1,MG,SP,SP
8334,8335,2,2021-06-06,16:00,Domingo,fortaleza,internacional,fortaleza,Castelão,5,1,CE,RS,CE
8335,8336,2,2021-06-06,18:15,Domingo,palmeiras,chapecoense,palmeiras,Allianz Parque,3,1,SP,SC,SP
8336,8337,2,2021-06-06,18:15,Domingo,juventude,athletico-pr,athletico-pr,Alfredo Jaconi,0,3,RS,PR,PR


In [139]:
# nrows limits how many data rows are read from the file.
df = pd.read_csv('brasi2000-2021.csv', nrows=5)
df

Unnamed: 0,ID,Rodada,Data,Horário,Dia,Mandante,Visitante,Vencedor,Arena,Mandante Placar,Visitante Placar,Estado Mandante,Estado Visitante,Estado Vencedor
0,1,1,2000-07-29,16h00,Sábado,Fluminense,Bahia,Fluminense,Maracanã,2,0,RJ,BA,RJ
1,2,1,2000-07-29,16h00,Sábado,Vasco,Sport,Sport,São Januário,0,2,RJ,PE,PE
2,3,1,2000-07-29,16h00,Sábado,Vitória,Palmeiras,Vitória,Barradão,4,1,ES,SP,ES
3,4,1,2000-07-30,17h00,Domingo,Botafogo-RJ,Atlético-MG,-,Caio Martins,0,0,RJ,MG,-
4,5,1,2000-07-30,18h30,Domingo,Juventude,Flamengo,-,Alfredo Jaconi,1,1,RS,RJ,-


In [141]:
# Treat '-' as a missing value, but only in the 'Vencedor' column.
missing_markers = {'Vencedor': '-'}
df = pd.read_csv('brasi2000-2021.csv', nrows=5, na_values=missing_markers)
df

Unnamed: 0,ID,Rodada,Data,Horário,Dia,Mandante,Visitante,Vencedor,Arena,Mandante Placar,Visitante Placar,Estado Mandante,Estado Visitante,Estado Vencedor
0,1,1,2000-07-29,16h00,Sábado,Fluminense,Bahia,Fluminense,Maracanã,2,0,RJ,BA,RJ
1,2,1,2000-07-29,16h00,Sábado,Vasco,Sport,Sport,São Januário,0,2,RJ,PE,PE
2,3,1,2000-07-29,16h00,Sábado,Vitória,Palmeiras,Vitória,Barradão,4,1,ES,SP,ES
3,4,1,2000-07-30,17h00,Domingo,Botafogo-RJ,Atlético-MG,,Caio Martins,0,0,RJ,MG,-
4,5,1,2000-07-30,18h30,Domingo,Juventude,Flamengo,,Alfredo Jaconi,1,1,RS,RJ,-


In [144]:
# With chunksize, read_csv returns a reader that yields DataFrames of up to
# `chunksize` rows. Using it as a context manager guarantees the underlying
# file is closed even if the loop is interrupted.
with pd.read_csv('brasi2000-2021.csv', sep=',', chunksize=100,
                 na_values={'Vencedor': '-'}, nrows=5) as df_chunk:
    for part in df_chunk:
        print(part)

# The reader object itself (not a DataFrame).
df_chunk

   ID  Rodada        Data Horário      Dia     Mandante    Visitante  \
0   1       1  2000-07-29   16h00   Sábado   Fluminense        Bahia   
1   2       1  2000-07-29   16h00   Sábado        Vasco        Sport   
2   3       1  2000-07-29   16h00   Sábado      Vitória    Palmeiras   
3   4       1  2000-07-30   17h00  Domingo  Botafogo-RJ  Atlético-MG   
4   5       1  2000-07-30   18h30  Domingo    Juventude     Flamengo   

     Vencedor           Arena  Mandante Placar  Visitante Placar  \
0  Fluminense        Maracanã                2                 0   
1       Sport    São Januário                0                 2   
2     Vitória        Barradão                4                 1   
3         NaN    Caio Martins                0                 0   
4         NaN  Alfredo Jaconi                1                 1   

  Estado Mandante Estado Visitante Estado Vencedor  
0              RJ               BA              RJ  
1              RJ               PE              PE  

<pandas.io.parsers.readers.TextFileReader at 0x7f86d2516fa0>

In [138]:
# Write the frame to disk, then show the file's contents. The file is read
# back with plain Python rather than `!cat`, which needs a POSIX shell.
df.to_csv('saida.csv')
with open('saida.csv', encoding='utf-8') as saida:
    # end='' avoids adding a blank line after the file's own final newline.
    print(saida.read(), end='')

,ID,Rodada,Data,Horário,Dia,Mandante,Visitante,Vencedor,Arena,Mandante Placar,Visitante Placar,Estado Mandante,Estado Visitante,Estado Vencedor
0,1,1,2000-07-29,16h00,Sábado,Fluminense,Bahia,Fluminense,Maracanã,2,0,RJ,BA,RJ
1,2,1,2000-07-29,16h00,Sábado,Vasco,Sport,Sport,São Januário,0,2,RJ,PE,PE
2,3,1,2000-07-29,16h00,Sábado,Vitória,Palmeiras,Vitória,Barradão,4,1,ES,SP,ES
3,4,1,2000-07-30,17h00,Domingo,Botafogo-RJ,Atlético-MG,-,Caio Martins,0,0,RJ,MG,-
4,5,1,2000-07-30,18h30,Domingo,Juventude,Flamengo,-,Alfredo Jaconi,1,1,RS,RJ,-


## 