# Pandas

Origem do nome: Panel Data. A principal aplicação é o uso de dados multidimensionais e heterogêneos, enquanto o NumPy está mais associado a dados homogêneos.
As estruturas fundamentais são "Series" e "DataFrames".

**Series**: array unidimensional com uma sequência de valores e um array associado de rótulos (labels) dos dados, chamado índice. Na exibição, os índices aparecem à esquerda e os valores à direita.

**Dataframe**: tabela de dados retangular com uma coleção ordenada de colunas, cada uma das quais pode conter um tipo de valor diferente. Possui índice para linha e para coluna. Pode ser visto como um dicionário (estrutura Python) de Series.

## Series

In [1]:
import numpy as np
import pandas as pd


In [2]:
s1 = pd.Series([1, 2, 3, 0, -8, -23])
s1

0     1
1     2
2     3
3     0
4    -8
5   -23
dtype: int64

In [3]:
s1.values

array([  1,   2,   3,   0,  -8, -23])

In [4]:
s1.index

RangeIndex(start=0, stop=6, step=1)

In [5]:
indexarray = ['a', 'f', 'c', 'd']

In [6]:
s2 = pd.Series([1.2, 2, 3, -0.5], index=indexarray)
s2.index, s2

(Index(['a', 'f', 'c', 'd'], dtype='object'),
 a    1.2
 f    2.0
 c    3.0
 d   -0.5
 dtype: float64)

In [7]:
s2[s2>=2]

f    2.0
c    3.0
dtype: float64

In [8]:
s2*2

a    2.4
f    4.0
c    6.0
d   -1.0
dtype: float64

In [9]:
s2+s2

a    2.4
f    4.0
c    6.0
d   -1.0
dtype: float64

In [10]:
s2.isnull()

a    False
f    False
c    False
d    False
dtype: bool

## Dataframe

In [11]:
dados = {'estado': ['SP', 'MG', 'RJ', 'SP', 'PR', 'RS'], 'ano': [2016, 1994, 1517, 1993, 2016, 2008], 'population': [47.3, 54.9, 12.3, 29.7, 32.2, 17.1]} # isto é um dict
df1 = pd.DataFrame(dados)
df1

Unnamed: 0,estado,ano,population
0,SP,2016,47.3
1,MG,1994,54.9
2,RJ,1517,12.3
3,SP,1993,29.7
4,PR,2016,32.2
5,RS,2008,17.1


In [12]:
df1.head(2)

Unnamed: 0,estado,ano,population
0,SP,2016,47.3
1,MG,1994,54.9


In [13]:
df1.tail(2)

Unnamed: 0,estado,ano,population
4,PR,2016,32.2
5,RS,2008,17.1


In [14]:
df1.sample(2) # retorna uma amostra aleatória

Unnamed: 0,estado,ano,population
0,SP,2016,47.3
3,SP,1993,29.7


In [15]:
df2 = pd.DataFrame(dados, columns=['ano', 'estado', 'population'])
df2

Unnamed: 0,ano,estado,population
0,2016,SP,47.3
1,1994,MG,54.9
2,1517,RJ,12.3
3,1993,SP,29.7
4,2016,PR,32.2
5,2008,RS,17.1


In [16]:
df2.ano

0    2016
1    1994
2    1517
3    1993
4    2016
5    2008
Name: ano, dtype: int64

In [17]:
df2.tail(3).estado

3    SP
4    PR
5    RS
Name: estado, dtype: object

In [18]:
df2.head(4)['population']

0    47.3
1    54.9
2    12.3
3    29.7
Name: population, dtype: float64

In [19]:
df2.dtypes

ano             int64
estado         object
population    float64
dtype: object

In [20]:
df2['estimativa'] = df2.population / df2.ano * 1000
df2

Unnamed: 0,ano,estado,population,estimativa
0,2016,SP,47.3,23.462302
1,1994,MG,54.9,27.532598
2,1517,RJ,12.3,8.108108
3,1993,SP,29.7,14.902158
4,2016,PR,32.2,15.972222
5,2008,RS,17.1,8.515936


In [21]:
df2.estimativa = np.arange(6)
df2

Unnamed: 0,ano,estado,population,estimativa
0,2016,SP,47.3,0
1,1994,MG,54.9,1
2,1517,RJ,12.3,2
3,1993,SP,29.7,3
4,2016,PR,32.2,4
5,2008,RS,17.1,5


In [22]:
df3 = df2
df3

Unnamed: 0,ano,estado,population,estimativa
0,2016,SP,47.3,0
1,1994,MG,54.9,1
2,1517,RJ,12.3,2
3,1993,SP,29.7,3
4,2016,PR,32.2,4
5,2008,RS,17.1,5


In [23]:
df3 = df2['estado']
df3

0    SP
1    MG
2    RJ
3    SP
4    PR
5    RS
Name: estado, dtype: object

In [24]:
df2['Não Acolá'] = df2.estado != 'PR'
df2

Unnamed: 0,ano,estado,population,estimativa,Não Acolá
0,2016,SP,47.3,0,True
1,1994,MG,54.9,1,True
2,1517,RJ,12.3,2,True
3,1993,SP,29.7,3,True
4,2016,PR,32.2,4,False
5,2008,RS,17.1,5,True


## Exclusão de coluna

In [25]:
del df2['Não Acolá']
df2.estimativa = (df2.population / df2.ano) * 1000
df2

Unnamed: 0,ano,estado,population,estimativa
0,2016,SP,47.3,23.462302
1,1994,MG,54.9,27.532598
2,1517,RJ,12.3,8.108108
3,1993,SP,29.7,14.902158
4,2016,PR,32.2,15.972222
5,2008,RS,17.1,8.515936


In [26]:
df2.shape

(6, 4)

In [27]:
df2.shape[0] # informa a quantidade de registros

6

In [28]:
df2.index

RangeIndex(start=0, stop=6, step=1)

In [29]:
df2.columns

Index(['ano', 'estado', 'population', 'estimativa'], dtype='object')

In [30]:
df2.count() # conta valores por coluna, excluindo os nulos

ano           6
estado        6
population    6
estimativa    6
dtype: int64

In [31]:
df3 = df2.copy()
df3.columns=['Ano', 'Estado', 'Populacao', 'Estimativva']
df3

Unnamed: 0,Ano,Estado,Populacao,Estimativva
0,2016,SP,47.3,23.462302
1,1994,MG,54.9,27.532598
2,1517,RJ,12.3,8.108108
3,1993,SP,29.7,14.902158
4,2016,PR,32.2,15.972222
5,2008,RS,17.1,8.515936


In [32]:
# NOTE: the describe() summary is computed but not displayed -- only the
# last expression of the cell (df2) is echoed in the output.
df3.describe(include='all')
# Add a new column 'Ano' to df2 holding each 'ano' value plus 2.
df2['Ano'] = df2.ano+2
df2

Unnamed: 0,ano,estado,population,estimativa,Ano
0,2016,SP,47.3,23.462302,2018
1,1994,MG,54.9,27.532598,1996
2,1517,RJ,12.3,8.108108,1519
3,1993,SP,29.7,14.902158,1995
4,2016,PR,32.2,15.972222,2018
5,2008,RS,17.1,8.515936,2010


In [33]:
df4 = df2[df2['Ano'] > 2000]
df4

Unnamed: 0,ano,estado,population,estimativa,Ano
0,2016,SP,47.3,23.462302,2018
4,2016,PR,32.2,15.972222,2018
5,2008,RS,17.1,8.515936,2010


In [34]:
df4.drop('Ano', axis=1)

Unnamed: 0,ano,estado,population,estimativa
0,2016,SP,47.3,23.462302
4,2016,PR,32.2,15.972222
5,2008,RS,17.1,8.515936


In [35]:
df2.drop('Ano', axis='columns', inplace=True)

In [36]:
df2

Unnamed: 0,ano,estado,population,estimativa
0,2016,SP,47.3,23.462302
1,1994,MG,54.9,27.532598
2,1517,RJ,12.3,8.108108
3,1993,SP,29.7,14.902158
4,2016,PR,32.2,15.972222
5,2008,RS,17.1,8.515936


In [37]:
df2.drop([0,1]) # returns a new frame without the rows labelled 0 and 1; df2 itself is left unchanged

Unnamed: 0,ano,estado,population,estimativa
2,1517,RJ,12.3,8.108108
3,1993,SP,29.7,14.902158
4,2016,PR,32.2,15.972222
5,2008,RS,17.1,8.515936


In [38]:
df2.loc[0, 'estado'] = 'RS'
df2

Unnamed: 0,ano,estado,population,estimativa
0,2016,RS,47.3,23.462302
1,1994,MG,54.9,27.532598
2,1517,RJ,12.3,8.108108
3,1993,SP,29.7,14.902158
4,2016,PR,32.2,15.972222
5,2008,RS,17.1,8.515936


In [50]:
x1 = np.random.randint(10, size=6)  # One-dimensional array
x2 = np.random.randint(10, size=(3, 4))  # Two-dimensional array
x3 = np.random.randint(10, size=(3, 4, 5))  # Three-dimensional array

np.ndim(x1), np.ndim(x2), np.shape(x2), np.shape(x3), np.size(x2), x3

(1,
 2,
 (3, 4),
 (3, 4, 5),
 12,
 array([[[6, 1, 4, 0, 2],
         [7, 5, 2, 0, 8],
         [3, 1, 0, 7, 2],
         [4, 1, 8, 3, 8]],
 
        [[7, 2, 7, 7, 4],
         [4, 1, 5, 9, 3],
         [1, 7, 8, 4, 8],
         [2, 4, 3, 7, 7]],
 
        [[3, 7, 3, 2, 9],
         [0, 6, 6, 9, 3],
         [9, 5, 8, 3, 2],
         [4, 3, 2, 2, 0]]]))

In [67]:
x3, x3[::-1, ::-1, ::-1]

(array([[[6, 1, 4, 0, 2],
         [7, 5, 2, 0, 8],
         [3, 1, 0, 7, 2],
         [4, 1, 8, 3, 8]],
 
        [[7, 2, 7, 7, 4],
         [4, 1, 5, 9, 3],
         [1, 7, 8, 4, 8],
         [2, 4, 3, 7, 7]],
 
        [[3, 7, 3, 2, 9],
         [0, 6, 6, 9, 3],
         [9, 5, 8, 3, 2],
         [4, 3, 2, 2, 0]]]),
 array([[[0, 2, 2, 3, 4],
         [2, 3, 8, 5, 9],
         [3, 9, 6, 6, 0],
         [9, 2, 3, 7, 3]],
 
        [[7, 7, 3, 4, 2],
         [8, 4, 8, 7, 1],
         [3, 9, 5, 1, 4],
         [4, 7, 7, 2, 7]],
 
        [[8, 3, 8, 1, 4],
         [2, 7, 0, 1, 3],
         [8, 0, 2, 5, 7],
         [2, 0, 4, 1, 6]]]))

In [73]:
def compute_reciprocals(values):
    """Return a new NumPy array holding ``1.0 / v`` for each element of *values*.

    The reciprocals are computed one element at a time in an explicit
    Python-level loop rather than with a single vectorized expression.

    Parameters
    ----------
    values
        Indexable collection of numbers that supports ``len()`` and
        integer indexing from ``0`` to ``len(values) - 1``.

    Returns
    -------
    numpy.ndarray
        Array with ``len(values)`` entries, in the same order as the input.
    """
    count = len(values)
    # np.empty leaves the buffer uninitialized; every slot is assigned below.
    reciprocals = np.empty(count)
    position = 0
    while position < count:
        reciprocals[position] = 1.0 / values[position]
        position += 1
    return reciprocals
        
values = np.random.randint(1, 10, size=5)
compute_reciprocals(values), values

(array([0.25      , 0.11111111, 0.125     , 0.5       , 0.33333333]),
 array([4, 9, 8, 2, 3]))

In [78]:
big_array = np.random.randint(1, 100, size=1000000)
%timeit compute_reciprocals(big_array)

2.54 s ± 512 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [77]:
%timeit (1.0 / big_array)

3.15 ms ± 39 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [83]:
%timeit np.sum(big_array)
%timeit sum(big_array)

529 µs ± 38.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
74 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
