# Data Wrangling II


In [53]:
import pandas as pd
import numpy as np

### Stack e Unstack


In [54]:
import pandas as pd

# Build a small frame, then promote three of its columns to a
# hierarchical (MultiIndex) row index, leaving 'Valor' as the only column.
data = dict(
    Grupo=['A', 'A', 'B', 'B'],
    Categoria=['X', 'Y', 'X', 'Y'],
    Ano=[2019, 2020, 2019, 2020],
    Valor=[10, 20, 30, 40],
)
df = pd.DataFrame(data).set_index(['Grupo', 'Categoria', 'Ano'])
df


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Valor
Grupo,Categoria,Ano,Unnamed: 3_level_1
A,X,2019,10
A,Y,2020,20
B,X,2019,30
B,Y,2020,40


In [55]:
# stack() moves the column labels into a new innermost index level,
# so the single-column frame becomes a Series (level=-1 is the default).
df_stacked = df.stack(level=-1)
df_stacked

Grupo  Categoria  Ano        
A      X          2019  Valor    10
       Y          2020  Valor    20
B      X          2019  Valor    30
       Y          2020  Valor    40
dtype: int64

In [56]:
# unstack() is the inverse operation: the innermost index level
# (level=-1, the default) is pivoted back into column labels.
df_unstacked = df_stacked.unstack(level=-1)
df_unstacked

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Valor
Grupo,Categoria,Ano,Unnamed: 3_level_1
A,X,2019,10
A,Y,2020,20
B,X,2019,30
B,Y,2020,40


### Explode

In [57]:
# Each cell of 'col2' holds a Python list; this is the input for explode().
data = dict(
    col1=['foo', 'bar', 'baz'],
    col2=[[1, 2], [3, 4, 5], [6, 7]],
)
dfl = pd.DataFrame(data)
dfl

Unnamed: 0,col1,col2
0,foo,"[1, 2]"
1,bar,"[3, 4, 5]"
2,baz,"[6, 7]"


In [59]:
# One output row per list element; the original row label is repeated.
dfl.explode(column='col2')

Unnamed: 0,col1,col2
0,foo,1
0,foo,2
1,bar,3
1,bar,4
1,bar,5
2,baz,6
2,baz,7


### Pivot

In [49]:
# Turn the index levels back into ordinary columns (drop=False keeps them)
# so they can be used as pivot keys below.
df_ri = df.reset_index(drop=False)
df_ri

Unnamed: 0,Grupo,Categoria,Ano,Valor
0,A,X,2019,10
1,A,Y,2020,20
2,B,X,2019,30
3,B,Y,2020,40


In [50]:
# Reshape long -> wide: one row per 'Grupo', one column per 'Ano',
# cells filled from 'Valor'. pivot() does no aggregation.
pivot_spec = {'index': 'Grupo', 'columns': 'Ano', 'values': 'Valor'}
df_ri.pivot(**pivot_spec)

Ano,2019,2020
Grupo,Unnamed: 1_level_1,Unnamed: 2_level_1
A,10,20
B,30,40


In [63]:
# Sample frame whose (A, B) pairs repeat, so it needs an aggregating pivot.
data = dict(
    A=['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
    B=['one', 'one', 'two', 'two', 'one', 'one'],
    C=[1, 2, 3, 4, 5, 6],
    D=[7, 8, 9, 10, 11, 12],
)
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,foo,one,1,7
1,foo,one,2,8
2,foo,two,3,9
3,bar,two,4,10
4,bar,one,5,11
5,bar,one,6,12


In [65]:
# Unlike pivot(), pivot_table() aggregates duplicated (A, B) pairs;
# here the 'C' values of each pair are summed.
df.pivot_table(values='C', index='A', columns='B', aggfunc='sum')

B,one,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,11,4
foo,3,3


### Tipos de dados


In [38]:
# Small frame with one text, one integer and one floating-point column.
data = dict(
    nome=['Joao', 'Maria', 'Andre', 'Carlos'],
    idade=[12, 40, 22, 35],
    tamanho=[1.51, 1.51, 1.81, 1.74],
)
df = pd.DataFrame(data)
df

Unnamed: 0,nome,idade,tamanho
0,Joao,12,1.51
1,Maria,40,1.51
2,Andre,22,1.81
3,Carlos,35,1.74


In [40]:
# Show the dtype pandas inferred for each column.
df.dtypes

nome        object
idade        int64
tamanho    float64
dtype: object

In [41]:
# Convert the integer 'idade' column to floating point
# ('float64' is what the builtin `float` maps to in pandas).
df['idade'] = df['idade'].astype('float64')
df

Unnamed: 0,nome,idade,tamanho
0,Joao,12.0,1.51
1,Maria,40.0,1.51
2,Andre,22.0,1.81
3,Carlos,35.0,1.74


### Missing values
#### Identificação

In [61]:
# Two numeric columns, each with one missing value (NaN).
data = dict(
    A=[1, 2, np.nan, 4],
    B=[5, np.nan, 7, 8],
)
df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,
2,,7.0
3,4.0,8.0


In [14]:
# Boolean mask with the same shape as df: True where a value is missing.
df.isna()

Unnamed: 0,A,B
0,False,False
1,False,True
2,True,False
3,False,False


In [17]:
# Keep only the rows where 'A' is present; NaNs in other columns are untouched.
df[df['A'].notna()]

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,
3,4.0,8.0


In [62]:
# Print a summary of the frame, including the non-null count and dtype per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      float64
 1   B       3 non-null      float64
dtypes: float64(2)
memory usage: 192.0 bytes


#### Exclusão

In [23]:
# Same NaN pattern as before plus a complete column 'C',
# used by the drop/fill examples that follow.
data = dict(
    A=[1, 2, np.nan, 4],
    B=[5, np.nan, 7, 8],
    C=[5, 3, 2, 1],
)
df2 = pd.DataFrame(data)
df2

Unnamed: 0,A,B,C
0,1.0,5.0,5
1,2.0,,3
2,,7.0,2
3,4.0,8.0,1


In [27]:
# Drop every row that has at least one missing value (these are the defaults).
df2.dropna(axis=0, how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,5
3,4.0,8.0,1


In [28]:
# Drop rows only when 'A' is missing; NaNs in other columns are kept.
df2.dropna(subset=['A'])

Unnamed: 0,A,B,C
0,1.0,5.0,5
1,2.0,,3
3,4.0,8.0,1


In [29]:
# Drop every column that contains a missing value instead of dropping rows.
df2.dropna(axis='columns')

Unnamed: 0,C
0,5
1,3
2,2
3,1


#### Preenchimento

In [30]:
# Replace every missing value with the same constant.
df2.fillna(value=0)

Unnamed: 0,A,B,C
0,1.0,5.0,5
1,2.0,0.0,3
2,0.0,7.0,2
3,4.0,8.0,1


In [32]:
# A dict maps column -> fill value; columns not listed (here 'B') keep their NaNs.
df2.fillna(value={'A': -1})

Unnamed: 0,A,B,C
0,1.0,5.0,5
1,2.0,,3
2,-1.0,7.0,2
3,4.0,8.0,1


In [34]:
# Fill each column's missing values with that column's own mean
# (mean() returns a Series indexed by column name, which fillna aligns on).
column_means = df2.mean()
df2.fillna(column_means)

Unnamed: 0,A,B,C
0,1.0,5.0,5
1,2.0,6.666667,3
2,2.333333,7.0,2
3,4.0,8.0,1


### Filtros, loc e iloc
Revisão da aula 10