# Lección 18 - Data Wrangling

In [23]:
import pandas as pd
import numpy as np

## Múltiples Índices

In [24]:
# Series of 9 uniform random draws labelled by three parallel arrays; the arrays
# are combined position by position into a three-level MultiIndex.
# NOTE(review): no random seed is set in the visible cells, so the values differ on every run.
data = pd.Series(
    data=np.random.uniform(size=9),
    index=pd.MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "c", "c", "d", "d"],
            [1, 2, 3, 1, 3, 1, 2, 2, 3],
            [
                "Platano",
                "Lechuga",
                "Platano",
                "Lechuga",
                "Platano",
                "Lechuga",
                "Platano",
                "Lechuga",
                "Platano",
            ],
        ]
    ),
)
data

a  1  Platano    0.614141
   2  Lechuga    0.040765
   3  Platano    0.152140
b  1  Lechuga    0.669398
   3  Platano    0.259809
c  1  Lechuga    0.027630
   2  Platano    0.057492
d  2  Lechuga    0.425117
   3  Platano    0.084793
dtype: float64

In [25]:
# Show the index of the Series: built from several label arrays, it is a
# MultiIndex whose entries are tuples with one label per level.
data.index

MultiIndex([('a', 1, 'Platano'),
            ('a', 2, 'Lechuga'),
            ('a', 3, 'Platano'),
            ('b', 1, 'Lechuga'),
            ('b', 3, 'Platano'),
            ('c', 1, 'Lechuga'),
            ('c', 2, 'Platano'),
            ('d', 2, 'Lechuga'),
            ('d', 3, 'Platano')],
           )

In [26]:
# Rebuild `data` with only two index levels (letter, number); the cells that
# follow select from this two-level Series.
# NOTE(review): no random seed is set in the visible cells, so the values differ on every run.
data = pd.Series(
    data=np.random.uniform(size=9),
    index=pd.MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "c", "c", "d", "d"],
            [1, 2, 3, 1, 3, 1, 2, 2, 3],
        ]
    ),
)
data

a  1    0.973220
   2    0.367756
   3    0.797595
b  1    0.786804
   3    0.673442
c  1    0.015288
   2    0.079868
d  2    0.284816
   3    0.119613
dtype: float64

In [27]:
# Partial indexing: an outer-level label returns the entries stored under it,
# indexed by the remaining inner level.
data["b"]

1    0.786804
3    0.673442
dtype: float64

In [28]:
# Label slice on the outer level; both endpoints ("b" and "c") are included.
data["b":"c"]

b  1    0.786804
   3    0.673442
c  1    0.015288
   2    0.079868
dtype: float64

Seleccionar solamente las filas cuyo índice de primer nivel es *b* o *d*

In [29]:
# A list passed to .loc picks several outer-level labels at once (here "b" and "d").
data.loc[["b","d"]]

b  1    0.786804
   3    0.673442
d  2    0.284816
   3    0.119613
dtype: float64

Seleccionamos todos los valores que tengan 2 en el segundo nivel del índice

In [30]:
# First indexer (:) keeps every outer label, second selects inner label 2;
# the result is indexed by the outer level only.
data.loc[:,2]

a    0.367756
c    0.079868
d    0.284816
dtype: float64

**unstack()** 

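Si tienes datos con múltiples índices, `unstack()` mueve el nivel más interno del índice a las columnas, de modo que puedas ver los datos como una tabla de intersecciones; las combinaciones que no existen en los datos quedan como valores faltantes (`NaN`).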

In [31]:
# Pivot the inner index level into columns; outer/inner combinations that are
# absent from the Series show up as missing values in the table.
data.unstack()

Unnamed: 0,1,2,3
a,0.97322,0.367756,0.797595
b,0.786804,,0.673442
c,0.015288,0.079868,
d,,0.284816,0.119613


La operación inversa de `unstack()` es `stack()`

In [32]:
# Round trip: stack() moves the columns back into an inner index level; in the
# recorded output this reproduces the original nine-entry Series.
data.unstack().stack()

a  1    0.973220
   2    0.367756
   3    0.797595
b  1    0.786804
   3    0.673442
c  1    0.015288
   2    0.079868
d  2    0.284816
   3    0.119613
dtype: float64

In [33]:
# 4x3 table holding the integers 0..11, with a two-level index on the rows
# (letter, number) and a two-level index on the columns (state, color).
# The levels are left unnamed at this point.
frame = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=pd.MultiIndex.from_arrays(
        [
            ["a", "a", "b", "b"],
            [1, 2, 1, 2],
        ]
    ),
    columns=pd.MultiIndex.from_arrays(
        [
            ["Ohio", "Ohio", "Colorado"],
            ["Green", "Red", "Green"],
        ]
    ),
)
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


Agregar nombres a los niveles del índice

In [34]:
# Name the two row-index levels. The assignment acts on the existing index,
# so `frame` itself is modified and later cells see the names.
frame.index.names = ["key1","key2"]
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


Agregar nombres a los niveles de las columnas

In [35]:
# Name the two column levels, again modifying `frame` in place.
frame.columns.names = ["state","color"]
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [36]:
# Number of levels in the row index.
frame.index.nlevels

2

In [37]:
# Partial column indexing: the outer column label "Ohio" returns its
# sub-columns, keyed by the inner "color" level.
frame["Ohio"]

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [38]:
# Build a standalone MultiIndex from parallel arrays and name its levels.
# The result is only displayed here; it is not assigned to a variable.
pd.MultiIndex.from_arrays(
    [["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], names=["state", "color"]
)

MultiIndex([(    'Ohio', 'Green'),
            (    'Ohio',   'Red'),
            ('Colorado', 'Green')],
           names=['state', 'color'])

## Reordenamiento y Ordenamiento de Niveles

In [39]:
# Swap the order of the two row-index levels. The result is not assigned, so
# `frame` keeps its original level order; the rows themselves are not reordered.
frame.swaplevel("key1","key2")

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [40]:
# Sort the rows by the index level at position 1 (key2), leaving the level order as is.
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [41]:
# Swap the two index levels, then sort by the new outer level (key2) so that
# rows sharing the same key2 value end up next to each other.
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


## Summary Statistics by Level

Agrupamiento por nivel:
Agrupa los datos según los valores de un nivel del índice y calcula una estadística (aquí, la suma) para cada grupo.

In [42]:
# Aggregate by the "key2" index level: rows sharing the same key2 value are
# summed column by column.
frame.groupby(level="key2").sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


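Para agrupar por un nivel de las columnas (aquí *color*) en lugar de por las filas, la agrupación se hace a lo largo de **axis = 1**, es decir, sobre las columnas.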

In [44]:
# Sum the columns that share the same "color" label, keeping the row index.
# Column-wise grouping via groupby(axis=1) is deprecated since pandas 2.1 and
# removed in pandas 3.0, so group the transposed frame by its "color" level
# (now on the rows) and transpose the result back; the output is the same.
frame.T.groupby(level="color").sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10
