## [ Reshaping and Pivoting in Pandas ]
These operations change the structure (shape) of the data, not its content.

In [1]:
import numpy as np 
import pandas as pd 

## [ Reshaping with Hierarchical Indexing ]
- Hierarchical indexing provides a consistent way to rearrange data in a DataFrame.
- There are two primary actions:
    - `stack`: "rotates" (pivots) the columns of the data into the rows
    - `unstack`: pivots the rows into the columns

In [11]:
# illustration
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(["Ohio", "Colorado"], name="state"),
                    columns=pd.Index(["one", "two", "three"],
                    name="number"))
print(data)

# using the stack method on this data pivots the columns into the rows, producing a Series
result = data.stack() # level="number"
print(result)

number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64


In [6]:
# unstack() turns a hierarchically indexed Series back into a DataFrame;
# level=-1 (the default) moves the innermost index level into the columns.
result.unstack(level=-1)

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [10]:
# unstack() (like stack()) works on the innermost level unless told otherwise;
# a different level can be selected either by position or by name.

print(result.unstack(0))        # outer level chosen by position
print(result.unstack("state"))  # the same level chosen by name

state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5
state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5


Unstacking might introduce missing data if not all of the values in the level are found in each subgroup.

In [13]:
# Two nullable-integer Series whose labels only partly overlap ("c" and "d").
s1 = pd.Series([0, 1, 2, 3], index=list("abcd"), dtype="Int64")
s2 = pd.Series([4, 5, 6], index=list("cde"), dtype="Int64")

# Concatenate them under the outer keys "one" and "two" (dict keys become
# the outer index level).
data2 = pd.concat({"one": s1, "two": s2})
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: Int64

In [14]:
# Labels absent from a subgroup ("e" under "one"; "a" and "b" under "two")
# show up as missing values in the unstacked frame.
# Stacking filters out missing data by default, so the operation is more easily invertible.
data2.unstack(level=-1)

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2,3,
two,,,4,5,6.0


In [15]:
# Round trip: unstack the inner level into columns, then stack it back.
# The default stack() drops the missing values introduced by unstack().
(
    data2
    .unstack()
    .stack()
)

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: Int64

In [18]:
# Same round trip, but with the new stack implementation, which keeps the
# missing values (comparable to dropna=False in the legacy implementation).
(
    data2
    .unstack()
    .stack(future_stack=True)
)

one  a       0
     b       1
     c       2
     d       3
     e    <NA>
two  a    <NA>
     b    <NA>
     c       4
     d       5
     e       6
dtype: Int64

In [21]:
# Two-column frame over the (state, number) index; the column axis is named "side".
# When a level of a DataFrame is unstacked, it becomes the lowest level of the
# resulting columns.
df = pd.DataFrame(
    {"left": result, "right": result.add(5)},
    columns=pd.Index(["left", "right"], name="side"),
)
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [22]:
# Move the "state" index level into the columns; it lands beneath "side".
df.unstack("state")

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [24]:
# stack(), like unstack(), accepts the name of the level to pivot:
# move "state" into the columns, then stack "side" back into the rows.
(
    df
    .unstack(level="state")
    .stack(level="side", future_stack=True)
)

Unnamed: 0_level_0,state,Ohio,Colorado
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,0,3
one,right,5,8
two,left,1,4
two,right,6,9
three,left,2,5
three,right,7,10
