_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 8 - Data Wrangling: Join, Combine, and Reshape

In [1]:
import numpy as np
import pandas as pd

## Part 3 - Reshaping and Pivoting

In [2]:
# Small 2x3 demo frame: rows are labelled by 'state', columns by 'number'.
state_index = pd.Index(['Ohio', 'Colorado'], name='state')
number_columns = pd.Index(['one', 'two', 'three'], name='number')
values = np.arange(6).reshape((2, 3))

data = pd.DataFrame(values, index=state_index, columns=number_columns)
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


### Reshaping with Hierarchical Indexing

In [3]:
# DataFrame.stack() moves the column labels into the innermost row level,
# producing a hierarchically indexed Series.
result = data.stack()
print(result)

# Series.unstack() goes the other way: hierarchically indexed Series -> DataFrame.
# With no argument the innermost level ('number') is unstacked; a level can
# also be selected explicitly by name or by position.
print(result.unstack())
for level in ('number', 0, 'state'):
    print(result.unstack(level))

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5
state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5


In [4]:
# Two Series with partially overlapping labels, concatenated under the
# outer keys 'one' and 'two' (result has a two-level index).
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])

print(data2)
# unstacking could introduce missing values
# (labels that exist under only one of the keys become NaN)
df = data2.unstack()
print(df)

# Stacked result WITHOUT the missing values.
# The legacy stack() implementation filters NaN out by default, but the newer
# implementation (`future_stack=True`, pandas >= 2.1) keeps them, so drop them
# explicitly to get the same result on either implementation.
stacked_dropped = df.stack().dropna()
print(stacked_dropped)

# Stacked result WITH the missing values kept.
# `stack(dropna=False)` is the legacy spelling; the `dropna=` keyword is
# rejected by the newer implementation. Reindexing onto the full
# (row label x column label) product yields the same Series either way.
full_index = pd.MultiIndex.from_product([df.index, df.columns])
stacked_full = df.stack().reindex(full_index)
print(stacked_full)

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
       a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64


In [5]:
# unstack on DataFrame

# Two columns derived from the stacked Series; the column axis is named 'side'.
side_columns = pd.Index(['left', 'right'], name='side')
df = pd.DataFrame({'left': result, 'right': result + 5}, columns=side_columns)

print(df)

# Unstacking a row level: that level ends up as the lowest column level.
print('\n-- unstack state')
side_state = df.unstack(level='state')
print(side_state)
print('-- unstack number')
print(df.unstack(level='number'))

# Stacking: name the column level that should move back to the rows.
print('\n-- stack side')
print(side_state.stack(level='side'))
print('-- stack state')
print(side_state.stack(level='state'))

side             left  right
state    number             
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10

-- unstack state
side   left          right         
state  Ohio Colorado  Ohio Colorado
number                             
one       0        3     5        8
two       1        4     6        9
three     2        5     7       10
-- unstack number
side     left           right          
number    one two three   one two three
state                                  
Ohio        0   1     2     5   6     7
Colorado    3   4     5     8   9    10

-- stack side
state         Colorado  Ohio
number side                 
one    left          3     0
       right         8     5
two    left          4     1
       right         9     6
three  left          5     2
       right        10     7
-- stack state
side             left  right
number state             

### Pivoting “Long” to “Wide” Format

In [6]:
# Load the macro-economic data set and show its first row.
data = pd.read_csv('examples/macrodata.csv')
print(data.head(1))

# Build a 'date' index from the year/quarter columns, keep three series,
# and name the column axis 'item'.
# NOTE(review): pandas 2.2 deprecates passing year=/quarter= to the PeriodIndex
# constructor in favour of PeriodIndex.from_fields — confirm the target pandas
# version before switching.
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data = data.reindex(columns=columns)
data.index = periods.to_timestamp('D', 'end')

# stack() yields one row per (date, item); reset_index() turns both levels into
# ordinary columns and the unnamed value column (0) is renamed to 'value'.
ldata = data.stack().reset_index().rename(columns={0: 'value'})

print('\n-- long format for multiple time series')
print(ldata[:5])

print('\n-- one column per distinct item by pivoting')
# pivot() arguments are keyword-only since pandas 2.0; the keyword form also
# works on older versions.
pivoted = ldata.pivot(index='date', columns='item', values='value')
print(pivoted.head())

     year  quarter   realgdp  realcons  realinv  realgovt  realdpi    cpi  \
0  1959.0      1.0  2710.349    1707.4  286.898   470.045   1886.9  28.98   

      m1  tbilrate  unemp      pop  infl  realint  
0  139.7      2.82    5.8  177.146   0.0      0.0  

-- long format for multiple time series
        date     item     value
0 1959-03-31  realgdp  2710.349
1 1959-03-31     infl     0.000
2 1959-03-31    unemp     5.800
3 1959-06-30  realgdp  2778.801
4 1959-06-30     infl     2.340

-- one column per distinct item by pivoting
item        infl   realgdp  unemp
date                             
1959-03-31  0.00  2710.349    5.8
1959-06-30  2.34  2778.801    5.1
1959-09-30  2.74  2775.488    5.3
1959-12-31  0.27  2785.204    5.6
1960-03-31  2.31  2847.699    5.2


In [7]:
# if interested in two values ...
# (adds a second, random value column to the long-format frame)
ldata['value2'] = np.random.randn(len(ldata))
print(ldata.head())

# DataFrame with hierarchical columns: with `values` omitted, every remaining
# column is pivoted, giving an outer column level of 'value' / 'value2'.
# pivot() arguments are keyword-only since pandas 2.0; the keyword form also
# works on older versions.
pivoted = ldata.pivot(index='date', columns='item')
print(pivoted.head())

# Selecting an outer column label returns the per-item frame for that value.
print(pivoted['value'][:5])
print(pivoted['value2'][:5])

# pivot() explained
print('\n-- pivot() == set_index() then unstack()')
unstacked = ldata.set_index(['date', 'item']).unstack('item')
print(unstacked.head())

        date     item     value    value2
0 1959-03-31  realgdp  2710.349  0.010247
1 1959-03-31     infl     0.000  1.481080
2 1959-03-31    unemp     5.800  0.050346
3 1959-06-30  realgdp  2778.801  0.524681
4 1959-06-30     infl     2.340  0.317856
           value                    value2                    
item        infl   realgdp unemp      infl   realgdp     unemp
date                                                          
1959-03-31  0.00  2710.349   5.8  1.481080  0.010247  0.050346
1959-06-30  2.34  2778.801   5.1  0.317856  0.524681  1.344840
1959-09-30  2.74  2775.488   5.3 -0.368734  1.758701  0.237833
1959-12-31  0.27  2785.204   5.6  1.372007  0.808043 -0.213937
1960-03-31  2.31  2847.699   5.2 -0.267593 -0.588876 -0.151791
item        infl   realgdp  unemp
date                             
1959-03-31  0.00  2710.349    5.8
1959-06-30  2.34  2778.801    5.1
1959-09-30  2.74  2775.488    5.3
1959-12-31  0.27  2785.204    5.6
1960-03-31  2.31  2847.699    5.2
item  

### Pivoting “Wide” to “Long” Format

In [11]:
# pd.melt() vs df.pivot()

# Wide frame: 'key' identifies each row, A/B/C hold the measurements.
df = pd.DataFrame({
    'key': ['foo', 'bar', 'baz'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]})
print(df)

# Wide -> long: 'key' is the group identifier, every other column is melted
# into (variable, value) pairs.
melted = pd.melt(df, id_vars=['key'])
print('\n-- melted')
print(melted)

# Long -> wide again. pivot() arguments are keyword-only since pandas 2.0;
# the keyword form also works on older versions.
reshaped = melted.pivot(index='key', columns='variable', values='value')
print('\n-- pivot it back to original')
print(reshaped)

# pivot() leaves 'key' as the index; reset_index() moves it back to a column.
print('-- reset index')
print(reshaped.reset_index())

print('\npartial melting on key for only A and B')
print(pd.melt(df, id_vars=['key'], value_vars=['A', 'B']))

print('\npandas.melt() w/o group identifiers')
print(pd.melt(df, value_vars=['A', 'B', 'C']))
print(pd.melt(df, value_vars=['key', 'A', 'B']))

   A  B  C  key
0  1  4  7  foo
1  2  5  8  bar
2  3  6  9  baz

-- melted
   key variable  value
0  foo        A      1
1  bar        A      2
2  baz        A      3
3  foo        B      4
4  bar        B      5
5  baz        B      6
6  foo        C      7
7  bar        C      8
8  baz        C      9

-- pivot it back to original
variable  A  B  C
key              
bar       2  5  8
baz       3  6  9
foo       1  4  7
-- reset index
variable  key  A  B  C
0         bar  2  5  8
1         baz  3  6  9
2         foo  1  4  7

partial melting on key for only A and B
   key variable  value
0  foo        A      1
1  bar        A      2
2  baz        A      3
3  foo        B      4
4  bar        B      5
5  baz        B      6

pandas.melt() w/o group identifiers
  variable  value
0        A      1
1        A      2
2        A      3
3        B      4
4        B      5
5        B      6
6        C      7
7        C      8
8        C      9
  variable value
0      key   foo
1      key   ba