## [ Reshaping and Pivoting in Pandas ]
These operations change the structure (shape) of the data -- not the content

In [76]:
import numpy as np 
import pandas as pd 

## [ Reshaping with Hierarchical Indexing ]
- it provides a consistent way to rearrange data in a DataFrame
- there are two primary actions:
    - `stack`: this "rotates" or pivots from the columns in the data to the rows
    - `unstack`: this pivots from the rows into the columns

In [77]:
# Illustration: a 2x3 frame whose row axis is named "state" and whose column axis is named "number"
state_labels = pd.Index(["Ohio", "Colorado"], name="state")
number_labels = pd.Index(["one", "two", "three"], name="number")
data = pd.DataFrame(
    np.arange(6).reshape((2, 3)),
    index=state_labels,
    columns=number_labels,
)
print(data)

# stack() rotates the column labels into the innermost row level, yielding a Series
result = data.stack()  # for this frame, the same as level="number"
print(result)

number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64


In [78]:
# unstack() reverses the operation: a hierarchically indexed Series is
# rearranged back into a DataFrame
result.unstack(level=-1)  # -1 (the innermost level) is the default

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [79]:
# Both stack and unstack act on the innermost level unless told otherwise.
# A different level can be chosen by position or by name; here 0 and "state"
# refer to the same level, so the two printed frames are identical.
for level in (0, "state"):
    print(result.unstack(level=level))

state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5
state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5


Unstacking might introduce missing data when some of the values in the level are not present in every subgroup.

In [80]:
# Two nullable-integer Series whose labels only partly overlap ("c" and "d")
s1 = pd.Series(range(4), index=list("abcd"), dtype="Int64")
s2 = pd.Series(range(4, 7), index=list("cde"), dtype="Int64")

# keys= adds an outer index level recording which Series each row came from
data2 = pd.concat([s1, s2], keys=["one", "two"])
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: Int64

In [81]:
# "one" has no "e" label and "two" has no "a"/"b" labels, so unstacking leaves
# those cells missing.
# With the stack() call used in the next cell, missing data is filtered out
# again, so the round trip is easily invertible.
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2,3,
two,,,4,5,6.0


In [82]:
# Round trip: the missing cells introduced by unstack() are dropped again by
# stack(), giving back the original seven rows.
# NOTE(review): this relies on stack()'s default implementation; with
# future_stack=True (next cell) the <NA> rows are kept instead.
data2.unstack().stack()

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: Int64

In [83]:
# future_stack=True keeps the missing cells, so every (outer, inner) label
# combination appears and the absent ones show up as <NA>
data2.unstack().stack(future_stack=True)  # plays the role of the older dropna=False

one  a       0
     b       1
     c       2
     d       3
     e    <NA>
two  a    <NA>
     b    <NA>
     c       4
     d       5
     e       6
dtype: Int64

In [84]:
# Two-column frame built from `result`; the column axis is given the name "side".
# (When a DataFrame is unstacked, the unstacked level becomes the lowest
# column level in the result.)
side_labels = pd.Index(["left", "right"], name="side")
df = pd.DataFrame({"left": result, "right": result + 5}, columns=side_labels)
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [85]:
# "state" moves from the row index into the columns, placed beneath the
# existing "side" level
df.unstack(level="state")

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [86]:
# As with unstack, stack accepts the name of the level to move:
# first push "state" into the columns, then pull "side" back down into the rows
wide_by_state = df.unstack(level="state")
wide_by_state.stack(level="side", future_stack=True)

Unnamed: 0_level_0,state,Ohio,Colorado
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,0,3
one,right,5,8
two,left,1,4
two,right,6,9
three,left,2,5
three,right,7,10


## [ Pivoting "Long" to "Wide" Format ]
- A common way to store multiple time series in databases and CSV files is what
is sometimes called long or stacked format. 
- In this format, individual values are represented by a single row in a table rather than multiple values per row.

####  So why use this?
Because:
- It’s **easier to store** in CSV or databases.
- It’s **better for analysis**, filtering, and grouping.
- It **avoids empty cells** when you don’t have data for some cities/years.

####  Pandas functions:
- Convert **wide → long** with: `pd.melt()`
- Convert **long → wide** with: `pivot()` or `pivot_table()`


In [87]:
# Load the example macro dataset; the path is relative to the working directory.
# NOTE: this rebinds `data`, which earlier held the small illustration frame.
data = pd.read_csv("examples/macrodata.csv")

In [88]:
# Keep only the two date-part columns and the three series used below
keep_cols = ["year", "quarter", "realgdp", "infl", "unemp"]
data = data.loc[:, keep_cols]
data.head()

Unnamed: 0,year,quarter,realgdp,infl,unemp
0,1959.0,1.0,2710.349,0.0,5.8
1,1959.0,2.0,2778.801,2.34,5.1
2,1959.0,3.0,2775.488,2.74,5.3
3,1959.0,4.0,2785.204,0.27,5.6
4,1960.0,1.0,2847.699,2.31,5.2


In [89]:
# A PeriodIndex is a special kind of index that represents spans of time
# (days, months, quarters, or years) instead of exact timestamps.
#
# Building one by passing year=/quarter= straight to the PeriodIndex
# constructor is deprecated, so the dedicated from_fields constructor is used.
# from_fields takes no `name` argument, hence the trailing rename.
#
# DataFrame.pop returns a column and deletes it from the frame at the same
# time, so this cell is not idempotent: re-running it without reloading `data`
# fails because "year" and "quarter" are already gone.
periods = pd.PeriodIndex.from_fields(
    year=data.pop("year"),
    quarter=data.pop("quarter"),
).rename("date")
periods

  periods = pd.PeriodIndex(year=data.pop("year"),


PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203)

In [90]:
# PeriodIndex.to_timestamp converts the periods into a DatetimeIndex.
# freq="D" asks for the result at daily resolution; with the default
# how="start" each quarter maps to its first day
# (how="end" would convert to the end of the period instead).
data.index = periods.to_timestamp(freq="D")
data

Unnamed: 0_level_0,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,2710.349,0.00,5.8
1959-04-01,2778.801,2.34,5.1
1959-07-01,2775.488,2.74,5.3
1959-10-01,2785.204,0.27,5.6
1960-01-01,2847.699,2.31,5.2
...,...,...,...
2008-07-01,13324.600,-3.16,6.0
2008-10-01,13141.920,-8.79,6.9
2009-01-01,12925.410,0.94,8.1
2009-04-01,12901.504,3.37,9.2


In [91]:
# Select the three series and give the columns axis the name "item",
# written as a single method chain
data = (
    data
    .reindex(columns=["realgdp", "infl", "unemp"])
    .rename_axis(columns="item")
)
data.head()

item,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,2710.349,0.0,5.8
1959-04-01,2778.801,2.34,5.1
1959-07-01,2775.488,2.74,5.3
1959-10-01,2785.204,0.27,5.6
1960-01-01,2847.699,2.31,5.2


In [95]:
# Build the long format in three steps:
#   1. stack() moves the "item" column labels into the row index
#   2. reset_index() turns the index levels ("date", "item") into ordinary columns
#   3. rename() gives the unnamed data column (labelled 0) the name "value"
stacked = data.stack()
long_data = stacked.reset_index().rename(columns={0: "value"})
long_data[:10]

# This is the so-called long format for multiple time series: each row of the
# table represents a single observation.
# It can be awkward to work with when we want to compare multiple items side
# by side for the same date.

Unnamed: 0,date,item,value
0,1959-01-01,realgdp,2710.349
1,1959-01-01,infl,0.0
2,1959-01-01,unemp,5.8
3,1959-04-01,realgdp,2778.801
4,1959-04-01,infl,2.34
5,1959-04-01,unemp,5.1
6,1959-07-01,realgdp,2775.488
7,1959-07-01,infl,2.74
8,1959-07-01,unemp,5.3
9,1959-10-01,realgdp,2785.204


- so what does `pivot()` do?
- syntax: `df.pivot(index="date", columns="item", values="value")`

- It turns: 
    - the values in the item column into column names
    - the date column becomes the index
    - the value column fills in the actual data

- Relational Database explanation: 
- In SQL databases, we usually store data in the long format because:
    - it's more flexible (new items can be added without changing the table's structure)
    - it helps maintain relational integrity (ensuring valid connections between tables using keys)
    - it's easier to join or filter using SQL

- Long format = good for databases
- Wide format = good for analysis
- `pivot()` = tool to convert long ---> wide

In [96]:
# Long -> wide: "date" becomes the index, each distinct "item" becomes a
# column, and the cells are filled from "value"
pivoted = long_data.pivot(index="date", columns="item", values="value")
pivoted.head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,0.0,2710.349,5.8
1959-04-01,2.34,2778.801,5.1
1959-07-01,2.74,2775.488,5.3
1959-10-01,0.27,2785.204,5.6
1960-01-01,2.31,2847.699,5.2


In [97]:
# Suppose there are two value columns that we want to reshape simultaneously.
# A seeded Generator is used so the random column is identical on every
# Restart & Run All; the unseeded global RNG gave different numbers on each run.
rng = np.random.default_rng(seed=12345)
long_data["value2"] = rng.standard_normal(len(long_data))
long_data[:10]

Unnamed: 0,date,item,value,value2
0,1959-01-01,realgdp,2710.349,-0.269495
1,1959-01-01,infl,0.0,-0.653224
2,1959-01-01,unemp,5.8,-0.006386
3,1959-04-01,realgdp,2778.801,-0.40834
4,1959-04-01,infl,2.34,-0.411789
5,1959-04-01,unemp,5.1,2.023025
6,1959-07-01,realgdp,2775.488,-0.613646
7,1959-07-01,infl,2.74,0.246858
8,1959-07-01,unemp,5.3,-1.08897
9,1959-10-01,realgdp,2785.204,-1.271224


In [98]:
# By omitting the last argument (values), both remaining columns ("value" and
# "value2") are pivoted, and you obtain a DataFrame with hierarchical columns
pivoted = long_data.pivot(index="date", columns="item")
pivoted.head()

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-01-01,0.0,2710.349,5.8,-0.653224,-0.269495,-0.006386
1959-04-01,2.34,2778.801,5.1,-0.411789,-0.40834,2.023025
1959-07-01,2.74,2775.488,5.3,0.246858,-0.613646,-1.08897
1959-10-01,0.27,2785.204,5.6,0.189622,-1.271224,-1.050373
1960-01-01,2.31,2847.699,5.2,-1.319149,-0.344224,-2.234334


In [99]:
# Selecting the outer column label "value" returns the sub-frame holding one
# column per item
pivoted["value"].head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,0.0,2710.349,5.8
1959-04-01,2.34,2778.801,5.1
1959-07-01,2.74,2775.488,5.3
1959-10-01,0.27,2785.204,5.6
1960-01-01,2.31,2847.699,5.2


In [100]:
# NOTE: pivot is equivalent to building a hierarchical index with set_index
# and then unstacking one of its levels
indexed = long_data.set_index(["date", "item"])
unstacked = indexed.unstack(level="item")
unstacked.head()

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-01-01,0.0,2710.349,5.8,-0.653224,-0.269495,-0.006386
1959-04-01,2.34,2778.801,5.1,-0.411789,-0.40834,2.023025
1959-07-01,2.74,2775.488,5.3,0.246858,-0.613646,-1.08897
1959-10-01,0.27,2785.204,5.6,0.189622,-1.271224,-1.050373
1960-01-01,2.31,2847.699,5.2,-1.319149,-0.344224,-2.234334
