## [ Hierarchical Indexing ]
- it is an important feature of pandas that enables you to have  multiple index levels on an axis.
- it provides a way to work with higher dimensional data in a lower dimensional form. 

In [100]:
import numpy as np 
import pandas as pd 

# simple example: create a series with a list of lists(or arrays) as the index
data = pd.Series(np.random.uniform(size=9), 
                 index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])
print(data)

# instead of repeating the same outer index label for each inner value, Pandas shows it only onece, and leaves blanks("gaps") for the rows beneath it that belong to the same group.

# it's just a cleaner and more readable way to display the Series

print(data.index)

a  1    0.961319
   2    0.718193
   3    0.907748
b  1    0.258287
   3    0.714774
c  1    0.861088
   2    0.728414
d  2    0.003269
   3    0.378334
dtype: float64
MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )


In [101]:
# with a hierarchical index object, so-called partial indexing is possible, enabling you to concisely select subsets of the data

print(data["b"])

print(data["b":"c"])    # performs a slice over the first-level index form 'b' to 'c' inclusive

1    0.258287
3    0.714774
dtype: float64
b  1    0.258287
   3    0.714774
c  1    0.861088
   2    0.728414
dtype: float64


In [102]:
data.loc[['b', 'd']]

b  1    0.258287
   3    0.714774
d  2    0.003269
   3    0.378334
dtype: float64

In [103]:
# selection is even possible from an "inner" level. 
# here i select all of the values having the value 2 from the second index level
data.loc[:, 2]

a    0.718193
c    0.728414
d    0.003269
dtype: float64

In [104]:
# hierarchical indexing plays an important role in reshaping data and in group-based operations like forming a pivot table.

# example, we can rearrange this data into a DataFrame using its unstack method
print(data.unstack())

# the inverse operation of unstack is stack
print(data.unstack().stack())

          1         2         3
a  0.961319  0.718193  0.907748
b  0.258287       NaN  0.714774
c  0.861088  0.728414       NaN
d       NaN  0.003269  0.378334
a  1    0.961319
   2    0.718193
   3    0.907748
b  1    0.258287
   3    0.714774
c  1    0.861088
   2    0.728414
d  2    0.003269
   3    0.378334
dtype: float64


In [105]:
# with a DataFrame, either axis can have a hierarchical index
frame = pd.DataFrame(np.arange(12).reshape((4, 3)), 
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]], 
                     columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]])
print(frame)
print("\n")
# the hierarchical levels can have names (as strings or any python objects)
frame.index.names = ["key1", "key2"]
frame.columns.names = ["state", "color"]
print(frame)

# These names supersede the name attribute, which is used only with single-level indexes
# explanation:
    # if you're using a MultiIndex, you should use `.names` instead of `.name`
    # the `.name` attribute still exists, but it only works for a single-level index

     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11


state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11


In [106]:
# you can see how many levels an index has by accessing its n levels attribute
print(frame.index.nlevels)
print(frame.columns.nlevels)

2
2


In [107]:
# with partial column indexing you can similarly select groups of columns
frame["Ohio"]

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [108]:
# manually create a nultiIndex, which is useful when we want to define structured axes before building a DataFrame -- especially for columns in hierarchical form

columns = pd.MultiIndex.from_arrays([["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], names=["state", "color"])
# we can now use this MultiIndex as the columns of a DataFrame

frame_a = pd.DataFrame(np.arange(12).reshape((4,3)),
                        index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                        columns=columns)
print(frame_a)

state  Ohio     Colorado
color Green Red    Green
a 1       0   1        2
  2       3   4        5
b 1       6   7        8
  2       9  10       11


## [ Reordering and Sorting Levels ]
- at times we may need to rearrange the order of the levels on an axis or sort the data by the values in one specific level.
- the swaplevel method takes two level numbers or names and returns a new object with the levels interchanged (but the data is otherwise unaltered)

In [109]:
frame.swaplevel("key1", "key2")
# frame = frame.swaplevel("key1", "key2")


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [110]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [111]:
# sort_index() is used to sort our data(Series or DataFrame) based on the index
# if your index is a multiIndex, sort_index() will sort by all levels of the index, unless you specify otherwise

# Lexicographically?
    # It means the data is sorted like in a dictionary:
    #     first by the first level
    #     then by the second level, so on 
    # Think of it as aplhabetical or numerical order on each level of index

frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [112]:
frame.swaplevel(0,1).sort_index(level=0)

# selecting data is faster when you have a MultiIndex and the index is sorted by the outer (first) level
# because pandas can find things more quickly if the outer level is in order -- just like looking in a phonebook that's already alphabetized

# sorting by the outermost level (level 0) makes data selection faster

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


## [ Summary Statistics ly Level ]
many descriptive and summary statistics on DataFrame and Series have a level option in which you can specify the level you want to aggregate by on a particular axis.

In [113]:
frame 

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [114]:
# 
frame.reset_index()

state,key1,key2,Ohio,Ohio,Colorado
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Green,Red,Green
0,a,1,0,1,2
1,a,2,3,4,5
2,b,1,6,7,8
3,b,2,9,10,11


In [115]:
# aggregate by level on either the rows or columns

frame.groupby(level="key2").sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [116]:
frame.groupby(level="color", axis="columns").sum()

# more will be discussed later

  frame.groupby(level="color", axis="columns").sum()


Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


## [ Indexing with a DataFrame's Columns ]
It’s not unusual to want to use one or more columns from a DataFrame as the row index;alternatively, you may wish to move the row index into the DataFrame’s columns.

In [117]:
frame = pd.DataFrame({"a": range(7), 
                      "b": range(7, 0, -1),
                      "c": ["one", "one", "one", "two", "two", "two", "two"],
                      "d": [0, 1, 2, 0, 1, 2, 3]})
frame 

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [118]:
# DataFrame's set_index function will create a new DataFrame using one or more of its columns as the index
frame2 = frame.set_index(["c", "d"])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [119]:
# by default the columns are removed from the DataFrame, though you can leave them in by passing drop=False to set_index
frame.set_index(["c", "d"], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [120]:
# reset_index, on the other hand does the opposite of set_index; the hierarchical index levels are moved into the columns
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
