In [1]:
import pandas as pd
import numpy as np

## Explicit MultiIndex constructors

A MultiIndex can be created from arrays, from tuples, or from a Cartesian product of iterables.

In [3]:
# from_arrays: one list per level, aligned element-wise to form the keys.
a = pd.MultiIndex.from_arrays(
    [
        ['a', 'a', 'b', 'b'],  # level 0
        [1, 2, 1, 2],          # level 1
    ]
)
a

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [6]:
# from_tuples: each tuple spells out one complete (level 0, level 1) key.
b = pd.MultiIndex.from_tuples(
    [('a', 1), ('a', 2), ('b', 1), ('b', 2)]
)
b

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [7]:
# from_product: the Cartesian product of the per-level values.
c = pd.MultiIndex.from_product(
    [['a', 'b'], [1, 2]]
)
c

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

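Similarly, you can construct the MultiIndex directly using its internal encoding by
passing levels (a list of lists containing available index values for each level) and
codes (called labels in the book and in older pandas releases), which give the position
of each row's value within its level.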

In [5]:
# Build the MultiIndex straight from its internal encoding: `levels` lists the
# available values of each level, and `codes` gives, row by row, the position
# of the value inside each level. (The book calls `codes` "labels".)
d = pd.MultiIndex(
    levels=[['a', 'b'], [1, 2]],
    codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
)
d

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [6]:
# Populations keyed by (state, year) tuples. Passing the dict to pd.Series
# turns the tuple keys into a two-level MultiIndex.
data = {
    ('California', 2000): 33871648,
    ('California', 2010): 37253956,
    ('Texas', 2000): 20851820,
    ('Texas', 2010): 25145561,
    ('New York', 2000): 18976457,
    ('New York', 2010): 19378102,
}
pop = pd.Series(data)
pop

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [7]:
# Iterating the dict yields its (state, year) tuple keys, so the same
# MultiIndex that the Series got implicitly can also be built explicitly.
index=pd.MultiIndex.from_tuples(data)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010),
            (  'New York', 2000),
            (  'New York', 2010)],
           )

In [8]:
# Give each index level a name; the names appear as a header row above the
# index when the Series is displayed.
pop.index.names=['states','years']
pop

states      years
California  2000     33871648
            2010     37253956
Texas       2000     20851820
            2010     25145561
New York    2000     18976457
            2010     19378102
dtype: int64

With more involved datasets, this can be a useful way to keep track of the meaning of
various index values.

### MultiIndex for columns

In a DataFrame, the rows and columns are completely symmetric, and just as the rows
can have multiple levels of indices, the columns can have multiple levels as well

In [36]:
# Hierarchical row index (year x visit) and hierarchical columns (subject x type).
# Cell-specific names are used so this cell does not overwrite the `index` and
# `data` objects that the population examples elsewhere in the notebook rely on.
health_index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                          names=['year', 'visit'])
health_columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                            names=['subject', 'type'])

# Mock some data. Seeding the global RNG makes the draw reproducible: the same
# values are produced every time the cell runs.
np.random.seed(0)
data1 = np.round(np.random.randn(4, 6), 1)  # second argument = number of decimals kept
data1[:, ::2] *= 10  # scale every other column (the HR columns) in place
health_values = data1 + 37

# Create the DataFrame. Four dimensions (year, visit, subject, type) are
# represented in a two-dimensional frame.
health_data = pd.DataFrame(health_values, index=health_index, columns=health_columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,55.0,37.4,47.0,39.2,56.0,36.0
2013,2,47.0,36.8,36.0,37.4,38.0,38.5
2014,1,45.0,37.1,41.0,37.3,52.0,36.8
2014,2,40.0,36.1,11.0,37.7,46.0,36.3


In [40]:
# Take every other column of data1 and multiply it by 10. Those columns were
# already scaled by 10 in place when health_data was built, so the values shown
# here are 100x the original rounded draws.
data2=data1[:, ::2]*10
data2
# (Abandoned idea: wrap data2 in a DataFrame with three 'value' columns.)

array([[ 180.,  100.,  190.],
       [ 100.,  -10.,   10.],
       [  80.,   40.,  150.],
       [  30., -260.,   90.]])

In [41]:
# Columns 0, 2 and 4 hold the draws that were scaled by 10; columns 1, 3 and 5
# keep the small values rounded to one decimal.
data1

array([[ 18. ,   0.4,  10. ,   2.2,  19. ,  -1. ],
       [ 10. ,  -0.2,  -1. ,   0.4,   1. ,   1.5],
       [  8. ,   0.1,   4. ,   0.3,  15. ,  -0.2],
       [  3. ,  -0.9, -26. ,   0.7,   9. ,  -0.7]])

In [26]:
# Scratch check of what np.random.randn returns for a (3, 4) request.
# No seed is set in this cell, so the values depend on the global RNG state.
check=np.random.randn(3,4)
check

array([[ 1.23029068,  1.20237985, -0.38732682, -0.30230275],
       [-1.04855297, -1.42001794, -1.70627019,  1.9507754 ],
       [-0.50965218, -0.4380743 , -1.25279536,  0.77749036]])

In [42]:
# Selecting a top-level column label returns all of that subject's
# sub-columns (HR and Temp) as a smaller DataFrame.
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,47.0,39.2
2013,2,36.0,37.4
2014,1,41.0,37.3
2014,2,11.0,37.7


## Indexing and Slicing a MultiIndex

### Multiply indexed Series

In [9]:
# Re-display the named population Series used in the indexing examples below.
pop

states      years
California  2000     33871648
            2010     37253956
Texas       2000     20851820
            2010     25145561
New York    2000     18976457
            2010     19378102
dtype: int64

In [10]:
# Supplying a value for every level returns a single element.
pop['California',2010]

37253956

In [11]:
# An integer slice is positional here: it returns the first three rows,
# regardless of how the MultiIndex groups them.
pop[0:3]

states      years
California  2000     33871648
            2010     37253956
Texas       2000     20851820
dtype: int64

In [12]:
# Partial indexing on the first level: the result is a Series indexed by the
# remaining level (years).
pop['California']

years
2000    33871648
2010    37253956
dtype: int64

In [13]:
# Explicit positional slice; returns the same rows as pop[0:3].
pop.iloc[0:3]

states      years
California  2000     33871648
            2010     37253956
Texas       2000     20851820
dtype: int64

In [14]:
# Label slicing on a MultiIndex needs a sorted index, so sort first. The slice
# is on the first level and includes both endpoints.
# Note: `pop` is rebound to the sorted Series from here on.
pop=pop.sort_index()
pop.loc['California':'New York']

states      years
California  2000     33871648
            2010     37253956
New York    2000     18976457
            2010     19378102
dtype: int64

In [15]:
# Wrap the Series in a one-column DataFrame. Passing index=index reindexes the
# rows to that MultiIndex, so this relies on `index` still being the
# population index built with from_tuples; if `index` has since been rebound
# to something else, the rows will not align.
popdf=pd.DataFrame(pop,columns=['population'],index=index)
popdf

Unnamed: 0,Unnamed: 1,population
California,2000,33871648
California,2010,37253956
Texas,2000,20851820
Texas,2010,25145561
New York,2000,18976457
New York,2010,19378102


In [173]:
# Sort the rows, then slice on the first level. Label slices include both
# endpoints, so every state from California through Texas is returned.
popdf=popdf.sort_index()
popdf.loc['California':'Texas']

Unnamed: 0,Unnamed: 1,population
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [184]:
# Slice between two full (state, year) keys. Both endpoints are included, so
# the result stops at (Texas, 2000). This needs the index to be sorted.
popdf.loc[('California',2000):('Texas',2000)]

Unnamed: 0,Unnamed: 1,population
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820


In [206]:
# Positional row slice with step 2: on the sorted frame this happens to pick
# the year-2000 row of each state.
# popdf[:, 2000] is not available on a DataFrame (it works on a Series).
popdf[0:6:2]

Unnamed: 0,Unnamed: 1,population
California,2000,33871648
New York,2000,18976457
Texas,2000,20851820


In [262]:
# Select every state's row for the year 2000. A DataFrame cannot take
# popdf[:, 2000] the way a Series can, so the row selector is built with
# pd.IndexSlice and all columns are kept with a plain `:`.
# (The column selector previously used a name `idx` that this cell does not
# define; only `indx` is bound here.)
indx = pd.IndexSlice
popdf.loc[indx[:, 2000], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,population
State,Year,Unnamed: 2_level_1
California,2000,33871648
New York,2000,18976457
Texas,2000,20851820


In [257]:
# Name the two row-index levels so the displayed frame is self-describing.
popdf.index.names=['State','Year']
popdf

Unnamed: 0_level_0,Unnamed: 1_level_0,population
State,Year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [16]:
# Partial indexing on the second level: every state's value for the year 2000.
# This shorthand works on a Series but not on a DataFrame.
pop[:,2000]

states
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [17]:
# Boolean mask: keep only the entries whose population exceeds 25,000,000.
pop[pop>25000000]

states      years
California  2000     33871648
            2010     37253956
Texas       2010     25145561
dtype: int64

In [18]:
# Masking a DataFrame keeps its shape: cells that fail the condition are shown
# as NaN instead of being dropped.
popdf[popdf>25000000]

Unnamed: 0,Unnamed: 1,population
California,2000,33871648.0
California,2010,37253956.0
Texas,2000,
Texas,2010,25145561.0
New York,2000,
New York,2010,


In [19]:
# Fancy indexing with a list of first-level labels. The inner brackets are
# required: pop['California','Texas'] would be read as a single
# (first level, second level) key.
pop[['California','Texas']]

states      years
California  2000     33871648
            2010     37253956
Texas       2000     20851820
            2010     25145561
dtype: int64

In [223]:
# IPython help lookup left over from interactive exploration; it shows the
# object's documentation and can be dropped from a finished notebook.
popdf?

In [228]:
# Row for the key (California, 2000); the result is a Series with one entry
# per column of the frame.
popdf.loc['California',2000]

population    33871648
Name: (California, 2000), dtype: int64

In [230]:
# A list of first-level labels selects all rows belonging to those states.
popdf.loc[['California','Texas']]

Unnamed: 0,Unnamed: 1,population
California,2000,33871648
California,2010,37253956
Texas,2000,20851820
Texas,2010,25145561


In [231]:
# Label slice on the first level; both endpoints are included.
popdf.loc['California':'Texas']

Unnamed: 0,Unnamed: 1,population
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [232]:
# Re-display the frame with hierarchical rows and columns for the examples below.
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,55.0,37.4,47.0,39.2,56.0,36.0
2013,2,47.0,36.8,36.0,37.4,38.0,38.5
2014,1,45.0,37.1,41.0,37.3,52.0,36.8
2014,2,40.0,36.1,11.0,37.7,46.0,36.3


In [245]:
# Giving both column levels (subject, type) returns one column as a Series:
# here, Guido's heart rate.
health_data['Guido','HR']

year  visit
2013  1        47.0
      2        36.0
2014  1        41.0
      2        11.0
Name: (Guido, HR), dtype: float64

In [246]:
# Positional indexing ignores the hierarchy: first two rows, first two columns.
health_data.iloc[:2,:2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,55.0,37.4
2013,2,47.0,36.8


In [248]:
# A tuple addresses one column across both column levels (subject, type);
# the leading ':' keeps every row.
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        55.0
      2        47.0
2014  1        45.0
      2        40.0
Name: (Bob, HR), dtype: float64

Working with slices within these index tuples is not especially convenient; trying to
create a slice within a tuple will lead to a syntax error:

In [249]:
# A bare slice inside a tuple is not valid Python. Writing the expression
# directly makes the whole cell fail to parse and stops a top-to-bottom run of
# the notebook, so the offending source is compiled inside try/except and the
# resulting SyntaxError is displayed instead.
bad_indexer = "health_data.loc[(:, 1), (:, 'HR')]"
try:
    compile(bad_indexer, "<slice-in-tuple demo>", "eval")
except SyntaxError as err:
    print(f"{type(err).__name__}: {err.msg}")

SyntaxError: invalid syntax (<ipython-input-249-fb34fa30ac09>, line 1)

You could get around this by building the desired slice explicitly using Python’s built-in slice() function, but a better way in this context is to use an IndexSlice object,
which Pandas provides for precisely this situation:

In [261]:
# pd.IndexSlice allows slice syntax inside a multi-level key.
# Rows: any year, visit 1. Columns: any subject, type 'HR'.
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]


Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,55.0,47.0,56.0
2014,1,45.0,41.0,52.0


## Rearranging Multi-Indices


One of the keys to working with multiply indexed data is knowing how to effectively
transform the data. There are a number of operations that will preserve all the
information in the dataset, but rearrange it for the purposes of various computations.

### Sorted and unsorted indices

Earlier, we briefly mentioned a caveat, but we should emphasize it more here. Many of
the MultiIndex slicing operations will fail if the index is not sorted.

In [270]:
# The first level is deliberately out of lexical order ('a', 'c', 'b'), which
# gives an unsorted MultiIndex for the slicing demonstration.
index = pd.MultiIndex.from_product(
    [['a', 'c', 'b'], [1, 2]]
)
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.943748
      2      0.681820
c     1      0.359508
      2      0.437032
b     1      0.697631
      2      0.060225
dtype: float64

In [272]:
# Partial slicing needs a sorted MultiIndex. On the unsorted index the slice
# raises an error that the KeyError handler catches; its type and message are
# printed instead of letting the cell fail.
try:
    data['a':'b']
except KeyError as err:
    print(type(err))
    print(err)


<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [280]:
# sort_index() returns a copy ordered by the index levels; `data` is rebound
# to that sorted Series.
data=data.sort_index()
data

char  int
a     1      0.943748
      2      0.681820
b     1      0.697631
      2      0.060225
c     1      0.359508
      2      0.437032
dtype: float64

With the index sorted in this way, partial slicing will work as expected

In [281]:
# The same partial slice that failed on the unsorted index now succeeds.
data['a':'b']

char  int
a     1      0.943748
      2      0.681820
b     1      0.697631
      2      0.060225
dtype: float64

### Stacking and unstacking indices


In [286]:
# unstack(level=0) moves the first index level ('char') into the columns.
data.unstack(level=0)

char,a,b,c
int,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.943748,0.697631,0.359508
2,0.68182,0.060225,0.437032


In [287]:
# unstack(level=1) moves the second index level ('int') into the columns instead.
data.unstack(level=1)

int,1,2
char,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.943748,0.68182
b,0.697631,0.060225
c,0.359508,0.437032


In [290]:
# stack() is the inverse of unstack(): the round trip gives back the original Series.
data.unstack().stack()

char  int
a     1      0.943748
      2      0.681820
b     1      0.697631
      2      0.060225
c     1      0.359508
      2      0.437032
dtype: float64

### Index setting and resetting


Another way to rearrange hierarchical data is to turn the index labels into columns;
this can be accomplished with the reset_index method

In [20]:
# reset_index turns the index levels ('states', 'years') into ordinary columns;
# `name` sets the column label for the Series values.
pop_flat = pop.reset_index(name='population')
pop_flat

Unnamed: 0,states,years,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


Often when you are working with data in the real world, the raw input data looks like
this and it’s useful to build a MultiIndex from the column values. This can be done
with the set_index method of the DataFrame, which returns a multiply indexed
DataFrame.

In [21]:
# set_index is the reverse operation: the listed columns become the levels of
# a hierarchical (multi-level) row index.
pop_flat.set_index(['states','years'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
states,years,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [22]:
# IPython help lookup left over from interactive exploration; it shows the
# method's documentation and can be dropped from a finished notebook.
pop.reset_index?