# MultiIndexing

In [75]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np

In [76]:
# Toy frame: a 'row' id column plus X/Y readings for two samples ("One" and "Two").
# The value columns are named '<Sample>_<Axis>' so they can be split on '_' later.
df = DataFrame({
    'row': list(range(3)),
    'One_X': [1.1, 1.1, 1.1],
    'One_Y': [1.2, 1.2, 1.2],
    'Two_X': [1.11, 1.11, 1.11],
    'Two_Y': [1.22, 1.22, 1.22],
})
df

Unnamed: 0,One_X,One_Y,Two_X,Two_Y,row
0,1.1,1.2,1.11,1.22,0
1,1.1,1.2,1.11,1.22,1
2,1.1,1.2,1.11,1.22,2


In [77]:
# Promote the 'row' column to the row index; set_index returns a new frame,
# which is bound back to `df`.
df = df.set_index(keys='row')
df

Unnamed: 0_level_0,One_X,One_Y,Two_X,Two_Y
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.1,1.2,1.11,1.22
1,1.1,1.2,1.11,1.22
2,1.1,1.2,1.11,1.22


In [78]:
# Split every flat '<Sample>_<Axis>' label on '_' and use the resulting
# (sample, axis) tuples as a two-level column MultiIndex.
df.columns = pd.MultiIndex.from_tuples(
    [tuple(label.split('_')) for label in df.columns]
)
df

Unnamed: 0_level_0,One,One,Two,Two
Unnamed: 0_level_1,X,Y,X,Y
row,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1.1,1.2,1.11,1.22
1,1.1,1.2,1.11,1.22
2,1.1,1.2,1.11,1.22


In [79]:
# Pivot the outer column level (One/Two) into the row index, leaving X and Y as columns.
df_st = df.stack(level=0)
df_st

Unnamed: 0_level_0,Unnamed: 1_level_0,X,Y
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,One,1.1,1.2
0,Two,1.11,1.22
1,One,1.1,1.2
1,Two,1.11,1.22
2,One,1.1,1.2
2,Two,1.11,1.22


In [80]:
# Turn index level 1 (the stacked sample name) back into an ordinary column.
# That level has no name, so the new column is labelled 'level_1'.
df = df_st.reset_index(level=1)
df

Unnamed: 0_level_0,level_1,X,Y
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,One,1.1,1.2
0,Two,1.11,1.22
1,One,1.1,1.2
1,Two,1.11,1.22
2,One,1.1,1.2
2,Two,1.11,1.22


In [81]:
# Rename the three columns positionally (sample label, then the X and Y values).
df.columns = [
    'Sample',
    'All_x',
    'All_Y',
]
df

Unnamed: 0_level_0,Sample,All_x,All_Y
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,One,1.1,1.2
0,Two,1.11,1.22
1,One,1.1,1.2
1,Two,1.11,1.22
2,One,1.1,1.2
2,Two,1.11,1.22


## Arithmetic

In [82]:
# Nested comprehension: the first `for` is the outer loop, so the second
# element of each pair cycles fastest.
[(outer, inner) for outer in ['a', 'b', 'c'] for inner in ['o', 'i']]

[('a', 'o'), ('a', 'i'), ('b', 'o'), ('b', 'i'), ('c', 'o'), ('c', 'i')]

In [83]:
# Two-level column index: groups A/B/C, each with an 'O' and an 'I' column.
cols = pd.MultiIndex.from_tuples(
    [(group, kind) for group in ['A', 'B', 'C'] for kind in ['O', 'I']]
)
cols

MultiIndex(levels=[['A', 'B', 'C'], ['I', 'O']],
           labels=[[0, 0, 1, 1, 2, 2], [1, 0, 1, 0, 1, 0]])

In [84]:
# 2x6 frame of standard-normal draws, rows 'n'/'m', columns from the MultiIndex `cols`.
# NOTE(review): no random seed is set, so the values change on every run;
# consider seeding if the displayed numbers need to be reproducible.
df = pd.DataFrame(np.random.randn(2, 6), index=['n', 'm'], columns=cols); df

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,O,I,O,I,O,I
n,1.401458,0.141658,-1.044944,0.031139,-0.583423,-0.074535
m,-0.727399,-0.830327,-1.013957,-1.053051,-0.031298,0.004442


In [85]:
# Divide every group by group C. level=1 aligns df['C'] (columns O/I) with the
# inner column level, so each O column is divided by C's O and each I by C's I.
# The result is bound back to `df`.
df = df.div(df['C'], level=1)
df

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,O,I,O,I,O,I
n,-2.402129,-1.90057,1.791056,-0.417785,1.0,1.0
m,23.240741,-186.918147,32.396416,-237.056497,1.0,1.0


## Slicing

In [86]:
# (outer, inner) label pairs used to build a two-level row index below.
coords = [
    ('AA', 'one'),
    ('AA', 'six'),
    ('BB', 'one'),
    ('BB', 'two'),
    ('BB', 'six'),
]
coords

[('AA', 'one'), ('AA', 'six'), ('BB', 'one'), ('BB', 'two'), ('BB', 'six')]

In [87]:
# Build the two-level row index from the coordinate tuples.
index = pd.MultiIndex.from_tuples(coords)
index

MultiIndex(levels=[['AA', 'BB'], ['one', 'six', 'two']],
           labels=[[0, 0, 1, 1, 1], [0, 1, 0, 2, 1]])

In [88]:
# One value per (outer, inner) index entry, in a single 'MyData' column.
df = pd.DataFrame(
    data=[11, 22, 33, 44, 55],
    index=index,
    columns=['MyData'],
)
df

Unnamed: 0,Unnamed: 1,MyData
AA,one,11
AA,six,22
BB,one,33
BB,two,44
BB,six,55


In [89]:
# Cross-section on the outer row level: every row whose level-0 label is 'BB'.
df.xs(key='BB', axis=0, level=0)

Unnamed: 0,MyData
one,33
two,44
six,55


In [90]:
# Cross-section on the inner row level: every row whose level-1 label is 'six'.
df.xs(key='six', axis=0, level=1)

Unnamed: 0,MyData
AA,22
BB,55


In [91]:
import itertools

In [92]:
# Every (student, course) combination; the course cycles fastest.
index = list(
    itertools.product(
        ['Ada', 'Quinn', 'Violet'],
        ['Comp', 'Math', 'Sci'],
    )
)
index

[('Ada', 'Comp'),
 ('Ada', 'Math'),
 ('Ada', 'Sci'),
 ('Quinn', 'Comp'),
 ('Quinn', 'Math'),
 ('Quinn', 'Sci'),
 ('Violet', 'Comp'),
 ('Violet', 'Math'),
 ('Violet', 'Sci')]

In [93]:
# Every (assessment type, part) combination, used as column labels below.
headr = list(
    itertools.product(
        ['Exams', 'Labs'],
        ['I', 'II'],
    )
)
headr

[('Exams', 'I'), ('Exams', 'II'), ('Labs', 'I'), ('Labs', 'II')]

In [94]:
# Named two-level row index built from the (student, course) tuples.
indx = pd.MultiIndex.from_tuples(
    index,
    names=['Student', 'Course'],
)
indx

MultiIndex(levels=[['Ada', 'Quinn', 'Violet'], ['Comp', 'Math', 'Sci']],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
           names=['Student', 'Course'])

In [95]:
# Named two-level column index built from the header tuples.
cols = pd.MultiIndex.from_tuples(
    headr,
    names=['Eden', 'Wang'],
)
cols

MultiIndex(levels=[['Exams', 'Labs'], ['I', 'II']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['Eden', 'Wang'])

In [96]:
# Deterministic 9x4 grid of scores: 70 plus the row and column offsets,
# with a small (row * col) % 3 term added.
data = [
    [70 + col + row + (col * row) % 3 for col in range(4)]
    for row in range(9)
]
data

[[70, 71, 72, 73],
 [71, 73, 75, 74],
 [72, 75, 75, 75],
 [73, 74, 75, 76],
 [74, 76, 78, 77],
 [75, 78, 78, 78],
 [76, 77, 78, 79],
 [77, 79, 81, 80],
 [78, 81, 81, 81]]

In [97]:
# Scores frame: rows indexed by (Student, Course), columns by the two-level header.
df = pd.DataFrame(data=data, index=indx, columns=cols)
df

Unnamed: 0_level_0,Eden,Exams,Exams,Labs,Labs
Unnamed: 0_level_1,Wang,I,II,I,II
Student,Course,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Ada,Comp,70,71,72,73
Ada,Math,71,73,75,74
Ada,Sci,72,75,75,75
Quinn,Comp,73,74,75,76
Quinn,Math,74,76,78,77
Quinn,Sci,75,78,78,78
Violet,Comp,76,77,78,79
Violet,Math,77,79,81,80
Violet,Sci,78,81,81,81


In [98]:
# `All` is the "select everything" slice (what a bare `:` means inside []);
# it can be placed inside the tuples passed to .loc below.
All = slice(None, None, None)
All

slice(None, None, None)

In [99]:
# Partial indexing on the outer row level: all of Violet's rows.
# The result is indexed by Course only.
df.loc['Violet']

Eden,Exams,Exams,Labs,Labs
Wang,I,II,I,II
Course,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Comp,76,77,78,79
Math,77,79,81,80
Sci,78,81,81,81


In [100]:
# Row selector is a tuple with one entry per index level: any Student, Course == 'Math'.
# The second `All` keeps every column.
df.loc[(All, 'Math'), All]

Unnamed: 0_level_0,Eden,Exams,Exams,Labs,Labs
Unnamed: 0_level_1,Wang,I,II,I,II
Student,Course,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Ada,Math,71,73,75,74
Quinn,Math,74,76,78,77
Violet,Math,77,79,81,80


In [101]:
# Label slice on the Student level; the result includes both 'Ada' and 'Quinn'.
# Rows are further restricted to Course == 'Math'; all columns are kept.
df.loc[(slice('Ada','Quinn'),'Math'), All]

Unnamed: 0_level_0,Eden,Exams,Exams,Labs,Labs
Unnamed: 0_level_1,Wang,I,II,I,II
Student,Course,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Ada,Math,71,73,75,74
Quinn,Math,74,76,78,77


In [102]:
# Math rows for every student, restricted to the 'Exams' column group.
# 'Exams' is a plain string key (the parentheses in ('Exams') do not make a tuple).
df.loc[(All, 'Math'), 'Exams']

Unnamed: 0_level_0,Wang,I,II
Student,Course,Unnamed: 2_level_1,Unnamed: 3_level_1
Ada,Math,71,73
Quinn,Math,74,76
Violet,Math,77,79


In [103]:
# Tuples on both axes: rows with Course == 'Math', columns whose inner level is 'II'
# (for both Exams and Labs).
df.loc[(All,'Math'), (All, 'II')]

Unnamed: 0_level_0,Eden,Exams,Labs
Unnamed: 0_level_1,Wang,II,II
Student,Course,Unnamed: 2_level_2,Unnamed: 3_level_2
Ada,Math,73,74
Quinn,Math,76,77
Violet,Math,79,80


## Sorting

In [104]:
# With MultiIndex columns, a single sort column is named by its full tuple.
# Sorts rows by the ('Labs', 'II') score, highest first; returns a new frame.
df.sort_values(by=('Labs', 'II'), ascending=False)

Unnamed: 0_level_0,Eden,Exams,Exams,Labs,Labs
Unnamed: 0_level_1,Wang,I,II,I,II
Student,Course,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Violet,Sci,78,81,81,81
Violet,Math,77,79,81,80
Violet,Comp,76,77,78,79
Quinn,Sci,75,78,78,78
Quinn,Math,74,76,78,77
Quinn,Comp,73,74,75,76
Ada,Sci,72,75,75,75
Ada,Math,71,73,75,74
Ada,Comp,70,71,72,73


---

## pandas.DataFrame.set_index

In [105]:
# Reference: docstring for DataFrame.set_index (used above to move 'row' into the index).
help(pd.DataFrame.set_index)

Help on function set_index in module pandas.core.frame:

set_index(self, keys, drop=True, append=False, inplace=False, verify_integrity=False)
    Set the DataFrame index (row labels) using one or more existing
    columns. By default yields a new object.
    
    Parameters
    ----------
    keys : column label or list of column labels / arrays
    drop : boolean, default True
        Delete columns to be used as the new index
    append : boolean, default False
        Whether to append columns to existing index
    inplace : boolean, default False
        Modify the DataFrame in place (do not create a new object)
    verify_integrity : boolean, default False
        Check the new index for duplicates. Otherwise defer the check until
        necessary. Setting to False will improve the performance of this
        method
    
    Examples
    --------
    >>> indexed_df = df.set_index(['A', 'B'])
    >>> indexed_df2 = df.set_index(['A', [0, 1, 2, 0, 1, 2]])
    >>> indexed_df3 = df.s

In [168]:
# Fresh copy of the toy frame for the set_index experiments below.
# Display the frame that was just built: the original cell ended with `df`,
# which shows whatever `df` currently holds rather than `df_setindex`.
df_setindex = DataFrame({'row': [0, 1, 2],
                         'One_X': [1.1] * 3,
                         'One_Y': [1.2] * 3,
                         'Two_X': [1.11] * 3,
                         'Two_Y': [1.22] * 3})
df_setindex

Unnamed: 0,One_X,One_Y,Two_X,Two_Y,row
0,1.1,1.2,1.11,1.22,0
1,1.1,1.2,1.11,1.22,1
2,1.1,1.2,1.11,1.22,2


In [186]:
# Use three existing columns as a three-level row index; the result is only
# displayed, so df_setindex itself is unchanged.
df_setindex.set_index(keys=['row', 'One_Y', 'Two_X'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,One_X,Two_Y
row,One_Y,Two_X,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.2,1.11,1.1,1.22
1,1.2,1.11,1.1,1.22
2,1.2,1.11,1.1,1.22


In [170]:
# Keys may mix a column label with a raw list of values: 'row' is taken from the
# frame, while [0, 3, 2] becomes a second, unnamed index level.
df_setindex.set_index(keys=['row', [0, 3, 2]])

Unnamed: 0_level_0,Unnamed: 1_level_0,One_X,One_Y,Two_X,Two_Y
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,1.1,1.2,1.11,1.22
1,3,1.1,1.2,1.11,1.22
2,2,1.1,1.2,1.11,1.22


- 多个行标签(index)的用途?
- 行标签和列名的区别


## pandas.MultiIndex.from_tuples

In [106]:
# Reference: docstring for MultiIndex.from_tuples (used above to build row and column indexes).
help(pd.MultiIndex.from_tuples)

Help on method from_tuples in module pandas.core.indexes.multi:

from_tuples(tuples, sortorder=None, names=None) method of builtins.type instance
    Convert list of tuples to MultiIndex
    
    Parameters
    ----------
    tuples : list / sequence of tuple-likes
        Each tuple is the index of one row/column.
    sortorder : int or None
        Level of sortedness (must be lexicographically sorted by that
        level)
    
    Returns
    -------
    index : MultiIndex
    
    Examples
    --------
    >>> tuples = [(1, u'red'), (1, u'blue'),
                  (2, u'red'), (2, u'blue')]
    >>> MultiIndex.from_tuples(tuples, names=('number', 'color'))
    
    See Also
    --------
    MultiIndex.from_arrays : Convert list of arrays to MultiIndex
    MultiIndex.from_product : Make a MultiIndex from cartesian product
                              of iterables



## str.split

In [107]:
# Reference: docstring for str.split (used above to split 'One_X'-style column labels on '_').
help(str.split)

Help on method_descriptor:

split(...)
    S.split(sep=None, maxsplit=-1) -> list of strings
    
    Return a list of the words in S, using sep as the
    delimiter string.  If maxsplit is given, at most maxsplit
    splits are done. If sep is not specified or is None, any
    whitespace string is a separator and empty strings are
    removed from the result.



## pandas.DataFrame.stack

In [108]:
# Reference: docstring for DataFrame.stack (used above with level=0).
help(pd.DataFrame.stack)

Help on function stack in module pandas.core.frame:

stack(self, level=-1, dropna=True)
    Pivot a level of the (possibly hierarchical) column labels, returning a
    DataFrame (or Series in the case of an object with a single level of
    column labels) having a hierarchical index with a new inner-most level
    of row labels.
    The level involved will automatically get sorted.
    
    Parameters
    ----------
    level : int, string, or list of these, default last level
        Level(s) to stack, can pass level name
    dropna : boolean, default True
        Whether to drop rows in the resulting Frame/Series with no valid
        values
    
    Examples
    ----------
    >>> s
         a   b
    one  1.  2.
    two  3.  4.
    
    >>> s.stack()
    one a    1
        b    2
    two a    3
        b    4
    
    Returns
    -------
    stacked : DataFrame or Series



## pandas.DataFrame.reset_index

In [109]:
# Reference: docstring for DataFrame.reset_index (used above to turn index level 1 into a column).
help(pd.DataFrame.reset_index)

Help on function reset_index in module pandas.core.frame:

reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill='')
    For DataFrame with multi-level index, return new DataFrame with
    labeling information in the columns under the index names, defaulting
    to 'level_0', 'level_1', etc. if any are None. For a standard index,
    the index name will be used (if set), otherwise a default 'index' or
    'level_0' (if 'index' is already taken) will be used.
    
    Parameters
    ----------
    level : int, str, tuple, or list, default None
        Only remove the given levels from the index. Removes all levels by
        default
    drop : boolean, default False
        Do not try to insert index into dataframe columns. This resets
        the index to the default integer index.
    inplace : boolean, default False
        Modify the DataFrame in place (do not create a new object)
    col_level : int or str, default 0
        If the columns have multiple lev

## pandas.DataFrame.div

In [110]:
# Reference: docstring for DataFrame.div (used above with level=1 in the Arithmetic section).
help(pd.DataFrame.div)

Help on function truediv in module pandas.core.ops:

truediv(self, other, axis='columns', level=None, fill_value=None)
    Floating division of dataframe and other, element-wise (binary operator `truediv`).
    
    Equivalent to ``dataframe / other``, but with support to substitute a fill_value for
    missing data in one of the inputs.
    
    Parameters
    ----------
    other : Series, DataFrame, or constant
    axis : {0, 1, 'index', 'columns'}
        For Series input, axis to match Series index on
    fill_value : None or float value, default None
        Fill missing (NaN) values with this value. If both DataFrame
        locations are missing, the result will be missing
    level : int or name
        Broadcast across a level, matching Index values on the
        passed MultiIndex level
    
    Notes
    -----
    Mismatched indices will be unioned together
    
    Returns
    -------
    result : DataFrame
    
    See also
    --------
    DataFrame.rtruediv



## pandas.DataFrame.xs

In [111]:
# Reference: docstring for DataFrame.xs (used above for cross-sections on index levels 0 and 1).
help(pd.DataFrame.xs)

Help on function xs in module pandas.core.generic:

xs(self, key, axis=0, level=None, drop_level=True)
    Returns a cross-section (row(s) or column(s)) from the
    Series/DataFrame. Defaults to cross-section on the rows (axis=0).
    
    Parameters
    ----------
    key : object
        Some label contained in the index, or partially in a MultiIndex
    axis : int, default 0
        Axis to retrieve cross-section on
    level : object, defaults to first n levels (n=1 or len(key))
        In case of a key partially contained in a MultiIndex, indicate
        which levels are used. Levels can be referred by label or position.
    drop_level : boolean, default True
        If False, returns object with same levels as self.
    
    Examples
    --------
    >>> df
       A  B  C
    a  4  5  2
    b  4  0  9
    c  9  7  3
    >>> df.xs('a')
    A    4
    B    5
    C    2
    Name: a
    >>> df.xs('C', axis=1)
    a    2
    b    9
    c    3
    Name: C
    
    >>> df
            

## itertools.product

In [112]:
# Reference: docstring for itertools.product (used above to generate the index and header tuples).
help(itertools.product)

Help on class product in module itertools:

class product(builtins.object)
 |  product(*iterables, repeat=1) --> product object
 |  
 |  Cartesian product of input iterables.  Equivalent to nested for-loops.
 |  
 |  For example, product(A, B) returns the same as:  ((x,y) for x in A for y in B).
 |  The leftmost iterators are in the outermost for-loop, so the output tuples
 |  cycle in a manner similar to an odometer (with the rightmost element changing
 |  on every iteration).
 |  
 |  To compute the product of an iterable with itself, specify the number
 |  of repetitions with the optional repeat keyword argument. For example,
 |  product(A, repeat=4) means the same as product(A, A, A, A).
 |  
 |  product('ab', range(3)) --> ('a',0) ('a',1) ('a',2) ('b',0) ('b',1) ('b',2)
 |  product((0,1), (0,1), (0,1)) --> (0,0,0) (0,0,1) (0,1,0) (0,1,1) (1,0,0) ...
 |  
 |  Methods defined here:
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __iter__(self, /

## slice

In [113]:
# Reference: docstring for the built-in slice type (slice(None) is bound to `All` above).
help(slice)

Help on class slice in module builtins:

class slice(object)
 |  slice(stop)
 |  slice(start, stop[, step])
 |  
 |  Create a slice object.  This is used for extended slicing (e.g. a[0:10:2]).
 |  
 |  Methods defined here:
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __le__(self, value, /)
 |      Return self<=value.
 |  
 |  __lt__(self, value, /)
 |      Return self<value.
 |  
 |  __ne__(self, value, /)
 |      Return self!=value.
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.
 |  
 |  __reduce__(...)
 |      Return state information for pickling.
 |  
 |  __repr__(self, /)
 |      Return repr(self).
 |  
 |  indices(...)
 |      S.indices(len) -> (start, stop, stride)
 |      
 | 