# 8.1 Hierarchical Indexing

Hierarchical indexing is an important feature of pandas that enables you to have mul‐
tiple (two or more) index levels on an axis.

In [1]:
import pandas as pd
import numpy as np


In [2]:
data = pd.Series(np.random.randn(9),index=[['a','a','a','b','b','c','c','d','d'],[1,2,3,1,3,1,2,2,3]])

In [3]:
data

a  1    1.862453
   2    0.906430
   3   -0.259103
b  1    0.060565
   3   -0.104923
c  1    0.123849
   2    0.395387
d  2   -0.170314
   3   -0.105657
dtype: float64

In [4]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

In [5]:
data['b']

1    0.060565
3   -0.104923
dtype: float64

In [6]:
data['a']

1    1.862453
2    0.906430
3   -0.259103
dtype: float64

In [7]:
data.loc[['a','b']]

a  1    1.862453
   2    0.906430
   3   -0.259103
b  1    0.060565
   3   -0.104923
dtype: float64

In [8]:
data.loc[['a']]

a  1    1.862453
   2    0.906430
   3   -0.259103
dtype: float64

In [9]:
data.loc[:,2]

a    0.906430
c    0.395387
d   -0.170314
dtype: float64

## To change to dataframe

In [10]:
dataFrame = data.unstack()

In [11]:
dataFrame

Unnamed: 0,1,2,3
a,1.862453,0.90643,-0.259103
b,0.060565,,-0.104923
c,0.123849,0.395387,
d,,-0.170314,-0.105657


In [12]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],columns=[['Ohio', 'Ohio', 'Colorado'],['Green', 'Red', 'Green']])

In [13]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [14]:
frame.index.names= ['keys', 'locks']

In [15]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
keys,locks,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [16]:
frame.columns.names = ['chabi', 'taala']

In [17]:
frame

Unnamed: 0_level_0,chabi,Ohio,Ohio,Colorado
Unnamed: 0_level_1,taala,Green,Red,Green
keys,locks,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


## Indexing with a DataFrame’s columns

In [18]:
frame = pd.DataFrame({
    'a': range(7),
    'b': range(7,0,-1),
    'c': ['one','one','one','two','two','two','two'],
    'd': [0,1,2,0,1,2,3]
})

In [19]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


##### DataFrame’s set_index function will create a new DataFrame using one or more of its columns as the index

In [20]:
frame.set_index(['c','d'])

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


#### reset_index , on the other hand, does the opposite of set_index ; the hierarchical index levels are moved into the columns

In [21]:
frame.reset_index

<bound method DataFrame.reset_index of    a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3>

In [22]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


# 8.2 Combining and Merging Datasets- Need to Clear out the basic concepts. Didnt understand a thing!

### Database-Style DataFrame Joins

Merge or join operations combine datasets by linking rows using one or more keys.
These operations are central to relational databases (e.g., SQL-based). The merge
function in pandas is the main entry point for using these algorithms on your data.

In [29]:
df1 = pd.DataFrame({
    'key': ['b','b','a','c','a','a','b'],
    'data1': np.arange(7)
})

In [30]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [31]:
df2 = pd.DataFrame({
    'key': ['a','b','d'],
    'data2': np.arange(3)
})

In [32]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [36]:
# pd.merge(df1,df2)
print(df1)
print(df2)

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6
  key  data2
0   a      0
1   b      1
2   d      2


In [37]:
pd.merge(df1,df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


## 8.3 Reshaping and Pivoting-study again

In [38]:
data = pd.DataFrame(np.arange(6).reshape((2,3)), 
                   index = pd.Index(['Silchar', 'Karimganj'], name='Dist'),
                   columns = pd.Index(['one','two','three'], name='Distance')
                   )

In [39]:
data

Distance,one,two,three
Dist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Silchar,0,1,2
Karimganj,3,4,5


In [40]:
# Using the stack method on this data pivots the columns into the rows, producing a Series
data.stack()

Dist       Distance
Silchar    one         0
           two         1
           three       2
Karimganj  one         3
           two         4
           three       5
dtype: int64

In [43]:
data.unstack()

Distance  Dist     
one       Silchar      0
          Karimganj    3
two       Silchar      1
          Karimganj    4
three     Silchar      2
          Karimganj    5
dtype: int64

In [42]:
data

Distance,one,two,three
Dist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Silchar,0,1,2
Karimganj,3,4,5
