In [2]:
import pandas as pd 
import numpy as np 

## Index objects

In [28]:
# Example frame: 5 rows x 3 columns of standard-normal draws, columns A-C.
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    columns=list("ABC"),
)
df

Unnamed: 0,A,B,C
0,1.445456,0.622485,-1.255156
1,0.575118,-1.104595,1.40774
2,-0.137163,0.582018,-1.380512
3,-1.260781,0.554123,-0.316706
4,-0.967927,-1.069614,-0.226388


In [29]:
# The column labels of a DataFrame are themselves stored in an Index object.
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [30]:
# The row labels are an Index too; no index was passed at construction, so
# pandas supplied a default RangeIndex.
df.index

RangeIndex(start=0, stop=5, step=1)

## Multi-indexes

In [33]:
# Two-level column index: the cartesian product of the c1 and c2 labels.
cidx = pd.MultiIndex.from_product(
    [list("AB"), list("xy")],
    names=["c1", "c2"],
)
# Frame of uniform random values whose columns carry the two-level index.
pd.DataFrame(data=np.random.rand(4, 4), columns=cidx)

c1,A,A,B,B
c2,x,y,x,y
0,0.572727,0.324521,0.113746,0.733312
1,0.171528,0.763045,0.652714,0.633551
2,0.09532,0.072354,0.22911,0.513907
3,0.300453,0.919104,0.501841,0.900736


In [34]:
# Two-level row index, built the same way as the column index `cidx`.
ridx = pd.MultiIndex.from_product(
    [list("mn"), list("lp")],
    names=["r1", "r2"],
)
# 4x4 frame of uniform random values with MultiIndexes on both axes.
data = pd.DataFrame(
    data=np.random.rand(4, 4),
    index=ridx,
    columns=cidx,
)

In [35]:
# Display the frame that has a MultiIndex on both the rows and the columns.
data

Unnamed: 0_level_0,c1,A,A,B,B
Unnamed: 0_level_1,c2,x,y,x,y
r1,r2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
m,l,0.225619,0.707707,0.631929,0.057536
m,p,0.976207,0.545764,0.04468,0.399048
n,l,0.786445,0.073083,0.41288,0.771617
n,p,0.677914,0.243224,0.854608,0.786713


## Reshaping data

In [14]:
# Load the long-format example; the CSV's first column becomes the row index.
df = pd.read_csv(
    "../data/reshaping.csv",
    index_col=0,
)
df

Unnamed: 0,country,year,type,count
0,A,1999,cases,0.7K
1,A,1999,pop,19M
2,A,2000,cases,2K
3,A,2000,pop,20M
4,B,1999,cases,37K
5,B,1999,pop,172M
6,B,2000,cases,80K
7,B,2000,pop,174M
8,C,1999,cases,212K
9,C,1999,pop,1T


In [15]:
# Long -> wide: one row per (country, year); each distinct `type` value
# becomes its own column, filled from `count`.
df_wide = df.pivot(index=["country", "year"], columns="type", values="count")
df_wide

Unnamed: 0_level_0,type,cases,pop
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1999,0.7K,19M
A,2000,2K,20M
B,1999,37K,172M
B,2000,80K,174M
C,1999,212K,1T
C,2000,213K,1T


In [16]:
# Row labels of the pivoted frame: a MultiIndex over (country, year).
df_wide.index

MultiIndex([('A', 1999),
            ('A', 2000),
            ('B', 1999),
            ('B', 2000),
            ('C', 1999),
            ('C', 2000)],
           names=['country', 'year'])

In [17]:
# Column labels of the pivoted frame; the Index is named after the pivoted
# column ("type").
df_wide.columns

Index(['cases', 'pop'], dtype='object', name='type')

In [18]:
# Flatten the pivoted frame: move the (country, year) index levels back into
# ordinary columns and drop the leftover "type" name from the column axis.
(
    df_wide
    .reset_index()
    .rename_axis(columns=None)
)

Unnamed: 0,country,year,cases,pop
0,A,1999,0.7K,19M
1,A,2000,2K,20M
2,B,1999,37K,172M
3,B,2000,80K,174M
4,C,1999,212K,1T
5,C,2000,213K,1T


In [23]:
# Load the example whose `rate` column packs two values into one string;
# the CSV's first column becomes the row index.
df = pd.read_csv(
    "../data/rate.csv",
    index_col=0,
)
df

Unnamed: 0,country,year,rate
0,A,1999,0.7K/19M
1,A,2000,2K/20M
2,B,1999,37K/172M
3,B,2000,80K/174M
4,C,1999,212K/1T
5,C,2000,213K/1T


In [25]:
# Replace each "a/b" rate string with the list of its two parts.
df.assign(
    rate=lambda frame: frame["rate"].str.split("/"),
)

Unnamed: 0,country,year,rate
0,A,1999,"[0.7K, 19M]"
1,A,2000,"[2K, 20M]"
2,B,1999,"[37K, 172M]"
3,B,2000,"[80K, 174M]"
4,C,1999,"[212K, 1T]"
5,C,2000,"[213K, 1T]"


In [26]:
# Split each "cases/pop" rate string into its parts, give every part its own
# row, label the parts, then pivot the labels back out into columns.
( df
  .assign(
    rate = lambda d: d.rate.str.split("/")
  )
  # explode() repeats the original row label for every list element
  .explode("rate")
  .assign(
    # Label each part by its position within its source row (1st -> "cases",
    # 2nd -> "pop") rather than tiling a list sized from the total row count,
    # which only lines up when every rate splits into exactly two parts.
    # Assumes the row labels of df are unique.
    type = lambda d: (
      d.groupby(level=0).cumcount()
       .map({0: "cases", 1: "pop"})
       .to_numpy()
    )
  )
  .pivot(index=["country","year"], columns="type", values="rate")
  .reset_index()
)

type,country,year,cases,pop
0,A,1999,0.7K,19M
1,A,2000,2K,20M
2,B,1999,37K,172M
3,B,2000,80K,174M
4,C,1999,212K,1T
5,C,2000,213K,1T


In [27]:
# Intermediate view of the reshaping pipeline, stopped before the pivot:
# one row per part of each "cases/pop" rate string, with a `type` label.
( df
  .assign(
    rate = lambda d: d.rate.str.split("/")
  )
  # explode() repeats the original row label for every list element
  .explode("rate")
  .assign(
    # Label each part by its position within its source row (1st -> "cases",
    # 2nd -> "pop") rather than tiling a list sized from the total row count,
    # which only lines up when every rate splits into exactly two parts.
    # Assumes the row labels of df are unique.
    type = lambda d: (
      d.groupby(level=0).cumcount()
       .map({0: "cases", 1: "pop"})
       .to_numpy()
    )
  )
)

Unnamed: 0,country,year,rate,type
0,A,1999,0.7K,cases
0,A,1999,19M,pop
1,A,2000,2K,cases
1,A,2000,20M,pop
2,B,1999,37K,cases
2,B,1999,172M,pop
3,B,2000,80K,cases
3,B,2000,174M,pop
4,C,1999,212K,cases
4,C,1999,1T,pop


## Split-Apply-Combine