# Multi-index
## Two ways to set up a MultiIndex: `set_index` after loading, or `index_col` in `read_csv`

In [1]:
import pandas as pd

In [2]:
# Load the Big Mac price data, parsing the Date column as datetimes
# (the dates are written as YYYY-MM-DD), then preview the first five rows.
bigmac = pd.read_csv(
    "bigmac.csv",
    parse_dates=["Date"],
    date_format="%Y-%m-%d",
)
bigmac.head()

Unnamed: 0,Date,Country,Price in US Dollars
0,2000-04-01,Argentina,2.5
1,2000-04-01,Australia,1.541667
2,2000-04-01,Brazil,1.648045
3,2000-04-01,Canada,1.938776
4,2000-04-01,Switzerland,3.470588


In [3]:
# Check the column types: Date should be datetime64 (parsed by read_csv), not object.
bigmac.dtypes

Date                   datetime64[ns]
Country                        object
Price in US Dollars           float64
dtype: object

In [4]:
# Summary of the frame: row count, non-null count and dtype per column, memory usage.
bigmac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1386 entries, 0 to 1385
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 1386 non-null   datetime64[ns]
 1   Country              1386 non-null   object        
 2   Price in US Dollars  1386 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 32.6+ KB


In [5]:
# In the data, both Country and Date values repeat many times.
# Convention for ordering the index levels: put the column with the fewest unique
# values outermost, then columns with progressively more unique values.

In [9]:
# Promote Date and Country from columns to a two-level row index, then sort by it.
# (The same index can be built at load time with read_csv(..., index_col=[...]).)
bigmac = (
    bigmac
    .set_index(keys=["Date", "Country"])
    .sort_index()
)

In [10]:
# Count distinct values per column. Date and Country are index levels now,
# so only the price column is reported.
bigmac.nunique()

Price in US Dollars    1350
dtype: int64

In [11]:
# A MultiIndex holds one tuple per row, here (Date, Country).
bigmac.index

MultiIndex([('2000-04-01',            'Argentina'),
            ('2000-04-01',            'Australia'),
            ('2000-04-01',               'Brazil'),
            ('2000-04-01',              'Britain'),
            ('2000-04-01',               'Canada'),
            ('2000-04-01',                'Chile'),
            ('2000-04-01',                'China'),
            ('2000-04-01',       'Czech Republic'),
            ('2000-04-01',              'Denmark'),
            ('2000-04-01',            'Euro area'),
            ...
            ('2020-07-01',               'Sweden'),
            ('2020-07-01',          'Switzerland'),
            ('2020-07-01',               'Taiwan'),
            ('2020-07-01',             'Thailand'),
            ('2020-07-01',               'Turkey'),
            ('2020-07-01',              'Ukraine'),
            ('2020-07-01', 'United Arab Emirates'),
            ('2020-07-01',        'United States'),
            ('2020-07-01',              'Uruguay

## extract index values
Use `index.get_level_values` with either a level name (string) or a level position (int).

In [12]:
# Build the (Date, Country) MultiIndex directly while reading the file.
# There is no sort_index() here, so rows stay in the order they appear in the CSV.
bigmac = pd.read_csv(
    "bigmac.csv",
    parse_dates=["Date"],
    date_format="%Y-%m-%d",
    index_col=["Date", "Country"],
)

In [13]:
# Select a level by its name: returns that level's value for every row (repeats included).
bigmac.index.get_level_values("Date")

DatetimeIndex(['2000-04-01', '2000-04-01', '2000-04-01', '2000-04-01',
               '2000-04-01', '2000-04-01', '2000-04-01', '2000-04-01',
               '2000-04-01', '2000-04-01',
               ...
               '2020-07-01', '2020-07-01', '2020-07-01', '2020-07-01',
               '2020-07-01', '2020-07-01', '2020-07-01', '2020-07-01',
               '2020-07-01', '2020-07-01'],
              dtype='datetime64[ns]', name='Date', length=1386, freq=None)

In [14]:
# Select a level by position instead: 1 is the second level, Country.
bigmac.index.get_level_values(1)

Index(['Argentina', 'Australia', 'Brazil', 'Canada', 'Switzerland', 'Chile',
       'China', 'Czech Republic', 'Denmark', 'Euro area',
       ...
       'Singapore', 'Sweden', 'Thailand', 'Turkey', 'Taiwan', 'Ukraine',
       'Uruguay', 'United States', 'Vietnam', 'South Africa'],
      dtype='object', name='Country', length=1386)

## rename index levels
Use `set_names` to rename the levels of the index (the values inside each level are unchanged).

In [16]:
# Rename level 0 from "Date" to "Time". The result is a new index that is only
# displayed here; it is not assigned, so bigmac keeps its original level names.
# Assign it to bigmac.index to make the rename stick.
# `names` and `level` also accept lists, to rename several levels in one call.
bigmac.index.set_names(names="Time",level=0)

MultiIndex([('2000-04-01',      'Argentina'),
            ('2000-04-01',      'Australia'),
            ('2000-04-01',         'Brazil'),
            ('2000-04-01',         'Canada'),
            ('2000-04-01',    'Switzerland'),
            ('2000-04-01',          'Chile'),
            ('2000-04-01',          'China'),
            ('2000-04-01', 'Czech Republic'),
            ('2000-04-01',        'Denmark'),
            ('2000-04-01',      'Euro area'),
            ...
            ('2020-07-01',      'Singapore'),
            ('2020-07-01',         'Sweden'),
            ('2020-07-01',       'Thailand'),
            ('2020-07-01',         'Turkey'),
            ('2020-07-01',         'Taiwan'),
            ('2020-07-01',        'Ukraine'),
            ('2020-07-01',        'Uruguay'),
            ('2020-07-01',  'United States'),
            ('2020-07-01',        'Vietnam'),
            ('2020-07-01',   'South Africa')],
           names=['Time', 'Country'], length=1386)

## sort_index methods
Pass `ascending` (a single bool, or a list with one bool per index level) to control the sort direction.

In [17]:
# With no arguments every level is sorted ascending: by Date first, then by Country.
bigmac.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2000-04-01,Argentina,2.500000
2000-04-01,Australia,1.541667
2000-04-01,Brazil,1.648045
2000-04-01,Britain,3.002000
2000-04-01,Canada,1.938776
...,...,...
2020-07-01,Ukraine,2.174714
2020-07-01,United Arab Emirates,4.015846
2020-07-01,United States,5.710000
2020-07-01,Uruguay,4.327418


In [18]:
# One flag per index level: Date newest-first, Country A-Z within each date.
bigmac.sort_index(ascending=[False,True])

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2020-07-01,Argentina,3.509232
2020-07-01,Australia,4.578450
2020-07-01,Azerbaijan,2.324897
2020-07-01,Bahrain,3.713035
2020-07-01,Brazil,3.913528
...,...,...
2000-04-01,Sweden,2.714932
2000-04-01,Switzerland,3.470588
2000-04-01,Taiwan,2.287582
2000-04-01,Thailand,1.447368


## iloc and loc on multi-index
To reduce ambiguity, pass the index labels to `loc` as a tuple, `(level_0_label, level_1_label)`, inside the `[]`.

In [36]:
# Reload with the (Date, Country) MultiIndex and sort it, so that the label
# lookups and slices below operate on an ordered index.
bigmac = (
    pd.read_csv(
        "bigmac.csv",
        parse_dates=["Date"],
        date_format="%Y-%m-%d",
        index_col=["Date", "Country"],
    )
    .sort_index()
)

In [37]:
# Position-based lookup ignores the labels: 1 is the second row of the sorted frame.
bigmac.iloc[1]

Price in US Dollars    1.541667
Name: (2000-04-01 00:00:00, Australia), dtype: float64

In [38]:
# Two bare labels: here pandas matches them against the two index levels
# (Date, Country) and returns that single row. The same spelling with a column
# name in the second position selects a column instead, which is what makes
# this form ambiguous to read.
bigmac.loc["2020-07-1","Brazil"]

Price in US Dollars    3.913528
Name: (2020-07-01 00:00:00, Brazil), dtype: float64

In [39]:
# Here the second label is a column name, so this picks the price column for
# every country on that date; iloc[0:5] then keeps the first five entries.
bigmac.loc["2020-07-1","Price in US Dollars"].iloc[0:5]

Country
Argentina     3.509232
Australia     4.578450
Azerbaijan    2.324897
Bahrain       3.713035
Brazil        3.913528
Name: Price in US Dollars, dtype: float64

In [40]:
# Python parses loc[("a", "b")] exactly like loc["a", "b"], so parentheses alone do
# not remove the row/column ambiguity. Passing the tuple as the row key together
# with an explicit ":" for the columns does: the tuple can then only refer to the
# index levels (Date, Country). The selected row is the same as before.
bigmac.loc[("2020-07-1","Brazil"), :]

Price in US Dollars    3.913528
Name: (2020-07-01 00:00:00, Brazil), dtype: float64

In [41]:
# Slice rows between two (Date, Country) keys. bigmac was sorted on load;
# NOTE(review): label slicing on a MultiIndex is expected to need a sorted index
# and to include both endpoints - confirm against the pandas docs.
bigmac.loc[("2012-01-01", "Brazil"): ("2013-07-01", "Turkey")]

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2012-01-01,Brazil,5.678670
2012-01-01,Britain,3.823395
2012-01-01,Canada,4.632940
2012-01-01,Chile,4.050983
2012-01-01,China,2.438445
...,...,...
2013-07-01,Sweden,6.156874
2013-07-01,Switzerland,6.719041
2013-07-01,Taiwan,2.630834
2013-07-01,Thailand,2.845723


# transpose method

Columns can also have a MultiIndex.

In [45]:
# Both endpoints are (Date, Country) keys on the same date, so the slice covers
# the countries from China through Denmark on 2018-01-01.
start = ("2018-01-01", "China")
end = ("2018-01-01", "Denmark")

# transpose() swaps the axes: the two-level row index becomes a two-level column index.
bigmac.loc[start:end].transpose()

Date,2018-01-01,2018-01-01,2018-01-01,2018-01-01,2018-01-01
Country,China,Colombia,Costa Rica,Czech Republic,Denmark
Price in US Dollars,3.171642,3.832468,4.027932,3.807779,4.93202


# Stack method
- The `stack` method moves the column index into the row index and returns a MultiIndex Series.

- The `unstack` method moves a row index to the column index (the inverse of the `stack` method).

In [46]:
# Load the world statistics with a (year, country) MultiIndex, sorted,
# and preview the first five rows.
world = (
    pd.read_csv(
        "worldstats.csv",
        index_col=["year", "country"],
    )
    .sort_index()
)
world.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1
1960,Afghanistan,8994793.0,537777800.0
1960,Algeria,11124892.0,2723638000.0
1960,Australia,10276477.0,18567590000.0
1960,Austria,7047539.0,6592694000.0
1960,"Bahamas, The",109526.0,169802300.0


In [47]:
# Move the column labels (Population, GDP) into a third, innermost row-index level.
# The result is a Series with one value per (year, country, column) combination.
world.stack()

year  country                
1960  Afghanistan  Population    8.994793e+06
                   GDP           5.377778e+08
      Algeria      Population    1.112489e+07
                   GDP           2.723638e+09
      Australia    Population    1.027648e+07
                                     ...     
2015  World        GDP           7.343364e+13
      Zambia       Population    1.621177e+07
                   GDP           2.120156e+10
      Zimbabwe     Population    1.560275e+07
                   GDP           1.389294e+10
Length: 22422, dtype: float64

In [48]:
# Wrap the stacked Series in a one-column DataFrame (the column is labelled 0).
world.stack().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1
1960,Afghanistan,Population,8.994793e+06
1960,Afghanistan,GDP,5.377778e+08
1960,Algeria,Population,1.112489e+07
1960,Algeria,GDP,2.723638e+09
1960,Australia,Population,1.027648e+07
...,...,...,...
2015,World,GDP,7.343364e+13
2015,Zambia,Population,1.621177e+07
2015,Zambia,GDP,2.120156e+10
2015,Zimbabwe,Population,1.560275e+07


# pivot method: make the table more compact
arguments: index,columns,values

## The pivot Method
- The `pivot` method reshapes data from a tall format to a wide format.
- Ask yourself which direction the data will expand in if you add more entries.
- A tall/long format expands down. A wide format expands out.
- The `index` parameter sets the column whose values will become the row labels (the index) of the pivoted **DataFrame**.
- The `columns` parameter sets the column whose values will be the columns in the pivoted **DataFrame**.
- The `values` parameter sets the values of the pivoted **DataFrame**. Pandas will populate the correct values based on the index and column intersections.

In [49]:
# Load the sales data in tall format: one row per (Date, Salesman) with its Revenue.
# No parse_dates is given, so Date is left as read from the file.
sales = pd.read_csv("salesmen.csv")
sales

Unnamed: 0,Date,Salesman,Revenue
0,1/1/2025,Sharon,7172
1,1/2/2025,Sharon,6362
2,1/3/2025,Sharon,5982
3,1/4/2025,Sharon,7917
4,1/5/2025,Sharon,7837
...,...,...,...
1820,12/27/2025,Oscar,835
1821,12/28/2025,Oscar,3073
1822,12/29/2025,Oscar,6424
1823,12/30/2025,Oscar,7088


In [50]:
# Target (wide) layout: one row per Date, one column per Salesman, Revenue in the cells.
# Adding a new salesman adds a column, so the table grows wider rather than taller.
#
# Salesman   Alexander  Dave  Oscar  Ronald  Sharon
# Date
# 1/1/2025        4430  1864   5250    2639    7172
#
# Note: the Date labels are ordered as text (1/1/2025, 1/10/2025, 1/11/2025, ...),
# not chronologically.

sales.pivot(index="Date", columns="Salesman", values="Revenue")

Salesman,Alexander,Dave,Oscar,Ronald,Sharon
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1/1/2025,4430,1864,5250,2639,7172
1/10/2025,301,7105,7663,8267,7543
1/11/2025,9489,6851,8888,1340,1053
1/12/2025,8719,7147,3092,279,4362
1/13/2025,2349,6160,6139,7540,6812
...,...,...,...,...,...
9/5/2025,2439,211,7743,4252,992
9/6/2025,7585,7293,5072,1112,556
9/7/2025,6669,9774,5230,3608,6499
9/8/2025,3058,8194,7755,5762,9621


# melt method: the opposite of pivot
- The `melt` method is ideal when you have multiple columns storing the *same* data point.
- Ask yourself whether the column's values are a *type* of the column header. If they're not, the data is likely stored in a wide format.
- The `id_vars` parameter accepts the column whose values will be repeated for every melted column.
- The `var_name` parameter sets the name of the new column for the varying values (the former column names).
- The `value_name` parameter sets the new name of the values column (holding the values from the original **DataFrame**).

In [51]:
# Load the quarterly revenue in wide format: one row per Salesman, one column per quarter.
quarters = pd.read_csv("quarters.csv")
quarters

Unnamed: 0,Salesman,Q1,Q2,Q3,Q4
0,Boris,602908,233879,354479,32704
1,Piers,43790,514863,297151,544493
2,Tommy,392668,113579,430882,247231
3,Travis,834663,266785,749238,570524
4,Cindy,580935,411379,110390,651572
5,Rob,656644,70803,375948,321388
6,Mike,486141,600753,742716,404995
7,Stacy,479662,742806,770712,2501
8,Alexandra,992673,879183,37945,293710


In [52]:
# Keep Salesman as the identifier and fold Q1..Q4 into two columns:
# Quarter holds the former column name, Revenue holds the corresponding value.
quarters.melt(id_vars="Salesman", var_name="Quarter", value_name="Revenue")

Unnamed: 0,Salesman,Quarter,Revenue
0,Boris,Q1,602908
1,Piers,Q1,43790
2,Tommy,Q1,392668
3,Travis,Q1,834663
4,Cindy,Q1,580935
5,Rob,Q1,656644
6,Mike,Q1,486141
7,Stacy,Q1,479662
8,Alexandra,Q1,992673
9,Boris,Q2,233879


## The pivot_table Method
- The `pivot_table` method operates similarly to the Pivot Table feature in Excel.
- A pivot table is a table whose values are aggregations of groups of values from another table.
- The `values` parameter accepts the numeric column whose values will be aggregated.
- The `aggfunc` parameter declares the aggregation function (the default is mean/average).
- The `index` parameter sets the index labels of the pivot table. MultiIndexes are permitted.
- The `columns` parameter sets the column labels of the pivot table. MultiIndexes are permitted.

In [53]:
# Load the food purchases (one row per purchase) and preview the first five rows.
foods = pd.read_csv("foods.csv")
foods.head()

Unnamed: 0,First Name,Gender,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15.66
1,Eric,Male,Stamford,Daily,Chalupa,10.56
2,Charles,Male,New York,Never,Sushi,42.14
3,Anna,Female,Philadelphia,Once,Ice Cream,11.01
4,Deborah,Female,Philadelphia,Daily,Chalupa,23.49


In [54]:
# Only the last expression in a cell is displayed; the earlier calls are evaluated
# and their results discarded. Move a call to the end of the cell to view its result.

# Average spend per gender. The two calls are equivalent: aggfunc defaults to the mean.
foods.pivot_table(values="Spend", index="Gender")
foods.pivot_table(values="Spend", index="Gender", aggfunc="mean")
# Total spend per gender.
foods.pivot_table(values="Spend", index="Gender", aggfunc="sum")

# Total spend per item.
foods.pivot_table(values="Spend", index="Item", aggfunc="sum")

# A list for `index` produces a MultiIndex on the rows: (Gender, Item).
foods.pivot_table(values="Spend", index=["Gender", "Item"], aggfunc="sum")

# Same rows, with one column per City.
foods.pivot_table(values="Spend", index=["Gender", "Item"], columns="City", aggfunc="sum")

# A list for `columns` produces a MultiIndex on the columns: (Gender, City).
foods.pivot_table(values="Spend", index="Item", columns=["Gender", "City"], aggfunc="sum")

# Same layout with other aggregations: mean, count, max.
foods.pivot_table(values="Spend", index="Item", columns=["Gender", "City"], aggfunc="mean")

foods.pivot_table(values="Spend", index="Item", columns=["Gender", "City"], aggfunc="count")

foods.pivot_table(values="Spend", index="Item", columns=["Gender", "City"], aggfunc="max")

# Displayed result: the smallest spend for each Item within each (Gender, City) group.
foods.pivot_table(values="Spend", index="Item", columns=["Gender", "City"], aggfunc="min")

Gender,Female,Female,Female,Male,Male,Male
City,New York,Philadelphia,Stamford,New York,Philadelphia,Stamford
Item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Burger,2.25,1.97,6.24,5.43,1.71,2.83
Burrito,1.02,1.04,1.18,15.9,8.58,3.64
Chalupa,1.96,9.35,9.09,11.61,1.94,10.56
Donut,3.15,2.13,1.68,1.49,1.26,6.63
Ice Cream,13.39,7.61,8.8,14.06,4.89,3.43
Sushi,2.52,11.68,8.2,3.28,2.01,32.15
