In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read only the CSV columns at positions 0, 2, 3 and 7 and use the
# Symbol column as the row index.
sp500 = pd.read_csv(
    "data/sp500.csv",
    usecols=[0, 2, 3, 7],
    index_col="Symbol",
)
sp500

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.60,15.573
ABBV,Health Care,53.95,2.954
ACN,Information Technology,79.79,8.326
ACE,Financials,102.91,86.897
...,...,...,...
YHOO,Information Technology,35.02,12.768
YUM,Consumer Discretionary,74.77,5.147
ZMH,Health Care,101.84,37.181
ZION,Financials,28.43,30.191


In [5]:
# Build the same 10,000-row data twice: `df` keeps `key` as an ordinary
# column, `df_with_index` promotes `key` to the row index.
np.random.seed(123456)  # fixed seed so the random `foo` values are reproducible
df = pd.DataFrame(
    {
        "foo": np.random.random(10000),
        "key": range(100, 10100),
    }
)
df_with_index = df.set_index(["key"])
df_with_index.head()

Unnamed: 0_level_0,foo
key,Unnamed: 1_level_1
100,0.12697
101,0.966718
102,0.260476
103,0.897237
104,0.37675


In [6]:
# Time a lookup of key 10099 on the un-indexed frame: a boolean mask is built
# from the `key` column and used to filter the rows.
%timeit df[df.key==10099] #slower

379 µs ± 956 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [7]:
# Time a lookup of key 10099 through the row index of `df_with_index` via `.loc`.
%timeit df_with_index.loc[10099] #faster

63.2 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [8]:
# Small two-row example frame with a text column and a numeric column.
temps = pd.DataFrame(
    {
        "City": ["Missoula", "Philadelphia"],
        "Temperature": [70, 80],
    }
)
temps

Unnamed: 0,City,Temperature
0,Missoula,70
1,Philadelphia,80


In [10]:
# The column labels of a DataFrame are themselves held in an Index object.
temps.columns

Index(['City', 'Temperature'], dtype='object')

In [11]:
# Int64Index: an explicit array of integer labels is supplied as the index.
df_i64 = pd.DataFrame(data=np.arange(10, 20), index=np.arange(10))
df_i64.index

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [13]:
# Index objects expose many of the same attributes as NumPy arrays.
(
    df_i64.index.size,
    df_i64.index.shape,
    df_i64.index.ndim,
    df_i64.index.dtype,
)

(10, (10,), 1, dtype('int64'))

In [14]:
# RangeIndex: no index argument is given, so pandas supplies the default one.
df_range = pd.DataFrame(data=np.arange(10, 15))
df_range.index

RangeIndex(start=0, stop=5, step=1)

In [15]:
# IntervalIndex: each row is labelled by the interval between two
# consecutive break points.
df_interval = pd.DataFrame(
    {"A": [1, 2, 3, 4]},
    index=pd.IntervalIndex.from_breaks([0, 0.5, 1.0, 1.5, 2.0]),
)
df_interval

Unnamed: 0,A
"(0.0, 0.5]",1
"(0.5, 1.0]",2
"(1.0, 1.5]",3
"(1.5, 2.0]",4


In [16]:
# Show the index object on its own rather than the whole frame.
df_interval.index

IntervalIndex([(0.0, 0.5], (0.5, 1.0], (1.0, 1.5], (1.5, 2.0]],
              closed='right',
              dtype='interval[float64]')

In [21]:
# CategoricalIndex: cast column B to the category dtype, then promote it
# to the row index.
df_categorical = (
    pd.DataFrame({"A": np.arange(6), "B": list("aabbca")})
    .astype({"B": "category"})
    .set_index("B")
)
df_categorical

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
b,2
b,3
c,4
a,5


In [22]:
# Show the index object on its own rather than the whole frame.
df_categorical.index

CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, name='B', dtype='category')

In [23]:
# DatetimeIndex
# For the full list of freq aliases, see:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
# The hourly frequency is spelled with lowercase 'h': the uppercase 'H'
# alias is deprecated in recent pandas releases.
rng = pd.date_range('5/1/2017', periods=5, freq='h')
# One random value per timestamp, indexed by the hourly range.
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts.index

DatetimeIndex(['2017-05-01 00:00:00', '2017-05-01 01:00:00',
               '2017-05-01 02:00:00', '2017-05-01 03:00:00',
               '2017-05-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [24]:
# PeriodIndex: the labels are monthly periods (freq 'M').
periods = pd.PeriodIndex(["2017-1", "2017-2", "2017-3"], freq="M")
period_series = pd.Series(data=np.random.randn(periods.size), index=periods)
period_series.index

PeriodIndex(['2017-01', '2017-02', '2017-03'], dtype='period[M]', freq='M')

In [25]:
# reset_index turns the current index into a regular column; preview the
# first five rows of the result.
index_moved_to_col = sp500.reset_index()
index_moved_to_col.head()

Unnamed: 0,Symbol,Sector,Price,Book Value
0,MMM,Industrials,141.14,26.668
1,ABT,Health Care,39.6,15.573
2,ABBV,Health Care,53.95,2.954
3,ACN,Information Technology,79.79,8.326
4,ACE,Financials,102.91,86.897


In [26]:
# Use the Sector column as the row index and preview the first five rows.
index_moved_to_col.set_index("Sector").head()

Unnamed: 0_level_0,Symbol,Price,Book Value
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Industrials,MMM,141.14,26.668
Health Care,ABT,39.6,15.573
Health Care,ABBV,53.95,2.954
Information Technology,ACN,79.79,8.326
Financials,ACE,102.91,86.897


In [19]:
# Hierarchical indexing: passing a list of columns to set_index builds a
# two-level index (Sector first, then Symbol).
reindexed = sp500.reset_index()
multi_fi = reindexed.set_index(["Sector", "Symbol"])
multi_fi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Book Value
Sector,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
Industrials,MMM,141.14,26.668
Health Care,ABT,39.6,15.573
Health Care,ABBV,53.95,2.954
Information Technology,ACN,79.79,8.326
Financials,ACE,102.91,86.897


In [20]:
# MultiIndex: the row index of multi_fi holds (Sector, Symbol) tuples.
multi_fi.index

MultiIndex([(           'Industrials',  'MMM'),
            (           'Health Care',  'ABT'),
            (           'Health Care', 'ABBV'),
            ('Information Technology',  'ACN'),
            (            'Financials',  'ACE'),
            (           'Health Care',  'ACT'),
            ('Information Technology', 'ADBE'),
            (             'Utilities',  'AES'),
            (           'Health Care',  'AET'),
            (            'Financials',  'AFL'),
            ...
            (             'Utilities',  'XEL'),
            ('Information Technology',  'XRX'),
            ('Information Technology', 'XLNX'),
            (            'Financials',   'XL'),
            (           'Industrials',  'XYL'),
            ('Information Technology', 'YHOO'),
            ('Consumer Discretionary',  'YUM'),
            (           'Health Care',  'ZMH'),
            (            'Financials', 'ZION'),
            (           'Health Care',  'ZTS')],
           names=['Sector', 'Symbol'], length=500)

In [21]:
# Number of levels in the hierarchical index.
multi_fi.index.nlevels

2

In [22]:
# First level of the MultiIndex: the distinct Sector labels.
# NOTE(review): the recorded output lists near-duplicate labels
# ('Consumer Discretionary' and 'Consumer Staples' each appear with and
# without a trailing space, and 'Industries' sits next to 'Industrials') —
# looks like the source data needs cleaning; confirm before grouping by Sector.
multi_fi.index.levels[0]

Index(['Consumer Discretionary', 'Consumer Discretionary ', 'Consumer Staples',
       'Consumer Staples ', 'Energy', 'Financials', 'Health Care',
       'Industrials', 'Industries', 'Information Technology', 'Materials',
       'Telecommunications Services', 'Utilities'],
      dtype='object', name='Sector')

In [23]:
# Second level of the MultiIndex: the distinct Symbol labels.
multi_fi.index.levels[1]

Index(['A', 'AA', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACE', 'ACN', 'ACT', 'ADBE',
       ...
       'XLNX', 'XOM', 'XRAY', 'XRX', 'XYL', 'YHOO', 'YUM', 'ZION', 'ZMH',
       'ZTS'],
      dtype='object', name='Symbol', length=500)