In [1]:
import pandas as pd
import numpy as np

# First, Series

In [2]:
# Two-level row index: each position pairs a 'col' label with a 'lev' label.
index = pd.MultiIndex.from_arrays(
    [
        ['col1', 'col2', 'col3'],
        ['lev1', 'lev2', 'lev3'],
    ],
    names=['col', 'lev'],
)

# A named Series holding 0, 1, 2 on top of that MultiIndex.
pd_Series = pd.Series(
    data=np.arange(3),
    index=index,
    name='column',
)
print(pd_Series)

col   lev 
col1  lev1    0
col2  lev2    1
col3  lev3    2
Name: column, dtype: int64


In [3]:
# The three building blocks of a Series: its values, its name and its index.
print(
    pd_Series.values,
    pd_Series.name,
    pd_Series.index,
    sep='\n',
)

[0 1 2]
column
MultiIndex([('col1', 'lev1'),
            ('col2', 'lev2'),
            ('col3', 'lev3')],
           names=['col', 'lev'])


In [4]:
# apply: the function is applied to each value of the Series (here: squaring).
pd_Series.apply(lambda value: value ** 2)

col   lev 
col1  lev1    0
col2  lev2    1
col3  lev3    4
Name: column, dtype: int64

In [5]:
# map: each value is replaced by the entry it keys in the dict.
pd_Series.map(
    {
        0: 'real0',
        1: 'real1',
        2: 'real2',
    }
)

col   lev 
col1  lev1    real0
col2  lev2    real1
col3  lev3    real2
Name: column, dtype: object

In [6]:
# Convert the values to strings so the .str accessor is available,
# then replace every '0' with '00'.
(
    pd_Series
    .astype('str')
    .str.replace('0', '00')
)

col   lev 
col1  lev1    00
col2  lev2     1
col3  lev3     2
Name: column, dtype: object

In [7]:
# Iterating a groupby yields (key, group) pairs; the list of labels passed to
# groupby assigns rows 0 and 2 to 'g1' and row 1 to 'g2'.
[(key, group) for key, group in pd_Series.groupby(['g1', 'g2', 'g1'])]

[('g1',
  col   lev 
  col1  lev1    0
  col3  lev3    2
  Name: column, dtype: int64),
 ('g2',
  col   lev 
  col2  lev2    1
  Name: column, dtype: int64)]

In [8]:
# Think of each group as a (key, group) pair: the key becomes a label in the
# result's index, and the function passed to apply acts on the group itself.
pd_Series.groupby(['g1', 'g2', 'g1']).apply(lambda group: max(group))

g1    2
g2    1
Name: column, dtype: int64

# Next, category

1. `pd.Categorical` is a dedicated array type whose dtype is `CategoricalDtype`; it exposes its own attributes and methods (e.g. `.categories`, `.reorder_categories`).
2. When a `Categorical` is assigned to a Series or a DataFrame column and then accessed, it becomes a Series with dtype `'category'` (the same thing as `CategoricalDtype`); use the `.cat` accessor to reach the `Categorical`'s attributes and methods.
3. You can also build a `CategoricalDtype` directly, for example to sort by a custom order.

In [9]:
# A standalone Categorical array; categories are inferred from the values.
col4 = pd.Categorical(
    values=['first', 'second', 'third'],
)

In [11]:
# Show the Python class of the object returned by pd.Categorical.
type(col4)

pandas.core.arrays.categorical.Categorical

In [12]:
# The dtype of a Categorical carries its categories and its ordered flag.
col4.dtype

CategoricalDtype(categories=['first', 'second', 'third'], ordered=False)

In [14]:
# The category labels of col4, exposed as an Index.
col4.categories

Index(['first', 'second', 'third'], dtype='object')

In [15]:
# Reordering changes only the order of the categories; the values shown in
# the result stay as they were.
col4.reorder_categories(
    new_categories=['second', 'first', 'third'],
)

['first', 'second', 'third']
Categories (3, object): ['second', 'first', 'third']

In [16]:
# One column per kind of data: ints, floats, strings, the Categorical built
# above, and a column mixing several Python types.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3],
        'col2': [0.3, 0.4, 0.5],
        'col3': ['first', 'second', 'third'],
        'col4': col4,
        'col5': [1, 0.4, 'first'],
    }
)

In [20]:
# Once stored in a DataFrame, the Categorical is accessed as a Series whose
# dtype is 'category'.
print(
    type(df['col4']),
    df['col4'].dtype,
    sep='\n',
)

<class 'pandas.core.series.Series'>
category


In [21]:
# On a Series, the Categorical's attributes and methods are reached via .cat.
print(
    df.loc[:, 'col4'].cat.categories,
    df.loc[:, 'col4'].cat.reorder_categories(['second', 'first', 'third']),
    sep='\n',
)

Index(['first', 'second', 'third'], dtype='object')
0     first
1    second
2     third
Name: col4, dtype: category
Categories (3, object): ['second', 'first', 'third']


In [23]:
# Custom sort order via an ordered CategoricalDtype: categories compare by
# their position in the list, not by numeric value, so here 2 < 1 < 3.
custom_order = [2, 1, 3]
custom_dtype = pd.CategoricalDtype(categories=custom_order, ordered=True)

# Cast and sort in one chain and rebind the result instead of mutating the
# frame with inplace=True; sort_values sorts ascending by default.
df = (
    df
    .astype({'col1': custom_dtype})
    .sort_values(by='col1')
)
df

Unnamed: 0,col1,col2,col3,col4,col5
1,2,0.4,second,second,0.4
0,1,0.3,first,first,1
2,3,0.5,third,third,first
