In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
# Long-format sample: 12 consecutive days, four variables (A-D) with three
# rows each, and one random value per row.
df = pd.DataFrame(dict(
    date=pd.date_range('2000.01.03', periods=12),
    variable=list('AAABBBCCCDDD'),
    value=np.random.randn(12),
))
df

Unnamed: 0,date,value,variable
0,2000-01-03,1.420308,A
1,2000-01-04,0.841693,A
2,2000-01-05,1.781672,A
3,2000-01-06,-0.941738,B
4,2000-01-07,-2.066605,B
5,2000-01-08,0.88326,B
6,2000-01-09,-0.812537,C
7,2000-01-10,0.240951,C
8,2000-01-11,0.075717,C
9,2000-01-12,0.462799,D


In [9]:
N_ROWS = 3  # number of dates per variable in the demo frame


def unpivot(frame):
    """Reshape a wide frame into long (date, variable, value) format.

    Parameters
    ----------
    frame : pandas.DataFrame
        Wide table whose index supplies the ``date`` entries and whose column
        labels supply the ``variable`` entries.

    Returns
    -------
    pandas.DataFrame
        One row per cell of ``frame`` with columns ``date``, ``variable`` and
        ``value``, ordered column by column.
    """
    N, K = frame.shape
    data = {
        # Column-major ('F') flattening walks down each column in turn ...
        'value': frame.values.ravel('F'),
        # ... so each column label is repeated once per row ...
        'variable': np.asarray(frame.columns).repeat(N),
        # ... and the whole index is repeated once per column.
        'date': np.tile(np.asarray(frame.index), K),
    }
    return pd.DataFrame(data, columns=['date', 'variable', 'value'])


# pandas.util.testing (and its makeTimeDataFrame helper) is deprecated and no
# longer available in pandas 2.x, so the equivalent wide frame is built
# directly: N_ROWS business days starting 2000-01-03, random values in A-D.
wide = pd.DataFrame(np.random.randn(N_ROWS, 4),
                    index=pd.date_range('2000-01-03', periods=N_ROWS, freq='B'),
                    columns=list('ABCD'))
df = unpivot(wide)

In [10]:
df

Unnamed: 0,date,variable,value
0,2000-01-03,A,-0.595101
1,2000-01-04,A,-0.857201
2,2000-01-05,A,-0.784072
3,2000-01-03,B,-1.064984
4,2000-01-04,B,1.470092
5,2000-01-05,B,0.4965
6,2000-01-03,C,-0.949554
7,2000-01-04,C,0.457663
8,2000-01-05,C,0.612305
9,2000-01-03,D,-1.46757


In [11]:
df[df['variable'] == 'A']

Unnamed: 0,date,variable,value
0,2000-01-03,A,-0.595101
1,2000-01-04,A,-0.857201
2,2000-01-05,A,-0.784072


In [12]:
df.pivot(index='date' , columns='variable' , values='value')

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,-0.595101,-1.064984,-0.949554,-1.46757
2000-01-04,-0.857201,1.470092,0.457663,-0.852631
2000-01-05,-0.784072,0.4965,0.612305,-0.081823


In [13]:
df['value2'] = df['value']*2
df

Unnamed: 0,date,variable,value,value2
0,2000-01-03,A,-0.595101,-1.190202
1,2000-01-04,A,-0.857201,-1.714403
2,2000-01-05,A,-0.784072,-1.568143
3,2000-01-03,B,-1.064984,-2.129967
4,2000-01-04,B,1.470092,2.940183
5,2000-01-05,B,0.4965,0.992999
6,2000-01-03,C,-0.949554,-1.899108
7,2000-01-04,C,0.457663,0.915327
8,2000-01-05,C,0.612305,1.224609
9,2000-01-03,D,-1.46757,-2.935141


In [14]:
pivoted = df.pivot(index='date' , columns='variable')
pivoted

Unnamed: 0_level_0,value,value,value,value,value2,value2,value2,value2
variable,A,B,C,D,A,B,C,D
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2000-01-03,-0.595101,-1.064984,-0.949554,-1.46757,-1.190202,-2.129967,-1.899108,-2.935141
2000-01-04,-0.857201,1.470092,0.457663,-0.852631,-1.714403,2.940183,0.915327,-1.705262
2000-01-05,-0.784072,0.4965,0.612305,-0.081823,-1.568143,0.992999,1.224609,-0.163646


In [17]:
pivoted.columns

MultiIndex(levels=[['value', 'value2'], ['A', 'B', 'C', 'D']],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]],
           names=[None, 'variable'])

In [20]:
pivoted['value2']

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,-1.190202,-2.129967,-1.899108,-2.935141
2000-01-04,-1.714403,2.940183,0.915327,-1.705262
2000-01-05,-1.568143,0.992999,1.224609,-0.163646


In [24]:
pivoted['value2'].index

DatetimeIndex(['2000-01-03', '2000-01-04', '2000-01-05'], dtype='datetime64[ns]', name='date', freq=None)

# reshaping by stacking and unstacking

In [26]:
# zip pairs up the elements of the two lists position by position
a = [pair for pair in zip([1, 2, 3], [2, 3, 4])]
a

[(1, 2), (2, 3), (3, 4)]

In [27]:
a=list(zip(*a))
# unzip: zip(*a) regroups the pairs by position, giving one tuple of first
# elements and one tuple of second elements

In [28]:
a

[(1, 2, 3), (2, 3, 4)]

In [29]:
# Every (outer, inner) label pair, with the outer label varying slowest.
tuples = [(outer, inner)
          for outer in ('bar', 'baz', 'foo', 'qux')
          for inner in ('one', 'two')]
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [30]:
index=pd.MultiIndex.from_tuples(tuples , names=['first',
                                               'second'])

df = pd.DataFrame(np.random.randn(8,2),index=index,
                 columns=['A','B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.270466,1.309341
bar,two,-1.401907,-0.582651
baz,one,1.813309,0.112675
baz,two,-0.728536,-0.328811
foo,one,-1.858225,1.814037
foo,two,-0.848902,0.404162
qux,one,0.558393,-2.298928
qux,two,-1.176655,0.994355


In [32]:
df2=df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.270466,1.309341
bar,two,-1.401907,-0.582651
baz,one,1.813309,0.112675
baz,two,-0.728536,-0.328811


In [33]:
# stack() pivots a level of the DataFrame's column labels into the row index
# (here the A/B column labels become the innermost index level).
# When the columns have several levels, pass `level` to choose which to stack.
stacked = df2.stack()
stacked

first  second   
bar    one     A    0.270466
               B    1.309341
       two     A   -1.401907
               B   -0.582651
baz    one     A    1.813309
               B    0.112675
       two     A   -0.728536
               B   -0.328811
dtype: float64

In [36]:
type(stacked)

pandas.core.series.Series

In [38]:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.270466,1.309341
bar,two,-1.401907,-0.582651
baz,one,1.813309,0.112675
baz,two,-0.728536,-0.328811


In [39]:
stacked.unstack(level=1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.270466,-1.401907
bar,B,1.309341,-0.582651
baz,A,1.813309,-0.728536
baz,B,0.112675,-0.328811


In [41]:
stacked.unstack(level=0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.270466,1.813309
one,B,1.309341,0.112675
two,A,-1.401907,-0.728536
two,B,-0.582651,-0.328811


In [42]:
# A level can also be selected by its name (provided the index levels are
# named) instead of by its integer position.
stacked.unstack('second')

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.270466,-1.401907
bar,B,1.309341,-0.582651
baz,A,1.813309,-0.728536
baz,B,0.112675,-0.328811


In [43]:
# stack and unstack sort the index; this index is deliberately built out of
# order (2 before 1) so the cells that follow can show the effect
index = pd.MultiIndex.from_product([[2,1],['a','b']])
index

MultiIndex(levels=[[1, 2], ['a', 'b']],
           labels=[[1, 1, 0, 0], [0, 1, 0, 1]])

In [52]:
df = pd.DataFrame(np.random.randn(4) ,index=index,columns=['A'])
df

Unnamed: 0,Unnamed: 1,A
2,a,0.802713
2,b,-0.724131
1,a,0.229779
1,b,1.823405


In [55]:
df.unstack().index

Int64Index([1, 2], dtype='int64')

In [56]:
df.unstack().stack()

Unnamed: 0,Unnamed: 1,A
1,a,0.229779
1,b,1.823405
2,a,0.802713
2,b,-0.724131


In [57]:
df.sort_index()

Unnamed: 0,Unnamed: 1,A
1,a,0.229779
1,b,1.823405
2,a,0.802713
2,b,-0.724131


# multiple levels

In [58]:
columns = pd.MultiIndex.from_tuples([
    ('A','cat','long'),('B','cat','long'),
    ('A','dog','short'),('B','dog','short')
], names=['exp','animal','hair_length'])

columns

MultiIndex(levels=[['A', 'B'], ['cat', 'dog'], ['long', 'short']],
           labels=[[0, 1, 0, 1], [0, 0, 1, 1], [0, 0, 1, 1]],
           names=['exp', 'animal', 'hair_length'])

In [59]:
df = pd.DataFrame(np.random.randn(4,4) , columns=columns)
df

exp,A,B,A,B
animal,cat,cat,dog,dog
hair_length,long,long,short,short
0,-1.228368,0.658935,0.048362,1.288791
1,-0.020848,0.358853,1.287084,1.707253
2,0.467708,-0.084888,-1.014249,0.103082
3,2.584791,-0.674309,-0.940192,0.624535


In [60]:
df.stack(level=['animal', 'hair_length'])

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
Unnamed: 0_level_1,animal,hair_length,Unnamed: 3_level_1,Unnamed: 4_level_1
0,cat,long,-1.228368,0.658935
0,dog,short,0.048362,1.288791
1,cat,long,-0.020848,0.358853
1,dog,short,1.287084,1.707253
2,cat,long,0.467708,-0.084888
2,dog,short,-1.014249,0.103082
3,cat,long,2.584791,-0.674309
3,dog,short,-0.940192,0.624535


In [61]:
df.stack(level=[1,2]) # same effect as stacking by the level names 'animal' and 'hair_length'


Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
Unnamed: 0_level_1,animal,hair_length,Unnamed: 3_level_1,Unnamed: 4_level_1
0,cat,long,-1.228368,0.658935
0,dog,short,0.048362,1.288791
1,cat,long,-0.020848,0.358853
1,dog,short,1.287084,1.707253
2,cat,long,0.467708,-0.084888
2,dog,short,-1.014249,0.103082
3,cat,long,2.584791,-0.674309
3,dog,short,-0.940192,0.624535


In [62]:
# Missing data: a frame with two-level row labels and two-level column labels,
# filled with random values.
columns = pd.MultiIndex.from_tuples(
    [('A', 'cat'), ('B', 'dog'), ('B', 'cat'), ('A', 'animal')]
)
index = pd.MultiIndex.from_product(
    [['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
    names=['first', 'second'],
)
df = pd.DataFrame(np.random.randn(len(index), len(columns)),
                  index=index, columns=columns)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,B,A
Unnamed: 0_level_1,Unnamed: 1_level_1,cat,dog,cat,animal
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,-0.388996,0.762423,-0.040458,1.270001
bar,two,-0.705756,-0.32801,-0.985353,-0.890746
baz,one,-0.435155,-0.883818,0.424819,-0.414439
baz,two,0.07947,-0.899737,-1.129503,-1.148362
foo,one,0.83437,0.404799,1.151229,-0.471714
foo,two,0.189193,0.866867,-0.409345,1.608045
qux,one,-0.11703,0.275607,-0.575366,-1.445422
qux,two,-0.216466,-0.233188,2.275476,0.20337


In [63]:
df2=df.iloc[[0,1,2,4,5,7]]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,B,A
Unnamed: 0_level_1,Unnamed: 1_level_1,cat,dog,cat,animal
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,-0.388996,0.762423,-0.040458,1.270001
bar,two,-0.705756,-0.32801,-0.985353,-0.890746
baz,one,-0.435155,-0.883818,0.424819,-0.414439
foo,one,0.83437,0.404799,1.151229,-0.471714
foo,two,0.189193,0.866867,-0.409345,1.608045
qux,two,-0.216466,-0.233188,2.275476,0.20337


# reshaping by melt

In [71]:
# Wide table: two name columns (first, last) plus two numeric measurements.
cheese = pd.DataFrame(dict(
    first=['john', 'mary'],
    last=['doe', 'bo'],
    height=[5.5, 6.0],
    weight=[130, 150],
))
cheese

Unnamed: 0,first,height,last,weight
0,john,5.5,doe,130
1,mary,6.0,bo,150


In [72]:
cheese.melt(id_vars=['first','last'])

Unnamed: 0,first,last,variable,value
0,john,doe,height,5.5
1,mary,bo,height,6.0
2,john,doe,weight,130.0
3,mary,bo,weight,150.0


In [73]:
# The cut function: an integer `bins` splits the range of the values into that
# many equal-width intervals.
ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
pd.cut(ages, 3)

[(9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (26.667, 43.333], (43.333, 60.0], (43.333, 60.0]]
Categories (3, interval[float64]): [(9.95, 26.667] < (26.667, 43.333] < (43.333, 60.0]]

In [74]:
c = pd.cut(ages , bins=[0 , 18,35,70])
c

[(0, 18], (0, 18], (0, 18], (0, 18], (18, 35], (18, 35], (18, 35], (35, 70], (35, 70]]
Categories (3, interval[int64]): [(0, 18] < (18, 35] < (35, 70]]

In [76]:
df[:3].unstack(0)

Unnamed: 0_level_0,A,A,B,B,B,B,A,A
Unnamed: 0_level_1,cat,cat,dog,dog,cat,cat,animal,animal
first,bar,baz,bar,baz,bar,baz,bar,baz
second,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
one,-0.388996,-0.435155,0.762423,-0.883818,-0.040458,0.424819,1.270001,-0.414439
two,-0.705756,,-0.32801,,-0.985353,,-0.890746,


In [77]:
# Cross tabulation: sample frame for the crosstab examples
# (column C contains one missing value).
df = pd.DataFrame(dict(
    A=[1, 2, 2, 2, 2],
    B=[3, 3, 4, 4, 4],
    C=[1, 1, np.nan, 1, 1],
))
df

Unnamed: 0,A,B,C
0,1,3,1.0
1,2,3,1.0
2,2,4,
3,2,4,1.0
4,2,4,1.0


In [78]:
pd.crosstab(df.A , df.B)
# each cell of the table counts how many rows have that (A, B) combination

B,3,4
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,3


In [80]:
pd.crosstab(df.A , df.B , normalize=True)
# the same counts, shown as fractions of the total number of rows

B,3,4
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.2,0.0
2,0.2,0.6


In [83]:
pd.crosstab(df.A , df.B , normalize='columns') # fractions computed within each column

B,3,4
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.5,0.0
2,0.5,1.0


In [84]:
# Sum the values of column C within each (A, B) cell; combinations that have
# no rows come out as NaN.
# The aggregation is given as the string 'sum' because passing the NumPy
# callable np.sum triggers a FutureWarning in recent pandas releases.
pd.crosstab(df.A, df.B, values=df.C, aggfunc='sum')

B,3,4
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,
2,1.0,2.0


In [85]:
# Sum column C per (A, B) cell, express each sum as a share of the grand
# total, and append the 'All' row and column of totals (margins).
# 'sum' is passed as a string because np.sum as aggfunc triggers a
# FutureWarning in recent pandas releases.
pd.crosstab(df.A, df.B, values=df.C, aggfunc='sum', normalize=True,
            margins=True)

B,3,4,All
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.25,0.0,0.25
2,0.25,0.5,0.75
All,0.5,0.5,1.0


# computing indicator or dummy variables

In [90]:
# Sample frame: a string `key` column and an integer `datal` column.
df = pd.DataFrame(dict(
    key=['b', 'b', 'a', 'c', 'a', 'b'],
    datal=[3, 2, 4, 5, 6, 4],
))
df

Unnamed: 0,datal,key
0,3,b
1,2,b
2,4,a
3,5,c
4,6,a
5,4,b


In [91]:
pd.get_dummies(df['key'])
# one indicator column per distinct key; the 1 in each row marks that row's key

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [93]:
dummies = pd.get_dummies(df['key'] , prefix='fei')
dummies

Unnamed: 0,fei_a,fei_b,fei_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [95]:
df[['datal']].join(dummies)

Unnamed: 0,datal,fei_a,fei_b,fei_c
0,3,0,1,0
1,2,0,1,0
2,4,1,0,0
3,5,0,0,1
4,6,1,0,0
5,4,0,1,0


In [None]:
# Indexing with a single column label returns a Series, while indexing with a
# list of column labels returns a DataFrame.
type(df['key']), type(df[['key', 'datal']])

In [96]:
values = np.random.randn(10)
values

array([-0.89359605, -2.26047275,  1.38986593,  0.12325602,  0.05729529,
        1.57019743, -1.15163513, -1.08947727,  0.16355597, -0.71310105])

In [97]:
bins=[0,.2,.4,.6,.8,1]
pd.get_dummies(pd.cut(values , bins))
# indicator of the interval each number falls into; numbers outside every
# interval get an all-zero row

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
5,0,0,0,0,0
6,0,0,0,0,0
7,0,0,0,0,0
8,1,0,0,0,0
9,0,0,0,0,0


In [99]:
df=pd.DataFrame({'A':['a','b','c'],
                'B':['c','c','b'],
                'C':[1,2,3]})
df

Unnamed: 0,A,B,C
0,a,c,1
1,b,c,2
2,c,b,3


In [100]:
pd.get_dummies(df)
# converts the categorical (string) columns into numeric indicator columns

Unnamed: 0,C,A_a,A_b,A_c,B_b,B_c
0,1,1,0,0,0,1
1,2,0,1,0,0,1
2,3,0,0,1,1,0


In [101]:
pd.get_dummies(df , columns=['A'])

Unnamed: 0,B,C,A_a,A_b,A_c
0,c,1,1,0,0
1,c,2,0,1,0
2,b,3,0,0,1


In [105]:
s = pd.Series(list('abcaa'))
pd.get_dummies(s , prefix='fei')

Unnamed: 0,fei_a,fei_b,fei_c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,1,0,0


In [103]:
df = pd.DataFrame({'A':list('aaaaa'),
                  'B':list('ababc')})
pd.get_dummies(df)

Unnamed: 0,A_a,B_a,B_b,B_c
0,1,1,0,0
1,1,0,1,0
2,1,1,0,0
3,1,0,1,0
4,1,0,0,1


In [104]:
pd.get_dummies(df , drop_first=True)
# drop_first removes the first indicator column of each prefix,
# which avoids collinearity among the indicator columns

Unnamed: 0,B_b,B_c
0,0,0
1,1,0
2,0,0
3,1,0
4,0,1


# factorizing values

In [106]:
x = pd.Series(['A','A',np.nan , 'B',3.14,np.inf])
x

0       A
1       A
2     NaN
3       B
4    3.14
5     inf
dtype: object

In [107]:
labels , uniques = pd.factorize(x)

In [108]:
labels

array([ 0,  0, -1,  1,  2,  3], dtype=int64)

In [109]:
uniques

Index(['A', 'B', 3.14, inf], dtype='object')

In [None]:
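# factorize enumerates the distinct values of its input
# labels holds the integer code assigned to each element
# uniques holds the original value that each code stands for
# NaN is encoded as -1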