# numpy应用

In [1]:
import numpy as np
data=np.array([[0.95,-0.24,-0.88],[0.56,0.24,0.91]])
data.shape

(2L, 3L)

In [2]:
data.dtype

dtype('float64')

In [24]:
arr=np.arange(32).reshape((8,4))

In [3]:
arr[[1,5,7,2],[0,3,1,2]]  #前行后列，一一对应

array([ 4, 23, 29, 10])

In [6]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [7]:
arr[np.ix_([1,5,7,2],[0,3,1,2])]   #花式索引是把数据复制到新数组中

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

In [26]:
arr=np.arange(15).reshape(5,3)

#### 矩阵乘法

In [11]:
np.dot(arr.T,arr)   #np.dot计算矩阵内积

array([[270, 300, 330],
       [300, 335, 370],
       [330, 370, 410]])

In [27]:
arr.T.dot(arr)

array([[270, 300, 330],
       [300, 335, 370],
       [330, 370, 410]])

In [14]:
arr=np.random.normal(size=10)
arr

array([ 0.93900247, -0.80258431,  0.47803396,  0.55657205,  0.4044932 ,
        0.56018473,  0.06367156,  0.2403247 ,  0.24390483,  0.28434815])

### where用法

In [15]:
xarr=np.array([1.1,1.2,1.3,1.4,1.5])
yarr=np.array([2.1,2.2,2.3,2.4,2.5])
cond=np.array([True,False,True,True,False])
result=np.where(cond,xarr,yarr)
result

array([ 1.1,  2.2,  1.3,  1.4,  2.5])

In [28]:
arr=np.random.normal(size=15).reshape(3,5)
arr

array([[-0.34057242,  0.66974983, -1.05794062, -1.44723288, -2.87202593],
       [ 0.79639367,  2.2613375 ,  0.12207705,  0.02347452,  1.29988173],
       [-0.47632664,  0.10364284, -0.08923431, -1.30252043,  1.0643912 ]])

In [17]:
arr1=np.where(arr>0,2,-2)
arr1

array([[ 2,  2,  2,  2, -2],
       [ 2, -2, -2,  2,  2],
       [ 2,  2, -2, -2,  2]])

In [18]:
arr is arr1

False

In [19]:
arr.argmin()   #产生最小元素的索引

13

In [20]:
(arr>0).sum()  #统计正值的数量

10

In [21]:
arr.sort()   #是分行sort了
arr

array([[-0.62779865,  0.55193326,  0.59855108,  0.60392428,  0.78596751],
       [-0.70677743, -0.61346019,  0.53513425,  0.81996271,  0.9875806 ],
       [-1.33727187, -0.12228429,  0.36041556,  0.8648148 ,  1.10787973]])

In [22]:
arr.sort(0)   #0是分列sort

In [23]:
arr

array([[-1.33727187, -0.61346019,  0.36041556,  0.60392428,  0.78596751],
       [-0.70677743, -0.12228429,  0.53513425,  0.81996271,  0.9875806 ],
       [-0.62779865,  0.55193326,  0.59855108,  0.8648148 ,  1.10787973]])

### numpy.linalg

In [42]:
#包括inv pinv (计算矩阵的伪逆), qr(计算qr分解), svd计算奇异值分解
import numpy.linalg as npl
arr=np.random.normal(size=9).reshape(3,3)
mat=arr.T.dot(arr)
mat

array([[ 4.89659933, -0.43649922, -0.28833238],
       [-0.43649922,  0.94834401,  2.52791951],
       [-0.28833238,  2.52791951,  7.55872284]])

In [43]:
mat=arr.T.dot(arr)
mat.dot(npl.inv(mat))

array([[  1.00000000e+00,   7.42788400e-16,  -3.60734946e-16],
       [ -3.47733056e-17,   1.00000000e+00,  -3.14672276e-16],
       [ -2.90464710e-16,  -2.79685835e-15,   1.00000000e+00]])

In [52]:
nwalks=5000
nstep=1000
draws=np.random.randint(0,2,size=(nwalks,nstep))
steps=np.where(draws>0,1,-1)
walks=steps.cumsum(1)

In [53]:
walks.max()

145

In [54]:
walks.min()

-113

In [55]:
hist30=(np.abs(walks>=30)).any(1)   #any()用于检测数组中是否存在True, any(1)表示横向检测
hist30

array([False,  True,  True, ...,  True, False, False], dtype=bool)

In [56]:
hist30.sum()

1732

In [57]:
walks[hist30]

array([[ -1,  -2,  -1, ...,  50,  49,  48],
       [  1,   0,   1, ...,  44,  43,  44],
       [ -1,  -2,  -1, ...,  28,  29,  30],
       ..., 
       [  1,   0,   1, ...,  68,  67,  68],
       [  1,   2,   1, ..., -16, -17, -16],
       [  1,   2,   1, ...,   2,   3,   2]])

In [58]:
crossing_time=(np.abs(walks[hist30])>=30).argmax(1)
crossing_time

array([317, 323, 649, ..., 379, 115, 465], dtype=int64)

In [59]:
crossing_time.mean()

501.64318706697458

## 计算随机漫步的函数表达

In [70]:
def crossing_time(nwalks,ops,args=[2,1]):
    nstep=1000
    result=[]
    for op,n in zip(ops,args):
        draws=op(0,n,size=(nwalks,nstep))
        steps=np.where(draws>0,1,-1)
        walks=steps.cumsum(1)
        hist30=(np.abs(walks>=30)).any(1)   #any()用于检测数组中是否存在True, any(1)表示横向检测
        crossing_time=(np.abs(walks[hist30])>=30).argmax(1)
        crossing_time1=crossing_time.mean()
        result.append(crossing_time1)
    return result

In [71]:
ops=[np.random.randint,np.random.normal]
crossing_time(5000,ops)

[499.10710259301015, 502.26278659611995]

### 内包函数—随机漫步表达

In [91]:
def crossing_time(x):
    ops0=[np.random.randint,np.random.normal]
    ops1=[np.random.binomial,np.random.gamma]
    args0=[2,1]
    args1=[3,4]
    dictinaries={'00':(ops0,args0),'01':(ops1,args1)}
    result=[]
    ops,args=dictinaries[x]
    def close_run(nwalks):
        for op,n in zip(ops,args):
            draws=op(0,n,size=(nwalks[0],nwalks[1]))
            steps=np.where(draws>0,1,-1)
            walks=steps.cumsum(1)
            hist30=(np.abs(walks>=30)).any(1)   #any()用于检测数组中是否存在True, any(1)表示横向检测
            crossing_time=(np.abs(walks[hist30])>=30).argmax(1)
            crossing_time1=crossing_time.mean()
            result.append(crossing_time1)
        return result
    return close_run
   

In [92]:
aps=crossing_time('00')
aps([5000,1000])

[499.59649122807019, 495.58796025715958]

# pandas初步

In [94]:
import pandas as pd
obj=pd.Series([4,5,7,3],index=['d','c','a','b'])
obj

d    4
c    5
a    7
b    3
dtype: int64

In [99]:
sdata={'o':3500,'t':7100,'s':1600}
obj1=pd.Series(sdata)
obj1

o    3500
s    1600
t    7100
dtype: int64

In [104]:
states=['a','o','t']
obj2=pd.Series(sdata,index=states)
obj2

a       NaN
o    3500.0
t    7100.0
dtype: float64

#### 给目录栏命名

In [105]:
obj2.index.name='state'
obj2
#obj2['state']是错误的
# obj2[obj2.index[1]]   结果为3500

state
a       NaN
o    3500.0
t    7100.0
dtype: float64

#### 更改目录的值

In [112]:
obj2.index=['bob','steve','jeff']
obj2

bob         NaN
steve    3500.0
jeff     7100.0
dtype: float64

## DataFrame

# reindex

In [114]:
obj3=pd.Series(['blue','purple','yellow'],index=[0,2,4])
obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [63]:
frame=pd.DataFrame(np.arange(9).reshape(3,3),index=['a','c','b'],columns=['u','p','s'])
frame

Unnamed: 0,u,p,s
a,0,1,2
c,3,4,5
b,6,7,8


In [48]:
frame.index.name='names'  #照样无法访问
frame.reset_index()

Unnamed: 0,names,u,p,s
0,a,0,1,2
1,c,3,4,5
2,b,6,7,8


In [49]:
frame.set_index(['u','p'])

Unnamed: 0_level_0,Unnamed: 1_level_0,s
u,p,Unnamed: 2_level_1
0,1,2
3,4,5
6,7,8


In [64]:
frame.reindex(['a','c','d','b'])  #重新将names变为index

Unnamed: 0,u,p,s
a,0.0,1.0,2.0
c,3.0,4.0,5.0
d,,,
b,6.0,7.0,8.0


In [65]:
frame.fillna(method='ffill')  #d没被填充值
# frame.fillna?

Unnamed: 0,u,p,s
a,0,1,2
c,3,4,5
b,6,7,8


In [68]:
frame.loc['d','u']=15   #一行必须要有一个值后才能填充缺省值
frame1=frame.fillna(method='ffill')   #fillna之后必须重命名，否则frame没有填充缺省值
frame.fillna(method='ffill') 

Unnamed: 0,u,p,s
a,0.0,1.0,2.0
c,3.0,4.0,5.0
b,6.0,7.0,8.0
d,15.0,7.0,8.0


In [69]:
frame.sort_index()

Unnamed: 0,u,p,s
a,0.0,1.0,2.0
b,6.0,7.0,8.0
c,3.0,4.0,5.0
d,15.0,,


In [74]:
frame2=frame.dropna()   #dropna之后也必须重命名，否则frame没变
frame2

Unnamed: 0,u,p,s
a,0.0,1.0,2.0
c,3.0,4.0,5.0
b,6.0,7.0,8.0


In [70]:
frame1.sort_index()

Unnamed: 0,u,p,s
a,0.0,1.0,2.0
b,6.0,7.0,8.0
c,3.0,4.0,5.0
d,15.0,7.0,8.0


#### 修改columns名称

In [144]:
obj3=pd.DataFrame({'color':['blue','purple','yellow'],'name':['a','b','c']},index=['d','e','f'])
obj3

Unnamed: 0,color,name
d,blue,a
e,purple,b
f,yellow,c


In [141]:
obj3.columns={'cs','up'}

In [142]:
obj3

Unnamed: 0,cs,up
d,blue,a
e,purple,b
f,yellow,c


#### DataFrame的drop

In [1]:
import pandas as pd
import numpy as np
obj=pd.DataFrame(np.arange(16).reshape(4,4),index=['a','b','c','d'],columns=['one','two','three','four'])
new_obj=obj.drop('c')
new_obj

Unnamed: 0,one,two,three,four
a,0,1,2,3
b,4,5,6,7
d,12,13,14,15


In [11]:
new_obj.drop(['two','four'],axis=1)     #加上axis=1非常重要

Unnamed: 0,one,three
a,0,2
b,4,6
d,12,14


In [12]:
obj[:2]   #对行进行切片

Unnamed: 0,one,two,three,four
a,0,1,2,3
b,4,5,6,7


In [21]:
obj.iloc[2]  #iloc通过行号获取数据

one       8
two       9
three    10
four     11
Name: c, dtype: int32

In [15]:
obj[obj.three>5]

Unnamed: 0,one,two,three,four
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [17]:
obj.loc[['a','c'],['four','one','two']]   #取对应行，对应列

Unnamed: 0,four,one,two
a,3,0,1
c,11,8,9


In [19]:
obj.loc[obj.three>5]    #loc里面也可以是逻辑表达

Unnamed: 0,one,two,three,four
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


#### 成批修改DataFrame的值

In [22]:
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
    s.set_value(label, value+2)   #值全部加2
s.head()                          #看头几个元素

0    217
1    298
2    473
3    131
4    569
dtype: int32

### DataFrame的合并

In [76]:
staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR'},
                         {'Name': 'Sally', 'Role': 'Course liasion'},
                         {'Name': 'James', 'Role': 'Grader'}])
staff_df = staff_df.set_index('Name')
student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business'},
                           {'Name': 'Mike', 'School': 'Law'},
                           {'Name': 'Sally', 'School': 'Engineering'}])
student_df = student_df.set_index('Name')
print(staff_df.head())
print()
print(student_df.head())
pd.merge(staff_df, student_df, how='outer', left_index=True, right_index=True)   #how表示 outer:一边有就行；inner:两边都得有；left:以左为标准；right:以右为准
#pd.merge(staff_df, student_df, how='inner', left_on=['First Name'], right_on=['First Name'])  #没有目录时left_on right_on表示合并的项

                 Role
Name                 
Kelly  Director of HR
Sally  Course liasion
James          Grader
()
            School
Name              
James     Business
Mike           Law
Sally  Engineering


Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Kelly,Director of HR,
Mike,,Law
Sally,Course liasion,Engineering


In [2]:
df1=pd.DataFrame(np.arange(12).reshape(3,4),columns=list('abcd'))
df2=pd.DataFrame(np.arange(20).reshape(4,5),columns=list('abcde'))
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [80]:
df3=df1.add(df2,fill_value=0)
df3

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [82]:
df2.add(df1,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


### 对dataframe应用函数

In [83]:
f=lambda x: x.max()-x.min()
df3.apply(f)

a    18.0
b    18.0
c    18.0
d    18.0
e    15.0
dtype: float64

In [84]:
df3.apply(f,axis=1)

0     6.0
1     6.0
2    10.0
3     4.0
dtype: float64

In [90]:
def f(x):
    u=pd.Series([x.min(),x.max()],index=['min','max'])
    return u
df3.apply(f)

Unnamed: 0,a,b,c,d,e
min,0.0,2.0,4.0,6.0,4.0
max,18.0,20.0,22.0,24.0,19.0


In [91]:
df3.apply(f,axis=1)

Unnamed: 0,min,max
0,0.0,6.0
1,9.0,15.0
2,14.0,24.0
3,15.0,19.0


#### 对df排序排名

In [92]:
df3.set_index('a')

Unnamed: 0_level_0,b,c,d,e
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,2.0,4.0,6.0,4.0
9.0,11.0,13.0,15.0,9.0
18.0,20.0,22.0,24.0,14.0
15.0,16.0,17.0,18.0,19.0


In [94]:
df3.sort_values(by=['b','c'])   #sort_values对值排序，sort_index对目录排序

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
3,15.0,16.0,17.0,18.0,19.0
2,18.0,20.0,22.0,24.0,14.0


In [97]:
df3.b.rank()    #返回的是排名

0    1.0
1    2.0
2    4.0
3    3.0
Name: b, dtype: float64

In [98]:
df3.rank(axis=1)

Unnamed: 0,a,b,c,d,e
0,1.0,2.0,3.5,5.0,3.5
1,1.5,3.0,4.0,5.0,1.5
2,2.0,3.0,4.0,5.0,1.0
3,1.0,2.0,3.0,4.0,5.0


# 计算df两列的相关系数

In [101]:
df3.b.corr(df3.c)   #计算两列的相关系数

0.9980665240246237

In [103]:
df3.iloc[2].corr(df3.iloc[3])

-0.16439898730535726

In [106]:
df3.corrwith(df3.b)   #针对各列计算相关系数值

a    0.998220
b    1.000000
c    0.998067
d    0.992001
e    0.848235
dtype: float64

In [107]:
df3.corrwith(df2)  #计算按列匹配的相关系数

a    0.878310
b    0.848235
c    0.813676
d    0.774597
e    1.000000
dtype: float64

In [114]:
df3.corrwith(df2,axis=1)  #计算按行匹配的相关系数

0    0.832050
1    0.242536
2   -0.164399
3    1.000000
dtype: float64

In [109]:
df2

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [115]:
df3

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


### Series中值计数

In [116]:
df3.iloc[0].value_counts()   #计算各值出现的频率

4.0    2
6.0    1
2.0    1
0.0    1
Name: 0, dtype: int64

In [119]:
df3.e.isin([4,9])

0     True
1     True
2    False
3    False
Name: e, dtype: bool

In [120]:
data=pd.DataFrame({'qu1':[1,3,4,3,4],
                  'qu2':[2,3,1,2,3],
                  'qu3':[1,4,2,4,15]})
result=data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,qu1,qu2,qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
15,0.0,0.0,1.0


# 缺失数据处理

In [3]:
df4=df1+df2
df4

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [122]:
df4.dropna(axis=1,how='all')  # how='all' 可以只删除那些全为NA的行

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,6.0
1,9.0,11.0,13.0,15.0
2,18.0,20.0,22.0,24.0
3,,,,


In [123]:
df4.dropna(how='all')

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,


In [141]:
df4.c[0]=np.nan
df4

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [127]:
df4.fillna({'c':100,'e':18})   #通过字典可以对指定的列填入指定的值

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,18.0
1,9.0,11.0,100.0,15.0,18.0
2,18.0,20.0,22.0,24.0,18.0
3,,,100.0,,18.0


In [128]:
df5=df4.copy()
df5.e[0]=24
df5.fillna(method='ffill',limit=2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,24.0
1,9.0,11.0,4.0,15.0,24.0
2,18.0,20.0,22.0,24.0,24.0
3,18.0,20.0,22.0,24.0,


In [129]:
df4.fillna({'e':df4.d.mean()})

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,15.0
1,9.0,11.0,,15.0,15.0
2,18.0,20.0,22.0,24.0,15.0
3,,,,,15.0


### 高级fillna技巧

In [4]:
df4

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [5]:
df5=df4.copy()
i=range(5)
for ik in i:
    s=df5.columns[ik]
    df5.fillna({s:df5[s].mean()},inplace=True)
df5

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,9.0,11.0,13.0,15.0,


In [12]:
df6=df5.copy()
df6.iloc[2]['c','a']=np.nan
df6

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,,20.0,,24.0,
3,9.0,11.0,13.0,15.0,


In [27]:
#横向填充缺失值
j=range(4)
for jk in j:
    s=df6.iloc[jk].isnull()
    df6.iloc[jk][s]=df6.iloc[jk].mean()
df6

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,3.0
1,9.0,11.0,13.0,15.0,12.0
2,22.0,20.0,22.0,24.0,22.0
3,9.0,11.0,13.0,15.0,12.0


In [29]:
df6=df5.copy()
df6.iloc[2]['c','a']=np.nan
df6.iloc[1]['c']=np.nan
df6

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,,15.0,
2,,20.0,,24.0,
3,9.0,11.0,13.0,15.0,


In [31]:
#分析'c'是缺失值时'a','b'的值
df6[['a','b']][df6['c'].isnull()==True]

Unnamed: 0,a,b
1,9.0,11.0
2,,20.0


In [33]:
df6

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,,15.0,
2,,20.0,,24.0,
3,9.0,11.0,13.0,15.0,


In [32]:
df6['c'].interpolate()   #使用插值来估计NaN

0     4.0
1     7.0
2    10.0
3    13.0
Name: c, dtype: float64

In [34]:
df6.interpolate?

In [36]:
df6.fillna(df6.mean())    #用各列平均值填充各列缺失值

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,8.5,15.0,
2,6.0,20.0,8.5,24.0,
3,9.0,11.0,13.0,15.0,


## 数据变换高级技巧

In [None]:
#将字符串变成对应数字表示
houseprice['MSZoning']=houseprice['MSZoning'].map({'RL':1,'RM':2,'RR':3,}).astype(int)  

In [68]:
df7=df6.copy()
df7['f']=['a','b',1,2]
u=df7.dtypes[df7.dtypes!="object"].index   #把内容为数值的特征列找出来
u

Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')

In [67]:
df7[u]    #df7=df7[u],将df7变为只有数值的特征列，方便进行下面df.columns[df7.mean()>10]的操作

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,,15.0,
2,,20.0,,24.0,
3,9.0,11.0,13.0,15.0,


In [44]:
df7.dtypes

a    float64
b    float64
c    float64
d    float64
e    float64
f     object
dtype: object

In [49]:
df7['f'].dtypes

dtype('O')

In [50]:
df7.dtypes.index

Index([u'a', u'b', u'c', u'd', u'e', u'f'], dtype='object')

### 对符合要求的相关列进行变换

In [None]:
#下面几行代码将偏斜度大于0.75的数值列做一个log转换，使之尽量符合正态分布，因为很多模型的假设数据是服从正态分布的
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index    #取出对应的index, 方便在all_data上进行操作
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])   #变化是log(1+x)

In [60]:
df7=df7.drop('f',axis=1)   #去除无关的列，因为df7.mean()>10只能产生abcde五个列，
#而df7有abcdef六个列。将不能进行df.columns[df7.mean()>10]的操作
df7

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,,15.0,
2,,20.0,,24.0,
3,9.0,11.0,13.0,15.0,


In [65]:
s=df7.columns[df7.mean()>10]
df7[s]=np.log1p(df7[s])   #变化是log(1+x)
df7

Unnamed: 0,a,b,c,d,e
0,0.0,1.098612,4.0,1.94591,
1,9.0,2.484907,,2.772589,
2,,3.044522,,3.218876,
3,9.0,2.484907,13.0,2.772589,


### 对符合要求的相关行进行变换

#### 其一

In [69]:
df7=df6.copy()
df7

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,,15.0,
2,,20.0,,24.0,
3,9.0,11.0,13.0,15.0,


In [72]:
df8=df7[df7>=df7.mean()]
df8

Unnamed: 0,a,b,c,d,e
0,,,,,
1,9.0,11.0,,15.0,
2,,20.0,,24.0,
3,9.0,11.0,13.0,15.0,


In [75]:
us=df8.isnull().sum(1).argmin()
us

3

In [76]:
df8.iloc[us]=np.log1p(df8.iloc[us])
df8

Unnamed: 0,a,b,c,d,e
0,,,,,
1,9.0,11.0,,15.0,
2,,20.0,,24.0,
3,2.302585,2.484907,2.639057,2.772589,


#### 其二

In [82]:
df7.mean(1)>10

0    False
1     True
2     True
3     True
dtype: bool

In [97]:
df9=df7[df7.mean(1)>10]
df10=df9.copy()
df9

Unnamed: 0,a,b,c,d,e
1,9.0,11.0,,15.0,
2,,20.0,,24.0,
3,9.0,11.0,13.0,15.0,


In [111]:
df7.loc[df7.mean(1)>10]

Unnamed: 0,a,b,c,d,e
1,9.0,11.0,,15.0,
2,,20.0,,24.0,
3,9.0,11.0,13.0,15.0,


In [87]:
df9=np.log1p(df9)
df9

Unnamed: 0,a,b,c,d,e
1,2.302585,2.484907,,2.772589,
2,,3.044522,,3.218876,
3,2.302585,2.484907,2.639057,2.772589,


In [94]:
df11=df10.div(df10.mean(1),axis=0)
df11

Unnamed: 0,a,b,c,d,e
1,0.771429,0.942857,,1.285714,
2,,0.909091,,1.090909,
3,0.75,0.916667,1.083333,1.25,


#### 关于df在删除特定行后，iloc和loc的小试验

In [104]:
df10

Unnamed: 0,a,b,c,d,e
1,9.0,11.0,,15.0,
2,,20.0,,24.0,
3,9.0,11.0,13.0,15.0,


In [103]:
df10.iloc[1]   #iloc的索引始终是从0开始，逐一往下加的

a     NaN
b    20.0
c     NaN
d    24.0
e     NaN
Name: 2, dtype: float64

In [107]:
df10.index    #df10的Index是返回的dataframe的标号

Int64Index([1, 2, 3], dtype='int64')

In [110]:
# df7.loc[df7.mean(1)>10]  方式一
# df7[df7.mean(1)>10]      方式二
# 这两种方式都可以选取特定的行。所以loc或直接与选行对应，loc也与index对应。  对列的选取，用的是s=df7.columns[df7.mean()>10]  df7[s]  s是列名
df7.loc[df10.index[:2]]  #df10.loc[index] 可访问原数据的特定的行。元数据是df7, 索引从0开始

Unnamed: 0,a,b,c,d,e
1,9.0,11.0,,15.0,
2,,20.0,,24.0,


In [99]:
df12=df10.div(pd.Series([9,9,20,11]),axis=0)   #在div时
df12

Unnamed: 0,a,b,c,d,e
0,,,,,
1,1.0,1.222222,,1.666667,
2,,1.0,,1.2,
3,0.818182,1.0,1.181818,1.363636,


# 层次化索引

In [1]:
import numpy as np
import pandas as pd
data=pd.Series(np.random.normal(size=10),index=[['a','a','a','b','b','b','c','c','d','d'],
                                               [1,2,3,1,2,3,1,2,2,3]])
data

a  1    1.120635
   2   -0.511683
   3    0.619259
b  1   -0.213650
   2    1.685527
   3    0.192229
c  1   -0.627336
   2    0.541256
d  2    0.101995
   3   -1.898284
dtype: float64

In [2]:
data.index

MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [4]:
data['b':'c']

b  1   -0.213650
   2    1.685527
   3    0.192229
c  1   -0.627336
   2    0.541256
dtype: float64

In [6]:
data[['b','d']]

b  1   -0.213650
   2    1.685527
   3    0.192229
d  2    0.101995
   3   -1.898284
dtype: float64

In [7]:
data[:,2]

a   -0.511683
b    1.685527
c    0.541256
d    0.101995
dtype: float64

In [8]:
data.unstack()  #大索引做了行索引   unstack用法

Unnamed: 0,1,2,3
a,1.120635,-0.511683,0.619259
b,-0.21365,1.685527,0.192229
c,-0.627336,0.541256,
d,,0.101995,-1.898284


In [9]:
frame=pd.DataFrame(np.arange(12).reshape(4,3),index=[['a','a','b','b'],[1,2,1,2]],
                  columns=[['ohio','ohio','colorado'],['green','red','green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,ohio,ohio,colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,green,red,green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [10]:
frame['ohio']

Unnamed: 0,Unnamed: 1,green,red
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [11]:
frame.swaplevel(0,1)

Unnamed: 0_level_0,Unnamed: 1_level_0,ohio,ohio,colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,green,red,green
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [13]:
frame.swaplevel(0,1).sort_index(level=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,ohio,ohio,colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,green,red,green
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [26]:
#可对columns排序
frame.sort_index(level='color',axis=1)  #对columns命名之后可指定level='color', 一定要加上axis=1

Unnamed: 0_level_0,state,colorado,ohio,ohio
Unnamed: 0_level_1,color,green,green,red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


In [14]:
frame.sum(level=1)

Unnamed: 0_level_0,ohio,ohio,colorado
Unnamed: 0_level_1,green,red,green
1,6,8,10
2,12,14,16


In [15]:
frame.index.names=['key1','key2']
frame.columns.names=['state','color']
frame

Unnamed: 0_level_0,state,ohio,ohio,colorado
Unnamed: 0_level_1,color,green,red,green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [22]:
frame.sum(level='color',axis=1) #先给columns命名，然后level='color'选取Level,再指定axis

Unnamed: 0_level_0,color,green,red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [24]:
frame.sum(level='state',axis=1)

Unnamed: 0_level_0,state,colorado,ohio
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,5,7
b,1,8,13
b,2,11,19
