# 1.处理缺失数据

In [1]:
import pandas as pd
import numpy as np
from pandas import  Series ,DataFrame

In [3]:
string_data = pd.Series(['aardvark','artichoke',np.nan,'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
string_data[0] = None
string_data.isnull()   #python内置的None值在对象数组中也可以作为NA

0     True
1    False
2     True
3    False
dtype: bool

# 滤除缺失数据

In [8]:
from numpy import nan as NA
data  = pd.Series([1,NA,3.5,NA,7])
data.dropna()         #可以通过pandas.notnull（isnull）或布尔索引的方法。
                      #对于一个Series，dropna返回一个仅含非空数据和索引值的Series

0    1.0
2    3.5
4    7.0
dtype: float64

等价于：

In [9]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

而对于DataFrame对象，你可能希望丢弃全NA或含有NA的行或列。dropna默认丢弃任何含有缺失值的行：

In [9]:
data = pd.DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
cleaned = data.dropna()
cleaned    #丢弃任何含有缺失值的行！！！ dropna（） 默认行！

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


传入how=‘all’将只丢弃值全为NA的那些行

In [17]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


用这种方式丢弃列，只需传入axis=1即可：

In [19]:
data.dropna(axis=1)

0
1
2
3


In [10]:
data[4]=NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [21]:
data.dropna(how='all',axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


另一个滤除DataFrame行的问题涉及时间序列数据。假设你只想留下一部分观测数据，可以用thresh参数实现此目的： thresh=n 表示保留数组中至少含有n个非NaN值的行 （axis=1 指列） 

In [11]:
df = pd.DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,-1.167835,-0.764654,-0.13189
1,0.042831,1.366958,0.80062
2,1.423962,-0.733941,0.609141
3,0.934688,-0.098745,-1.045301
4,-1.778235,-0.58241,0.978832
5,2.010499,-0.733379,0.648693
6,2.149601,-0.795679,-0.497854


In [12]:
df.iloc[:4,1] = NA
df.iloc[:2,2] = NA
df

Unnamed: 0,0,1,2
0,-1.167835,,
1,0.042831,,
2,1.423962,,0.609141
3,0.934688,,-1.045301
4,-1.778235,-0.58241,0.978832
5,2.010499,-0.733379,0.648693
6,2.149601,-0.795679,-0.497854


In [13]:
df.dropna()

Unnamed: 0,0,1,2
4,-1.778235,-0.58241,0.978832
5,2.010499,-0.733379,0.648693
6,2.149601,-0.795679,-0.497854


In [14]:
df.dropna(thresh=2)    #保留数组中至少含有2个非NaN值的行  （axis=1 列）

Unnamed: 0,0,1,2
2,1.423962,,0.609141
3,0.934688,,-1.045301
4,-1.778235,-0.58241,0.978832
5,2.010499,-0.733379,0.648693
6,2.149601,-0.795679,-0.497854


# 填充缺失数据

通过一个常数调用fillna就会将缺失值替换为那个常数值：

In [15]:
df.fillna(0)   #fillna(a) 将缺失值变为a

Unnamed: 0,0,1,2
0,-1.167835,0.0,0.0
1,0.042831,0.0,0.0
2,1.423962,0.0,0.609141
3,0.934688,0.0,-1.045301
4,-1.778235,-0.58241,0.978832
5,2.010499,-0.733379,0.648693
6,2.149601,-0.795679,-0.497854


若是通过一个字典调用fillna，就可以实现对不同的列填充不同的值：

In [16]:
df.fillna({1:0.5,2:0})  #将第二列的缺失值变为0.5，将第三列的缺失值变为0

Unnamed: 0,0,1,2
0,-1.167835,0.5,0.0
1,0.042831,0.5,0.0
2,1.423962,0.5,0.609141
3,0.934688,0.5,-1.045301
4,-1.778235,-0.58241,0.978832
5,2.010499,-0.733379,0.648693
6,2.149601,-0.795679,-0.497854


In [18]:
df.fillna(0,inplace=True)
df        #inplace=True  直接在源数据上修改，不返回新的对象

Unnamed: 0,0,1,2
0,-1.167835,0.0,0.0
1,0.042831,0.0,0.0
2,1.423962,0.0,0.609141
3,0.934688,0.0,-1.045301
4,-1.778235,-0.58241,0.978832
5,2.010499,-0.733379,0.648693
6,2.149601,-0.795679,-0.497854


对reindexing有效的那些插值方法也可用于fillna

In [19]:
df = pd.DataFrame(np.random.randn(6,3))
df

Unnamed: 0,0,1,2
0,0.311976,0.229054,-0.044451
1,0.569335,-0.970178,0.63456
2,0.996601,0.612195,0.942282
3,1.780071,1.882075,0.673648
4,-0.27068,-0.511819,1.321046
5,0.057962,1.323277,0.687834


In [21]:
df.iloc[2:,1] = NA
df.iloc[4:,2] = NA
df

Unnamed: 0,0,1,2
0,0.311976,0.229054,-0.044451
1,0.569335,-0.970178,0.63456
2,0.996601,,0.942282
3,1.780071,,0.673648
4,-0.27068,,
5,0.057962,,


In [22]:
# Forward fill: every missing value takes the last non-missing value above it
# in the same column. DataFrame.ffill() replaces fillna(method='ffill'), whose
# `method` keyword is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,0.311976,0.229054,-0.044451
1,0.569335,-0.970178,0.63456
2,0.996601,-0.970178,0.942282
3,1.780071,-0.970178,0.673648
4,-0.27068,-0.970178,0.673648
5,0.057962,-0.970178,0.673648


In [26]:
# limit caps how many consecutive missing values are forward-filled after each
# valid value; NaNs beyond that stay missing. ffill(limit=...) replaces the
# deprecated fillna(method='ffill', limit=...).
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.311976,0.229054,-0.044451
1,0.569335,-0.970178,0.63456
2,0.996601,-0.970178,0.942282
3,1.780071,-0.970178,0.673648
4,-0.27068,,0.673648
5,0.057962,,0.673648


In [27]:
data = pd.Series([1,NA,3,2,NA])
data.fillna(data.mean())   #mean（) 填充平均值，数的个数不包含缺失值 （1+3+2）/3 = 2

0    1.0
1    2.0
2    3.0
3    2.0
4    2.0
dtype: float64

# 2.数据转换

过滤、清理以及其他的转换工作

In [28]:
# Two-column demo frame for the duplicate-handling examples; the last row
# ('two', 4) repeats the row directly above it.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


DataFrame的duplicated方法返回一个布尔型Series，表示各行是否是重复行(前面出现过的行)

In [29]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [30]:
data.drop_duplicates()  # drop_duplicates returns a new DataFrame with the duplicated rows removed

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


也可以指定部分列进行重复项判断。假设我们还有一列值，且只希望根据k1过滤重复项

In [31]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [32]:
data.drop_duplicates(['k1'])  # 将k1列中重复的行全部删去

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


duplicated和drop_duplicates默认保留的是第一个出现的值组合。传入keep='last'则保留最后一个

In [33]:
data.drop_duplicates(['k1','k2'],keep='last') #结合‘k1’，‘k2’过滤重复值

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


# 利用函数或映射关系进行数据转换

In [34]:
# Food names with their weight in ounces; some names are capitalised
# inconsistently on purpose ('Pastrami', 'Bacon').
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'novalox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,novalox,6.0


In [40]:
# Lower-case food name -> animal the meat comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'novalox': 'salmon',
}

In [42]:
lowercased = data['food'].str.lower()
lowercased   # some food names are capitalised and others are not, so lower-case every value with Series.str.lower before matching it against the mapping

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8        novalox
Name: food, dtype: object

Series的map方法可以接受一个函数或含有映射关系的字典型对象

In [41]:
data['animal']=lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,novalox,6.0,salmon


也可以传入一个能够完成全部这些工作的函数：

In [43]:
data['food'].map(lambda x : meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

# 替换值    replace方法

In [47]:
data = pd.Series([1.,-999.,2.,-999.,-1000.,3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [48]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

如果希望一次性替换多个值，可以传入一个由待替换值组成的列表以及一个替换值：

In [49]:
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

要让每个值有不同的替换，可以传递一个替换列表：

In [50]:
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

传入的参数也可以是字典：

In [51]:
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

# 重命名轴索引    

轴标签也可以通过函数或者映射进行转换  map

In [63]:
# 3x4 frame holding the integers 0..11, with state names as row labels and
# ordinal words as column labels.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [72]:
transform = lambda x : x[:4].upper()
data.index.map(transform)    # x[:4].upper(): keep the first four characters of each row label and upper-case them

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [73]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


如果想要创建数据集的转换版（而不是修改原始数据），比较实用的方法是rename：

In [74]:
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


rename可以结合字典型对象实现对部分轴标签的更新：

In [75]:
data.rename(index={'OHIO':'INDIANA'},columns={'three':'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [77]:
data.rename(index={'OHIO':'INDIANA'},inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


# 离散化和面元划分 

In [79]:
ages = [20,22,25,27,21,23,37,31,61,45,41,32]

接下来，将这些数据划分为 (18, 25]、(25, 36]、(36, 60]、(60, 100] 四个面元（即下面bins列表中相邻边界构成的左开右闭区间）。要实现该功能，你需要使用pandas的cut函数；传入的数据必须是一维数组！！

In [80]:
bins = [18,25,36,60,100]
cats = pd.cut(ages,bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 36], (18, 25], ..., (25, 36], (60, 100], (36, 60], (36, 60], (25, 36]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 36] < (36, 60] < (60, 100]]

In [82]:
cats.codes #年龄数据的标签（索引）

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [83]:
cats.categories

IntervalIndex([(18, 25], (25, 36], (36, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [86]:
# Count how many ages fall into each bin produced by pd.cut. The top-level
# pd.value_counts() helper is deprecated in recent pandas; the Series method
# is its documented replacement and sorts by count in the same way.
pd.Series(cats).value_counts()

(18, 25]     5
(36, 60]     3
(25, 36]     3
(60, 100]    1
dtype: int64

跟‘区间’的数学符号一样，圆括号表示开端，方括号表示闭端（包含该端点）。哪边是闭端可以通过right = False进行修改：

In [87]:
pd.cut(ages,[18,26,36,61,100],right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

可以通过传递一个列表或数组到labels（标签），设置自己的面元名称：

In [89]:
group_names = ['Youth','YoungAdult','MiddleAged','Senior']
pd.cut(ages,bins,labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

如果向cut传入的是面元的数量而不是确切的面元边界，则它会根据数据的最小值和最大值计算等长面元。 数据的范围会扩大千分之一，以包含最大值和最小值

In [100]:
data = np.arange(1,21)
data     

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])

In [101]:
pd.cut(data,4) # an integer asks for 4 equal-width bins, each (max-min)/4 wide; precision= is not passed here, so the edges below use the default of 3 digits (pass precision=2 to round them to 2)

[(0.981, 5.75], (0.981, 5.75], (0.981, 5.75], (0.981, 5.75], (0.981, 5.75], ..., (15.25, 20.0], (15.25, 20.0], (15.25, 20.0], (15.25, 20.0], (15.25, 20.0]]
Length: 20
Categories (4, interval[float64]): [(0.981, 5.75] < (5.75, 10.5] < (10.5, 15.25] < (15.25, 20.0]]

qcut是一个非常类似于cut的函数，它可以根据样本分位数对数据进行面元划分，因此可以得到大小基本相等的面元，即每个面元里的数据的数量几乎相等

In [102]:
data = np.random.randn(1000)
data

array([-1.89586054e-01,  5.18403666e-01,  1.66838292e+00,  2.93822558e-01,
       -2.16757499e+00,  1.78751776e-01,  2.37000335e-01,  1.81266960e+00,
        2.21500397e+00, -1.22905282e+00, -1.34731236e+00,  1.30766421e+00,
        8.94527315e-01,  3.42031614e-01, -6.51953355e-01,  1.85827114e-01,
       -1.52184789e+00,  2.35630622e-01,  3.00284611e-01,  4.00709238e-01,
        2.18597140e-01,  7.85424349e-01,  1.19402610e+00, -5.34427257e-01,
        5.43374137e-01, -8.13930913e-01, -3.04058712e-02, -4.68118518e-02,
       -1.31760665e+00,  6.81476105e-01,  1.32242670e+00,  3.31572373e-01,
        1.54437513e+00,  1.48972318e-02,  6.13912095e-02,  7.28811311e-01,
        5.33136068e-01,  1.24109409e+00, -1.02325099e-01, -1.86252154e-01,
        2.27661670e-02, -3.83266038e-01, -4.75770036e-01, -1.01303271e+00,
       -9.47194711e-01, -6.96484653e-01, -3.53788223e-01, -4.07427298e-01,
        6.16599142e-01,  2.73596863e+00,  5.47647086e-01,  1.68215076e+00,
        9.06952189e-01, -

In [103]:
cats = pd.qcut(data,4)
cats  #为整数时，数组划成4等份

[(-0.703, -0.017], (-0.017, 0.624], (0.624, 2.974], (-0.017, 0.624], (-3.13, -0.703], ..., (-3.13, -0.703], (-3.13, -0.703], (-0.703, -0.017], (-0.703, -0.017], (0.624, 2.974]]
Length: 1000
Categories (4, interval[float64]): [(-3.13, -0.703] < (-0.703, -0.017] < (-0.017, 0.624] < (0.624, 2.974]]

In [104]:
# Number of observations per quartile bin. pd.value_counts() is deprecated in
# recent pandas, so go through the Series method instead.
pd.Series(cats).value_counts()

(0.624, 2.974]      250
(-0.017, 0.624]     250
(-0.703, -0.017]    250
(-3.13, -0.703]     250
dtype: int64

In [106]:
bins = [0.,0.1,0.5,0.9,1.]  #传入自定义分位数（0到1之间的数值，包含端点） 
cats = pd.qcut(data,bins)   # 将数组划分成，10%，40%  40%  10%
cats         

[(-1.229, -0.017], (-0.017, 1.247], (1.247, 2.974], (-0.017, 1.247], (-3.13, -1.229], ..., (-1.229, -0.017], (-1.229, -0.017], (-1.229, -0.017], (-1.229, -0.017], (-0.017, 1.247]]
Length: 1000
Categories (4, interval[float64]): [(-3.13, -1.229] < (-1.229, -0.017] < (-0.017, 1.247] < (1.247, 2.974]]

In [107]:
# Number of observations per custom-quantile bin. pd.value_counts() is
# deprecated in recent pandas, so go through the Series method instead.
pd.Series(cats).value_counts()

(-0.017, 1.247]     400
(-1.229, -0.017]    400
(1.247, 2.974]      100
(-3.13, -1.229]     100
dtype: int64

# 检测和过滤异常值

过滤或变换异常值（outlier）在很大程度上就是运用了数组运算。

In [121]:
data = pd.DataFrame(np.random.randn(1000,4))  #1000行*4列
data

Unnamed: 0,0,1,2,3
0,0.156205,-1.143472,-0.698653,-1.298870
1,0.648977,-0.061549,-0.855007,-1.046157
2,-0.720469,2.011366,0.014733,-0.236374
3,-1.119315,-0.890802,0.679814,-0.336281
4,0.388070,-0.628040,1.487291,-0.531122
5,-1.143595,-1.059647,-0.633158,-0.377176
6,1.477666,1.719351,-0.350120,-1.250499
7,0.431962,0.603804,-0.471232,0.698284
8,0.062763,0.006999,1.478366,1.272748
9,-0.049583,1.523948,0.381663,1.018653


In [109]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.059981,-0.002834,0.004169,0.027565
std,0.979052,1.023965,0.983217,0.989345
min,-3.618747,-3.641487,-2.987871,-3.142476
25%,-0.614123,-0.664489,-0.682086,-0.638401
50%,0.071196,0.000651,0.021014,0.038274
75%,0.748777,0.636979,0.707361,0.699747
max,3.855565,3.90345,3.785707,3.249242


假设想要找出第三列中绝对值大小超过3的值：

In [112]:
col = data[2]
col[np.abs(col)>3]    #np.abs()   布尔值型数组

237    3.785707
Name: 2, dtype: float64

要选出所有含有绝对值超过3的值（即大于3或小于-3）的行，可以在布尔型DataFrame上使用any方法

In [122]:
# Keep the rows in which at least one column has an absolute value above 3.
# The axis is passed by keyword because newer pandas versions no longer
# accept it as a positional argument to DataFrame.any().
data[(np.abs(data)>3).any(axis=1)]

Unnamed: 0,0,1,2,3
14,-0.335108,0.148364,-0.580511,-3.99834
475,1.536827,-3.35853,-0.244051,0.181424
588,0.828263,1.22151,3.009476,-1.714617


根据这些条件，就可以对值进行设置。下面的代码将值限制在区间-3到3以内：

In [123]:
data[np.abs(data)>3] = np.sign(data)*3   #np.sign()  取数字前的正负号：1,0，-1
data

Unnamed: 0,0,1,2,3
0,0.156205,-1.143472,-0.698653,-1.298870
1,0.648977,-0.061549,-0.855007,-1.046157
2,-0.720469,2.011366,0.014733,-0.236374
3,-1.119315,-0.890802,0.679814,-0.336281
4,0.388070,-0.628040,1.487291,-0.531122
5,-1.143595,-1.059647,-0.633158,-0.377176
6,1.477666,1.719351,-0.350120,-1.250499
7,0.431962,0.603804,-0.471232,0.698284
8,0.062763,0.006999,1.478366,1.272748
9,-0.049583,1.523948,0.381663,1.018653


In [124]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.012879,-0.012026,-0.0025,0.009087
std,1.013449,0.990771,0.966532,1.004825
min,-2.752348,-3.0,-2.997098,-3.0
25%,-0.729248,-0.701139,-0.681884,-0.668198
50%,-0.024977,-0.035903,-0.030435,-0.03197
75%,0.687659,0.616333,0.615162,0.714421
max,2.741458,2.911982,3.0,2.92812


In [125]:
np.sign(data).head()    #根据数值来取前面的正负号：-1,0，1

Unnamed: 0,0,1,2,3
0,1.0,-1.0,-1.0,-1.0
1,1.0,-1.0,-1.0,-1.0
2,-1.0,1.0,1.0,-1.0
3,-1.0,-1.0,1.0,-1.0
4,1.0,-1.0,1.0,-1.0


# 排列和随机采样

利用numpy.random.permutation函数可以轻松实现对Series或DataFrame的行的排列工作（permuting，随机重排序）。以需要排列的轴的长度为参数调用permutation，可以产生一个表示新顺序的整数数组：     （轴的长度！！！！！！）

In [132]:
df = pd.DataFrame(np.arange(5*4).reshape((5,4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [133]:
sampler = np.random.permutation(5) 
sampler    #长度为5 表示将行标签重新排列

array([3, 2, 1, 0, 4])

In [135]:
df.take(sampler)   # take函数使用该数组

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19


In [137]:
df.iloc[(sampler)]  #基于iloc索引使用该数组

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19


如果不想用替换的方式选取随机子集，可以在Series和DataFrame上使用sample方法：

In [142]:
df.sample(3)   #df.sample(n)  随机选取n行数组

Unnamed: 0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


要通过替换的方式产生样本（允许重复选择），可以传递replace =True到sample：

In [146]:
choices = pd.Series([5,7,-1,6,4])
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [152]:
draws = choices.sample(n=10,replace=True)
draws

4    4
2   -1
2   -1
4    4
3    6
2   -1
4    4
1    7
4    4
1    7
dtype: int64

# 计算指标/哑变量

如果DataFrame的某一列含有k个不同的值，则可以派生出一个k列矩阵（k个列）或DataFrame（其值全为1和0，1表示就是该值，0表示不是）。pd.get_dummies(列)可实现

In [153]:
df = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [154]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


你可能想给指标DataFrame的列加上一个前缀，以便能够和其他数据进行合并。get_dummies的prefix(前缀)参数可以实现该功能：

In [158]:
dummies = pd.get_dummies(df['key'],prefix = 'key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy         #df[['data1']] 是DataFrame 

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


如果DataFrame中的某行同属于多个分类，则事情就会有点复杂。

In [161]:
mnames = ['movie_id','title','genres']
movies = pd.read_table('datasets\\movielens\\movies.dat',sep='::',header=None,names=mnames)
movies

  
  


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [162]:
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


要为每个genre添加指标变量就需要做一些数据规整操作。首先，我们从数据集中抽取不同的genre值：

In [163]:
# Flatten the '|'-separated genre strings of every movie into one list.
all_genres = [genre for entry in movies.genres for genre in entry.split('|')]
# Distinct genres in order of first appearance. Series.unique() is used because
# passing a plain Python list to pd.unique() is deprecated in recent pandas.
genres = pd.Series(all_genres).unique()
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

构建指标DataFrame的方法之一就是从一个全零DataFrame开始：

In [165]:
zero_matrix = np.zeros((len(movies),len(genres)))
dummies = pd.DataFrame(zero_matrix,columns = genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


现在，迭代每一部电影，并将dummies各行的条目设为1。要这么做，我们使用dummies.columns来计算每个类型的列索引：

In [170]:
gen = movies.genres[0]
gen

"Animation|Children's|Comedy"

In [171]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [172]:
dummies.columns.get_indexer(gen.split('|'))   #get_indexer()  gen.split('|')在dummies.columns中对应的索引位置

array([0, 1, 2], dtype=int64)

然后，根据索引，使用iloc设定值：

In [174]:
# Walk through the movies in order and switch on the indicator columns that
# correspond to each movie's genres.
for i ,gen in enumerate(movies.genres):
    # Integer positions, within dummies.columns, of this movie's genre labels.
    indices = dummies.columns.get_indexer(gen.split('|'))
    # Row i is the current movie; set its genre columns to 1.
    dummies.iloc[i,indices] = 1

然后，和以前一样，再将其与movies合并起来：

In [175]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

一个对统计应用有用的秘诀是：结合get_dummies和诸如cut之类的离散化函数：

In [176]:
np.random.seed(12345)     # seed NumPy's global random generator so the values drawn below are the same on every run
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [177]:
bins = [0,0.2,0.4,0.6,0.8,1]
pd.cut(values,bins)

[(0.8, 1.0], (0.2, 0.4], (0.0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1.0], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, interval[float64]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [178]:
pd.get_dummies(pd.cut(values,bins))

   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           0           0           0           0           1
1           0           1           0           0           0
2           1           0           0           0           0
3           0           1           0           0           0
4           0           0           1           0           0
5           0           0           1           0           0
6           0           0           0           0           1
7           0           0           0           1           0
8           0           0           0           1           0
9           0           0           0           1           0

# 3.字符串操作

## 字符串对象方法

In [179]:
val = 'a,b,                           guido'
val.split(',')

['a', 'b', 'guido']

split常常与strip一起使用，以去除空白符（包括换行符）：

In [181]:
pieces = [x.strip( ) for x in val.split(',')]
pieces

['a', 'b', 'guido']