# 第七章 数据清洗和准备

## 7.1处理缺失数据

In [3]:
import pandas as pd
import numpy as np

In [4]:
string_data=pd.Series(['aardvark','artichoke',np.nan,'avocado'])

In [5]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [6]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
string_data[0]=None#Python内置的None值在对象数组中也被认为是NA

In [8]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [9]:
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [10]:
string_data.dropna()#丢弃缺失值

1    artichoke
3      avocado
dtype: object

In [11]:
string_data.fillna(0)#使用指定值或插值的方法填充缺失值

0            0
1    artichoke
2            0
3      avocado
dtype: object

In [12]:
string_data.notnull()

0    False
1     True
2    False
3     True
dtype: bool

In [13]:
#滤除缺失数据

In [14]:
from numpy import nan as NA

In [15]:
data=pd.Series([1,NA,3.5,NA,7])

In [16]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [17]:
data.dropna()#drop方法丢弃缺失值

0    1.0
2    3.5
4    7.0
dtype: float64

In [18]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [19]:
#对DataFrame对象，可能是丢弃含有缺失值或全是缺失值的行或者列
#但dropna默认是丢弃含有缺失值的行

In [20]:
data=pd.DataFrame([[1.,6.5,3],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])

In [21]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [22]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [23]:
data.dropna(how="all")#传入参数how='all'将只丢弃全为NA的行

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [24]:
#丢弃列，指定axis=1

In [25]:
data[4]=NA

In [26]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [27]:
data.dropna(axis=1,how='all')#指定axis=1，即是以列为轴进行丢弃，丢弃方法是丢掉全是NA的列

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [28]:
#对于时间序列数据，只想留下一部分观测数据，用thresh参数实现
df=pd.DataFrame(np.random.randn(7,3))

In [29]:
df.iloc[:4,1]=NA

In [30]:
df.iloc[:2,2]=NA

In [31]:
df

Unnamed: 0,0,1,2
0,0.717393,,
1,-0.420779,,
2,0.341412,,0.925585
3,-1.017883,,1.244165
4,-0.628376,0.650236,-0.712094
5,1.126629,-0.076719,-0.721665
6,0.257575,-0.659439,-1.112701


In [32]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.628376,0.650236,-0.712094
5,1.126629,-0.076719,-0.721665
6,0.257575,-0.659439,-1.112701


In [33]:
df.dropna(thresh=2)#指定删除第三列含有NA值的行

Unnamed: 0,0,1,2
2,0.341412,,0.925585
3,-1.017883,,1.244165
4,-0.628376,0.650236,-0.712094
5,1.126629,-0.076719,-0.721665
6,0.257575,-0.659439,-1.112701


In [34]:
#填充缺失值
df.fillna(0)#使用0填充缺失值

Unnamed: 0,0,1,2
0,0.717393,0.0,0.0
1,-0.420779,0.0,0.0
2,0.341412,0.0,0.925585
3,-1.017883,0.0,1.244165
4,-0.628376,0.650236,-0.712094
5,1.126629,-0.076719,-0.721665
6,0.257575,-0.659439,-1.112701


In [35]:
#通过字典调用fillna，可以对不同列填充不同的值
df.fillna({1:5,2:0,1:1})#注意字典虽然是无序的，但程序的执行是分先后的

Unnamed: 0,0,1,2
0,0.717393,1.0,0.0
1,-0.420779,1.0,0.0
2,0.341412,1.0,0.925585
3,-1.017883,1.0,1.244165
4,-0.628376,0.650236,-0.712094
5,1.126629,-0.076719,-0.721665
6,0.257575,-0.659439,-1.112701


In [36]:
df123=df.fillna(0,inplace=True)#直接在现有对象基础上进行修改

In [37]:
df

Unnamed: 0,0,1,2
0,0.717393,0.0,0.0
1,-0.420779,0.0,0.0
2,0.341412,0.0,0.925585
3,-1.017883,0.0,1.244165
4,-0.628376,0.650236,-0.712094
5,1.126629,-0.076719,-0.721665
6,0.257575,-0.659439,-1.112701


In [38]:
df123#使用inplace并没有新建对象

In [39]:
df=pd.DataFrame(np.random.randn(6,3))

In [40]:
df

Unnamed: 0,0,1,2
0,-0.025153,-0.27025,-0.611682
1,0.754418,-1.040958,0.963855
2,-0.791395,1.244439,1.584763
3,-0.293574,-1.277524,-0.026918
4,-1.003203,-0.283532,1.014392
5,2.37734,0.333814,-0.731917


In [41]:
df.iloc[2:,1]

2    1.244439
3   -1.277524
4   -0.283532
5    0.333814
Name: 1, dtype: float64

In [42]:
df.iloc[2:4,1]=NA

In [43]:
df.iloc[3:4,2]=NA

In [44]:
df

Unnamed: 0,0,1,2
0,-0.025153,-0.27025,-0.611682
1,0.754418,-1.040958,0.963855
2,-0.791395,,1.584763
3,-0.293574,,
4,-1.003203,-0.283532,1.014392
5,2.37734,0.333814,-0.731917


In [45]:
df.fillna(method='ffill')#ffill用缺失值上一行的数据填充

Unnamed: 0,0,1,2
0,-0.025153,-0.27025,-0.611682
1,0.754418,-1.040958,0.963855
2,-0.791395,-1.040958,1.584763
3,-0.293574,-1.040958,1.584763
4,-1.003203,-0.283532,1.014392
5,2.37734,0.333814,-0.731917


In [46]:
df.fillna(method='bfill') #用空缺值下边的值填充

Unnamed: 0,0,1,2
0,-0.025153,-0.27025,-0.611682
1,0.754418,-1.040958,0.963855
2,-0.791395,-0.283532,1.584763
3,-0.293574,-0.283532,1.014392
4,-1.003203,-0.283532,1.014392
5,2.37734,0.333814,-0.731917


In [47]:
df[aixs=0,6]=[1,2,3]#怎么指定用左边的值填充？

SyntaxError: invalid syntax (<ipython-input-47-cb24d34fa3ce>, line 1)

In [48]:
df.fillna(method='ffill',limit=1)#limit=1限定了只能使用上边的值填充一次

Unnamed: 0,0,1,2
0,-0.025153,-0.27025,-0.611682
1,0.754418,-1.040958,0.963855
2,-0.791395,-1.040958,1.584763
3,-0.293574,,1.584763
4,-1.003203,-0.283532,1.014392
5,2.37734,0.333814,-0.731917


In [49]:
data=pd.Series([1,NA,3.5,NA,7])

In [50]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [51]:
data.fillna(data.mean())#用均值填充

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [52]:
data.fillna(data.max())#用最大值填充

0    1.0
1    7.0
2    3.5
3    7.0
4    7.0
dtype: float64

In [53]:
data.fillna(data.median())#用中位数填充

0    1.0
1    3.5
2    3.5
3    3.5
4    7.0
dtype: float64

In [54]:
data.describe()

count    3.000000
mean     3.833333
std      3.013857
min      1.000000
25%      2.250000
50%      3.500000
75%      5.250000
max      7.000000
dtype: float64

In [55]:
data.fillna(data.std())#用方差填充

0    1.000000
1    3.013857
2    3.500000
3    3.013857
4    7.000000
dtype: float64

In [56]:
data.fillna(data.skew())#用偏度填充

0    1.000000
1    0.491613
2    3.500000
3    0.491613
4    7.000000
dtype: float64

In [57]:
data.fillna(data.kurt())#用峰度填充#为什么会是NaN？？

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

## 7.2数据转换

### 移除重复数据

In [59]:
data=pd.DataFrame({'k1':['one','two']*3+['two'],'k2':[1,1,2,3,3,4,4]})

In [60]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [63]:
data.duplicated()#duplicated方法返回各行是否是重复行，不重复则为False

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [64]:
data.drop_duplicates()#丢弃掉重复行的方法

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [66]:
#以上两个方法默认判断全部列，也可以指定列
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [67]:
data['v1']=range(7)#增加列并赋值

In [68]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [69]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [70]:
data.drop_duplicates(['k1','k2'],keep='last')#按多列去重，默认会保留第一个出现的值对
#参数keep='last'表示保留最后一个出现的值对

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### 利用函数或映射进行数据转换

In [74]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
 'Pastrami', 'corned beef', 'Bacon',
 'pastrami', 'honey ham', 'nova lox'],
 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [75]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [76]:
#编写映射，就是个字典
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

In [79]:
#Series的map方法
lowercased=data['food'].str.lower()

In [80]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [81]:
data['animal']=lowercased.map(meat_to_animal)#map方法调用定义好的映射,从字典中取相应的值，并赋值给data的新列

In [82]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [83]:
#一步完成以上工作
data['food'].map(lambda x: meat_to_animal[x.lower()])
#

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [None]:
#map是一种实现元素级转换以及许多其他数据清理工作的便捷方法。

### 替换值

In [85]:
data=pd.Series([1,-999,2,-999,-1000,3])

In [86]:
data

0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64

In [87]:
data.replace(-999,np.nan)#使用replace函数将值A替换为值B

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [88]:
data.replace([-999,-1000],np.nan)#一次替换多个值

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [89]:
data.replace([-999,-1000],[np.nan,0])#传递两个等长列表，将列表1的值分别替换成列表2的值

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [90]:
data.replace({-999:np.nan,-1000:0})#当然还可以传入字典

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 重命名轴索引

In [91]:
data=pd.DataFrame(np.arange(12).reshape((3,4)),
                 index=['Ohio','Colorado','New York'],
                 columns=['one','two','three','four'])

In [92]:
#轴索引的map方法
transform=lambda x:x[:4].upper()#把x的前四个字母变成大写？

In [93]:
data.index.map(transform)
#map()可以以函数为参数，相当于自定义的函数，都可以通过map()来对对象使用了
#类似的好像还有apply()吧？记得不是很清楚了

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [94]:
data.index=data.index.map(transform)

In [96]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [97]:
data.rename(index=str.title,columns=str.upper)
#rename方法创建数据集的转换版，而不是修改原始数据

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [99]:
data.rename?? #[('copy', True),('inplace', False), ('level', None)] 查看代码可见其相关参数设置

In [100]:
data.rename(index={'Ohio':'INDIANA'},inplace=True)#OHIO 开始是小写，没有替换

In [101]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [103]:
data.rename(index={'OHIO':'INDIANA'},inplace=True)

In [104]:
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [105]:
#离散化和面元划分
#连续数据的离散化——拆分为面元(bin)
ages=[20,22,25,27,21,23,37,31,61,45,41,32]

In [106]:
bins=[18,25,35,60,100]

In [107]:
cats=pd.cut(ages,bins)#cut函数按照区段划分，并用原始值所在的区段替换原始值

In [108]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [109]:
cats.codes#codes属性可以查看所属区段的标签（类似于索引？）

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [110]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [111]:
pd.value_counts(cats)#value_counts()方法统计出现次数#注意区间是左开右闭的，可以通过修改right=False设置

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [112]:
pd.cut(ages,[18,26,36,61,100],right=False)#变成了左闭右开

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [113]:
pd.cut(ages,[18,26,36,61,100],left=True)
#由于要保持区间拼接起来的完整性，所以必有一端且仅有一端是闭，因此不能也没有必要用left进行设置

TypeError: cut() got an unexpected keyword argument 'left'

In [114]:
group_names=['Youth','YoungAdult','MiddleAge','Senior']

In [116]:
pd.cut(ages,bins,labels=group_names)
#传递一个列表或者数组，可以重新命名面元

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAge, MiddleAge, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAge < Senior]

In [118]:
#还可以只传递面元数量，会自动根据数据的range计算登场面元
data=np.random.rand(20)

In [119]:
data

array([0.62182926, 0.45395201, 0.48436005, 0.13946938, 0.28087636,
       0.25042932, 0.77138892, 0.05870122, 0.17980941, 0.54166797,
       0.73900562, 0.22262933, 0.75244883, 0.11040185, 0.32580145,
       0.71720852, 0.61414052, 0.86039751, 0.51367531, 0.37224073])

In [120]:
pd.cut(data,4,precision=2)#4表示指定面元数量是4，然后使用precision=2指定了精度，注意默认是左开右闭区间

[(0.46, 0.66], (0.26, 0.46], (0.46, 0.66], (0.058, 0.26], (0.26, 0.46], ..., (0.66, 0.86], (0.46, 0.66], (0.66, 0.86], (0.46, 0.66], (0.26, 0.46]]
Length: 20
Categories (4, interval[float64]): [(0.058, 0.26] < (0.26, 0.46] < (0.46, 0.66] < (0.66, 0.86]]

In [None]:
#qcut使用样本分位数进行面元划分，因此可以得到大小基本相等的面元

In [121]:
data=np.random.randn(100)#正态分布的（伪）随机数

In [122]:
data

array([ 0.18156922,  0.75471204,  1.31287545,  1.76769626,  2.39895088,
       -0.10647088,  1.11200596,  0.5577904 , -0.37110903, -0.87993351,
       -0.13684634, -0.41042073,  0.47790529,  0.02260873,  1.01476108,
       -0.32606889, -0.51520148,  0.54114781,  2.05002506, -0.56965262,
        0.57016815,  1.16683249, -0.33085313,  1.04130148,  0.17202241,
       -0.79806055,  0.24040129,  2.54161518,  0.08406112, -0.52697094,
        0.65986047, -0.04624578,  0.62296617,  1.85498416,  0.64744632,
       -0.6955271 ,  0.00482141,  0.70437369,  0.2487607 ,  0.02602737,
        0.2617902 ,  1.06707795,  0.87968389, -0.51708455,  2.35075904,
        1.38014195,  0.61545594, -0.30644547,  0.28414101,  0.37368098,
        1.51934938,  0.08149754,  1.65468072, -1.15219961, -0.92776274,
       -0.49689531,  0.90552317, -0.01296224, -0.67569621,  0.47926466,
       -0.34796186,  1.25555379,  1.43629177,  0.25045354, -0.5667913 ,
        1.01282602, -0.30000318, -0.12950214, -0.20457111, -0.08

In [123]:
cats=pd.qcut(data,4)#按样本分位数划分为四组

In [124]:
cats

[(-0.335, 0.245], (0.245, 0.928], (0.928, 2.542], (0.928, 2.542], (0.928, 2.542], ..., (-0.335, 0.245], (-2.811, -0.335], (-2.811, -0.335], (-2.811, -0.335], (0.928, 2.542]]
Length: 100
Categories (4, interval[float64]): [(-2.811, -0.335] < (-0.335, 0.245] < (0.245, 0.928] < (0.928, 2.542]]

In [125]:
pd.value_counts(cats)#qcut和cut不同之处大概可以概括为，cut等长度划分，qcut等数量划分

(0.928, 2.542]      25
(0.245, 0.928]      25
(-0.335, 0.245]     25
(-2.811, -0.335]    25
dtype: int64

In [126]:
#与cut类似，也可以传递指定的分位数
pd.qcut(data,[0,0.1,0.5,0.9,1])

[(-0.874, 0.245], (0.245, 1.32], (0.245, 1.32], (1.32, 2.542], (1.32, 2.542], ..., (-0.874, 0.245], (-2.811, -0.874], (-0.874, 0.245], (-2.811, -0.874], (0.245, 1.32]]
Length: 100
Categories (4, interval[float64]): [(-2.811, -0.874] < (-0.874, 0.245] < (0.245, 1.32] < (1.32, 2.542]]

In [None]:
#cut和qcut的离散化，使得可以对数据进行分组。本质上是减小原始数据的精度来实现。这给探索数据的分布带来了方便

In [None]:
#检测和过滤异常值
#outlier

In [127]:
data=pd.DataFrame(np.random.randn(1000,4))

In [128]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.005838,-0.024867,0.018869,-0.079429
std,0.988288,1.000115,1.034991,1.034195
min,-3.24515,-3.492053,-3.609538,-3.821295
25%,-0.61525,-0.703,-0.711321,-0.75167
50%,0.063302,0.022452,0.036175,-0.115155
75%,0.667156,0.665252,0.731686,0.611264
max,2.991961,3.009897,3.124429,3.21866


In [129]:
col=data[2]#选出某列

In [130]:
col[np.abs(col)>3]#找出该列里绝对值大于3的

649    3.124429
778   -3.609538
Name: 2, dtype: float64

In [None]:
#选出全部含有绝对值大于3的值的行，对布尔型DataFrame使用any方法

In [135]:
data[(np.abs(data)>3).any(1)]
#

Unnamed: 0,0,1,2,3
127,0.258046,3.009897,0.926244,0.092831
241,0.198804,-1.984379,-2.137325,3.21866
245,-0.767031,-3.17033,2.545222,-0.051183
269,-0.916499,0.558001,-1.903626,-3.18193
322,-3.24515,0.23454,-1.837876,-1.045734
557,-2.293076,0.248379,-0.160251,-3.821295
649,-0.727293,0.066213,3.124429,-0.541042
778,0.292105,-0.755574,-3.609538,0.159465
906,-0.744535,-3.492053,-0.395048,1.266159
914,-0.426876,0.392997,0.291628,3.037749


In [139]:
data[(np.abs(data)>3).any(1)]
#测试了一下，any后的参数好像必须是1

Unnamed: 0,0,1,2,3
127,0.258046,3.009897,0.926244,0.092831
241,0.198804,-1.984379,-2.137325,3.21866
245,-0.767031,-3.17033,2.545222,-0.051183
269,-0.916499,0.558001,-1.903626,-3.18193
322,-3.24515,0.23454,-1.837876,-1.045734
557,-2.293076,0.248379,-0.160251,-3.821295
649,-0.727293,0.066213,3.124429,-0.541042
778,0.292105,-0.755574,-3.609538,0.159465
906,-0.744535,-3.492053,-0.395048,1.266159
914,-0.426876,0.392997,0.291628,3.037749


In [140]:
data[np.abs(data)>3]=np.sign(data)*3#将绝对值大于3的限制在-3到3之间。

In [141]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.006083,-0.02412,0.019354,-0.078682
std,0.987511,0.997688,1.032663,1.03022
min,-3.0,-3.0,-3.0,-3.0
25%,-0.61525,-0.703,-0.711321,-0.75167
50%,0.063302,0.022452,0.036175,-0.115155
75%,0.667156,0.665252,0.731686,0.611264
max,2.991961,3.0,3.0,3.0


In [149]:
data[np.abs(data)>2]=np.sign(data)*2#将绝对值大于5的限制在-2到2之间。

In [150]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.010095,-0.017356,0.016496,-0.079197
std,0.952111,0.962564,0.989858,0.986014
min,-2.0,-2.0,-2.0,-2.0
25%,-0.61525,-0.703,-0.711321,-0.75167
50%,0.063302,0.022452,0.036175,-0.115155
75%,0.667156,0.665252,0.731686,0.611264
max,2.0,2.0,2.0,2.0


In [153]:
np.sign(data).head()#sign函数获取值的正负

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,-1.0
1,1.0,1.0,-1.0,-1.0
2,-1.0,1.0,1.0,1.0
3,1.0,-1.0,1.0,-1.0
4,1.0,1.0,1.0,1.0


In [154]:
#排列和随机采样

In [155]:
#numpy.random.permutation函数
df=pd.DataFrame(np.arange(5*4).reshape((5,4)))

In [156]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [163]:
sampler=np.random.permutation(5)

In [164]:
sampler

array([4, 1, 2, 0, 3])

In [165]:
df.take(sampler)#按sampler重新索引后的df

Unnamed: 0,0,1,2,3
4,16,17,18,19
1,4,5,6,7
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15


In [169]:
df.sample(n=3)#多运行几次，会发现，是随机选取n=3行数据，默认是无放回抽取。不是用替换的方式选取

Unnamed: 0,0,1,2,3
4,16,17,18,19
1,4,5,6,7
3,12,13,14,15


In [170]:
choices=pd.Series([5,7,-1,6,4])

In [171]:
draws=choices.sample(n=10,replace=True)#replace参数设置为有放回抽取，抽取的样本数可以大于总体数

In [172]:
draws

0    5
0    5
2   -1
3    6
3    6
0    5
4    4
2   -1
0    5
2   -1
dtype: int64

In [173]:
draws=choices.sample(n=10)#不设置有放回抽取，则不可能抽取大于总体数的样本

ValueError: Cannot take a larger sample than population when 'replace=False'

In [174]:
df.sample(n=6)#同样也报错。

ValueError: Cannot take a larger sample than population when 'replace=False'

In [175]:
df.sample(n=8,replace=True)#设置为有放回后，同样可以抽取样本数大于总体数的样本

Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
2,8,9,10,11
4,16,17,18,19
0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3


In [None]:
#计算指标/哑变量
#将分类变量转换为哑变量或指标矩阵

In [177]:
df=pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})

In [178]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [180]:
pd.get_dummies(df['key'])
#get_dummies函数可以实现从一个某列含有k个不同值的DF数据派生出一个k列矩阵或新的DF
#get_dummies()的原理是怎么回事呢？
#由于df的key列有a,b,c三个不同值，因此get_dummies()函数就生成了一个三列的DF数据，每列的列名分别是a,b,c
#然后下方每一行的取值就是根据原始数据中，对应行的索引位置是否是列名相应的值，是则为1，否则为0.
#完整形式pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False)
#参考 ：https://blog.csdn.net/qq_36523839/article/details/80382924

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [181]:
#get_dummies()的prefix参数可以为该DF数据的列加上一个前缀，以便于跟其他数据进行合并
dummies=pd.get_dummies(df['key'],prefix='key')

In [182]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [185]:
pd.get_dummies??

In [186]:
mnames=['movie_id','title','genres']

In [187]:
movies=pd.read_table('datasets/movielens/movies.dat',sep='::',header=None,names=mnames)
#读取较多的数据
#书上用的read_table,会有警告

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


In [188]:
movies[:10]#前十行
#观察发现，genres的取值包括多个，如果需要如上判断该电影的genres是否包含某个值，就比较复杂
#首先需要将每一行包含的genres分离开

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [189]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [190]:
all_genres=[]
for x in movies.genres:
    all_genres.extend(x.split('|'))
#原始的genres是使用|分割的多个genres，需要提取不同的genres，而不是不同的所有组合
#构建初始空列表，然后添加genres中被|分割的元素进去

In [191]:
genres=pd.unique(all_genres)
#使用unique函数选出不同的genres

In [192]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [193]:
all_genres

['Animation',
 "Children's",
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Drama',
 'Thriller',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Action',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Drama',
 'Drama',
 'Sci-Fi',
 'Adventure',
 'Romance',
 "Children's",
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'War',
 'Action',
 'Crime',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Crime',
 'Thrill

In [210]:
#构造指标DF，从全零的DF开始迭代
zero_matrix=np.zeros((len(movies),len(genres)))#使用了len()函数获取长度，构造相应的初始零矩阵

In [217]:
zero_matrix#零矩阵

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [218]:
dummies=pd.DataFrame(zero_matrix,columns=genres)#从零矩阵构造DF，指定列为没有重复值的genres

In [219]:
dummies[:10]

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [220]:
#开始构造迭代每一部电影的方法
gen=movies.genres[0]#对于第一部，查看其genres列的原始值

In [221]:
gen

"Animation|Children's|Comedy"

In [222]:
gen.split('|')#然后使用split()方法获得原始值所包含的不同的genres

['Animation', "Children's", 'Comedy']

In [223]:
dummies.columns.get_indexer(gen.split('|'))#然后根据原始值所包含的不同的genres值，获取这些值的索引

array([0, 1, 2], dtype=int64)

In [224]:
#根据索引，使用iloc设定值
for i,gen in enumerate(movies.genres):
    indices=dummies.columns.get_indexer(gen.split('|'))#找出第i列的genres的原始值所包含的不同的genres值的对应的索引
    dummies.iloc[i,indices]=1#把dummies中相应索引的零值改为1

In [225]:
dummies[:10]#可以看到，上述循环已经根据每部电影的genres列所包含的单独的genres的对应的指标赋值为1了。

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
movies_windic=movies.join(dummies.add_prefix('Genre_'))#里边使用add_prefix()参数为列（也就是单独的genres）增加前缀
#外边使用join方法，按照行索引，将原始的movies数据和表征了genres的dummies矩阵合并

In [205]:
movies_windic.iloc[0]#取合并矩阵的第一行，可以看到列包括了movies和dummies的列的并集

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [206]:
movies_windic

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Crime,Genre_Thriller,Genre_Horror,Genre_Sci-Fi,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,Heat (1995),Action|Crime|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,Sabrina (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Tom and Huck (1995),Adventure|Children's,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,Sudden Death (1995),Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,GoldenEye (1995),Action|Adventure|Thriller,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#对于很大的数据，这种方法构建多成员指标变量会变得非常慢，最好使用更低级的函数，将其写入NumPy数组，然后结果包装在DF数据中。

In [226]:
#结合get_dummies()和cut()离散化函数
np.random.seed(12345)

In [227]:
values=np.random.rand(10)

In [228]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [229]:
bins=[0,0.2,0.4,0.6,0.8,1]

In [231]:
pd.get_dummies(pd.cut(values,bins))#m每一行只有一个1，表示该索引对应的值所落在的区间

   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           0           0           0           0           1
1           0           1           0           0           0
2           1           0           0           0           0
3           0           1           0           0           0
4           0           0           1           0           0
5           0           0           1           0           0
6           0           0           0           0           1
7           0           0           0           1           0
8           0           0           0           1           0
9           0           0           0           1           0

In [232]:
pd.cut(values,bins)

[(0.8, 1.0], (0.2, 0.4], (0.0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1.0], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, interval[float64]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [None]:
#7.3 字符串操作


#字符串对象方法

In [235]:
val='a,b, guido'

In [277]:
type(val)

str

In [236]:
val.split(',')#split()方法，按指定分隔符对字符串进行分割

['a', 'b', ' guido']

In [278]:
type(val.split(','))

list

In [237]:
#联合strip去除空白字符
pieces=[x.strip() for x in val.split(',')]

In [238]:
pieces#注意观察，guido前的空格被去掉了

['a', 'b', 'guido']

In [239]:
first,second,third=pieces

In [240]:
first+'::'+second+'::'+third#并不实用

'a::b::guido'

In [241]:
'::'.join(pieces)#向字符串的join方法传递列表或数组作为参数，就会用字符串将列表的值连起来

'a::b::guido'

In [264]:
lis1=str(list(np.arange(10)))

In [265]:
lis1

'[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]'

In [266]:
'+'.join(lis1)#数据类型不对

'[+0+,+ +1+,+ +2+,+ +3+,+ +4+,+ +5+,+ +6+,+ +7+,+ +8+,+ +9+]'

In [271]:
lis='证号,姓名,性别,出生日期,单位（学院）,专业,读者状态,读者类型,学生类别,有效期,联系电话,物理卡号,条码号,预设口令,操作代码'

In [275]:
lis.split(',')#为什么结果会换行？

['证号',
 '姓名',
 '性别',
 '出生日期',
 '单位（学院）',
 '专业',
 '读者状态',
 '读者类型',
 '学生类别',
 '有效期',
 '联系电话',
 '物理卡号',
 '条码号',
 '预设口令',
 '操作代码']

In [276]:
type(lis)

str

In [279]:
type(lis.split(','))

list

In [280]:
'::'.join(list(lis.split))#因为是中文的吗？

TypeError: 'builtin_function_or_method' object is not iterable

In [281]:
lis='ZH,XM,XB,CSRQ,DW,ZY,DZZT,LX,LB,ENDTIME,PHONE,CARD,BARCODE,AC,DM'

In [282]:
'::'.join(lis.split(','))#果然是因为中文的原因

'ZH::XM::XB::CSRQ::DW::ZY::DZZT::LX::LB::ENDTIME::PHONE::CARD::BARCODE::AC::DM'

In [283]:
'guido'in val#检测字串

True

In [284]:
val.index(',')#同样是找字符串，index找不到会报错

1

In [287]:
val.index(':')

ValueError: substring not found

In [286]:
val.find(':')#find不会报错，会返回-1

-1

In [288]:
val.count(',')#字符串的count方法返回字串出现次数

2

In [289]:
val.replace(',',':')

'a:b: guido'

In [290]:
f=open('dis.txt')

In [294]:
pd.read_csv(f)

Unnamed: 0,证号,姓名,性别,出生日期,单位（学院）,专业,读者状态,读者类型,有效期,联系电话,物理卡号,条码号,预设口令,操作代码
0,20181001,方世玉,男,1963.02.12,艺术学院,艺术设计研一,24,0,20190930.0,,86571275,,,1
1,20172001,胡斐,男,1975.02.10,制造科学学院,工业设计研二,24,0,20190930.0,,60313099,,,1
2,20173001,韦一笑,女,1974.07.11,新闻学院,网络与新媒体研二,24,0,20190930.0,,19700363,,,1
3,20184001,谢逊,男,1986.11.01,文学学院,文艺与传媒研一,24,0,20190930.0,,50506731,,,1
4,20185001,黄蓉,女,1986.02.10,文学学院,文艺与传媒研一,24,0,20190930.0,,3539083,,,1
5,20176001,李莫愁,女,1983.02.01,法学院,国际法学研二,24,0,,20190930.0,99347475,,,1


In [298]:
f=open('C:\\Users\\Administrator\\PythonforDA\\dis.txt')

In [300]:
pd.read_csv(f)

Unnamed: 0,证号,姓名,性别,出生日期,单位（学院）,专业,读者状态,读者类型,有效期,联系电话,物理卡号,条码号,预设口令,操作代码
0,20181001,方世玉,男,1963.02.12,艺术学院,艺术设计研一,24,0,20190930.0,,86571275,,,1
1,20172001,胡斐,男,1975.02.10,制造科学学院,工业设计研二,24,0,20190930.0,,60313099,,,1
2,20173001,韦一笑,女,1974.07.11,新闻学院,网络与新媒体研二,24,0,20190930.0,,19700363,,,1
3,20184001,谢逊,男,1986.11.01,文学学院,文艺与传媒研一,24,0,20190930.0,,50506731,,,1
4,20185001,黄蓉,女,1986.02.10,文学学院,文艺与传媒研一,24,0,20190930.0,,3539083,,,1
5,20176001,李莫愁,女,1983.02.01,法学院,国际法学研二,24,0,,20190930.0,99347475,,,1


In [314]:
#正则表达式

In [315]:
import re

In [316]:
text="foo      bar\t baz      \tqux"

In [317]:
re.split('\s+',text)#拆分分隔符为数量不定的空白符（制表符、空格、换行） \s+ 代表一个或多个空白符

['foo', 'bar', 'baz', 'qux']

In [319]:
regex=re.compile('\s+')#编译

In [320]:
regex.split(text)#调用编译

['foo', 'bar', 'baz', 'qux']

In [321]:
regex.findall(text)#得到所有被匹配到的模式

['      ', '\t ', '      \t']

In [None]:
#多多个字符串应用同一条正则表达式，先编译可节省cpu时间

In [322]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [323]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [324]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [325]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [326]:
m = regex.search(text)#search方法返回第一个匹配到的对象

In [327]:
m

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

In [328]:
text[m.start():m.end()]

'dave@google.com'

In [330]:
print(regex.match(text))#match方法只匹配出现在字符串开始的模式

None


In [331]:
print(regex.sub('REDACTED', text))#sub方法替换匹配到的结果为指定字符串

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [332]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [335]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [336]:
m = regex.match('wesm@bright.net')

In [337]:
m

<_sre.SRE_Match object; span=(0, 15), match='wesm@bright.net'>

In [338]:
m.groups()#group 方法返回由模式各部分组成的元组对象

('wesm', 'bright', 'net')

In [339]:
regex.findall(text)#findall方法返回元组构成的列表

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [None]:
#正则表达式博大精深，需要一整本书来讲解

In [None]:
# pandas的矢量化字符串函数


In [340]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com','Rob': 'rob@gmail.com', 'Wes': np.nan}

In [341]:
data

{'Dave': 'dave@google.com',
 'Steve': 'steve@gmail.com',
 'Rob': 'rob@gmail.com',
 'Wes': nan}

In [342]:
data = pd.Series(data)

In [343]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [344]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [345]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [346]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [347]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [348]:
matches = data.str.match(pattern, flags=re.IGNORECASE)

In [349]:
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [350]:
matches.str.get(1)

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [351]:
matches.str[0]

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [352]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object