## 3.11 向量化字符串操作

### 3.11.1 Pandas字符串操作简介 

In [1]:
import numpy as np 
import pandas as pd

In [3]:
x = np.array([2, 3, 5, 7, 11, 13])        
x * 2 

array([ 4,  6, 10, 14, 22, 26])

In [5]:
data = ['peter', 'Paul', 'MARY', 'gUIDO'] 
[s.capitalize() for s in data] 

['Peter', 'Paul', 'Mary', 'Guido']

In [6]:
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']        
[s.capitalize() for s in data] 

AttributeError: 'NoneType' object has no attribute 'capitalize'

In [8]:
names = pd.Series(data)
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [9]:
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

### 3.11.2 Pandas字符串方法列表

#### 1. 与Python字符串方法相似的方法 

In [10]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                   'Eric Idle', 'Terry Jones', 'Michael Palin'])

In [11]:
monte.str.lower() 

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [12]:
monte.str.len() 

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [18]:
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [21]:
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

#### 2. 使用正则表达式的方法

In [24]:
monte.str.extract('([A-Za-z]+)')

Unnamed: 0,0
0,Graham
1,John
2,Terry
3,Eric
4,Terry
5,Michael


In [29]:
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

#### 3. 其他字符串方法 

##### (1)向量化字符串的取值与切片操作

In [30]:
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [37]:
#相同效果
monte.str.split().str[-1]
monte.str.split().str.get(-1)

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

##### (2)指标变量

In [38]:
full_monte = pd.DataFrame({'name': monte,                           
                           'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C',
                                    'B|C|D']}) 
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [40]:
full_monte['info'].str.get_dummies('|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


### 3.11.3 案例：食谱数据库 

In [13]:
#方法一
try:
    recipes = pd.read_json('PythonDataScienceHandbook-master/notebooks/data/20170107-061401-recipeitems.json', encoding='utf-8', lines = True)
except ValueError as e:
    print("ValueError:", e)

In [3]:
#供测试使用
with open('PythonDataScienceHandbook-master/notebooks/data/20170107-061401-recipeitems.json') as f:
    line = f.readline()
pd.read_json(line).shape

(2, 12)

In [11]:
#方法二：
with open('PythonDataScienceHandbook-master/notebooks/data/20170107-061401-recipeitems.json',encoding='utf-8', mode = 'r') as f:
    data = (line.strip() for line in f)
    data_json = "[{0}]".format(','.join(data))
recipes = pd.read_json(data_json) 

In [14]:
recipes.shape 

(173278, 17)

In [15]:
recipes.iloc[0]

_id                                {'$oid': '5160756b96cc62079cc2db15'}
cookTime                                                          PT30M
creator                                                             NaN
dateModified                                                        NaN
datePublished                                                2013-03-11
description           Late Saturday afternoon, after Marlboro Man ha...
image                 http://static.thepioneerwoman.com/cooking/file...
ingredients           Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
name                                    Drop Biscuits and Sausage Gravy
prepTime                                                          PT10M
recipeCategory                                                      NaN
recipeInstructions                                                  NaN
recipeYield                                                          12
source                                                  thepione

In [17]:
recipes.ingredients.str.len().describe()

count    173278.000000
mean        244.617926
std         146.705285
min           0.000000
25%         147.000000
50%         221.000000
75%         314.000000
max        9067.000000
Name: ingredients, dtype: float64

In [22]:
recipes.name[np.argmax(recipes.ingredients.str.len())] 

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  return getattr(obj, method)(*args, **kwds)


'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

In [24]:
recipes.description.str.contains('[Bb]reakfast').sum()

3524

In [25]:
recipes.ingredients.str.contains('[Cc]innamon').sum()

10526

In [26]:
recipes.ingredients.str.contains('[Cc]inamon').sum()

11

#### 1.制作简易的美食推荐系统

In [38]:
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
             'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']
import re
spice_df = pd.DataFrame(dict((spice, recipes.ingredients.str.contains(spice, re.IGNORECASE))
                            for spice in spice_list))
spice_df.head()

Unnamed: 0,salt,pepper,oregano,sage,parsley,rosemary,tarragon,thyme,paprika,cumin
0,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,True,True,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [39]:
selection = spice_df.query('parsley & paprika & tarragon')
len(selection)

10

In [40]:
recipes.name[selection.index] 

2069      All cremat with a Little Gem, dandelion and wa...
74964                         Lobster with Thermidor butter
93768      Burton's Southern Fried Chicken with White Gravy
113926                     Mijo's Slow Cooker Shredded Beef
137686                     Asparagus Soup with Poached Eggs
140530                                 Fried Oyster Po’boys
158475                Lamb shank tagine with herb tabbouleh
158486                 Southern fried chicken in buttermilk
163175            Fried Chicken Sliders with Pickles + Slaw
165243                        Bar Tartine Cauliflower Salad
Name: name, dtype: object

#### 2.继续完善美食推荐系统 

>这其实也揭示了数据科学的真相——真实数据的清洗与整理工作往往会占据的大部分时间，而使用 Pandas 提供的工具可以提高你的工作效率。 

## 3.12 处理时间序列

### 3.12.1 Python的日期与时间工具 

#### 1.原生Python的日期与时间工具：datetime与dateutil

In [44]:
from datetime import datetime
datetime(year = 2015, month = 7, day = 4)

datetime.datetime(2015, 7, 4, 0, 0)

In [47]:
from dateutil import parser
date = parser.parse("4, July, 2015")
date

datetime.datetime(2015, 7, 4, 0, 0)

In [48]:
date.strftime('%A')

'Saturday'

#### 2.时间类型数组：NumPy的datetime64类型 

In [50]:
import numpy as np
date = np.array('2015-07-04', dtype = np.datetime64)
date

array('2015-07-04', dtype='datetime64[D]')

In [54]:
date + np.arange(12)

array(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
       '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
       '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
      dtype='datetime64[D]')

In [55]:
np.datetime64('2015-07-04')

numpy.datetime64('2015-07-04')

In [57]:
np.datetime64('2015-07-04 12:00')

numpy.datetime64('2015-07-04T12:00')

In [58]:
np.datetime64('2015-07-04 12:59:59.50', 'ns')

numpy.datetime64('2015-07-04T12:59:59.500000000')

#### 3.Pandas的日期与时间工具：理想与现实的最佳解决方案 

In [60]:
import pandas as pd
date = pd.to_datetime("4th of July, 2015")
date

Timestamp('2015-07-04 00:00:00')

In [63]:
date.strftime('%A')

'Saturday'

In [73]:
date + pd.to_timedelta(np.arange(12), 'D')

DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
               '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
               '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
              dtype='datetime64[ns]', freq=None)

### 3.12.2 Pandas时间序列：用时间作索引 

In [75]:
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04',                                   
                           '2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index = index)
data

2014-07-04    0
2014-08-04    1
2015-07-04    2
2015-08-04    3
dtype: int64

In [76]:
data['2014-07-04':'2015-07-04'] 

2014-07-04    0
2014-08-04    1
2015-07-04    2
dtype: int64

In [77]:
data['2015']

2015-07-04    2
2015-08-04    3
dtype: int64

### 3.12.3 Pandas时间序列数据结构 

In [86]:
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',                                
                        '2015-Jul-6', '07-07-2015', '20150708'])         
dates

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
               '2015-07-08'],
              dtype='datetime64[ns]', freq=None)

In [87]:
dates.to_period('D') 

PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
             '2015-07-08'],
            dtype='period[D]', freq='D')

In [88]:
dates - dates[0]

TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)

In [89]:
 pd.date_range('2015-07-03', '2015-07-10') 

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [90]:
pd.date_range('2015-07-03', periods=8) 

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [91]:
 pd.date_range('2015-07-03', periods=8, freq='H') 

DatetimeIndex(['2015-07-03 00:00:00', '2015-07-03 01:00:00',
               '2015-07-03 02:00:00', '2015-07-03 03:00:00',
               '2015-07-03 04:00:00', '2015-07-03 05:00:00',
               '2015-07-03 06:00:00', '2015-07-03 07:00:00'],
              dtype='datetime64[ns]', freq='H')

In [95]:
pd.period_range('2015-07-03', periods = 8, freq = 'm')

PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',
             '2016-01', '2016-02'],
            dtype='period[M]', freq='M')

In [96]:
pd.timedelta_range(0, periods=10, freq='H') 

TimedeltaIndex(['00:00:00', '01:00:00', '02:00:00', '03:00:00', '04:00:00',
                '05:00:00', '06:00:00', '07:00:00', '08:00:00', '09:00:00'],
               dtype='timedelta64[ns]', freq='H')

### 3.12.4 时间频率与偏移量

In [97]:
pd.timedelta_range(0, periods = 9, freq = "2H30T")

TimedeltaIndex(['00:00:00', '02:30:00', '05:00:00', '07:30:00', '10:00:00',
                '12:30:00', '15:00:00', '17:30:00', '20:00:00'],
               dtype='timedelta64[ns]', freq='150T')

In [98]:
from pandas.tseries.offsets import BDay
pd.date_range('2015-07-01', periods = 5, freq = BDay())

DatetimeIndex(['2015-07-01', '2015-07-02', '2015-07-03', '2015-07-06',
               '2015-07-07'],
              dtype='datetime64[ns]', freq='B')

### 3.12.5 重新取样、迁移和窗口 

In [103]:
# pd.core.common.is_list_like = pd.api.types.is_list_like
# from pandas_datareader import data 
# goog = data.DataReader('GOOG', start='2004', end='2016',                                
#                         data_source='google')         
# goog.head() 

## 3.13 高性能Pandas：eval()与query()

## 3.14 参考资料