## Pandas 技巧

In [14]:
import pandas as pd
# Load the tab-separated location records; the file has no header row, so
# column names are supplied explicitly.
# dtypes are spelled as strings / builtins: the `pd.np` alias was removed in
# pandas 2.0 and `np.int` in NumPy 1.24, so the old spelling fails on current
# releases. `int` is exactly what `np.int` used to alias.
locations = pd.read_csv(
    './data/locations.gz',
    sep='\t',
    header=None,
    names=['phone', 'latitude', 'longitude', 'datetime', 'duration'],
    dtype={'latitude': 'float32', 'longitude': 'float32', 'duration': int},
)
locations.head()

Unnamed: 0,phone,latitude,longitude,datetime,duration
0,00050C1891C1DBF6C9ECA6ADED2BE6DE,19.289,-99.005997,15/10/2014 22:10:41,48
1,00050C1891C1DBF6C9ECA6ADED2BE6DE,19.289,-99.005997,15/10/2014 21:09:28,78
2,0006EBC091C7E79FA9BF9164528929F8,19.463314,-99.146431,15/10/2014 13:01:33,22
3,0006EBC091C7E79FA9BF9164528929F8,19.471701,-99.140297,15/10/2014 07:07:51,29
4,0006EBC091C7E79FA9BF9164528929F8,19.463314,-99.146431,15/10/2014 12:12:09,26


### 将以字符串存储的日期转换为 datetime 对象

In [16]:
# The raw column holds day-first strings such as "15/10/2014 22:10:41";
# passing an explicit format keeps the parsing unambiguous.
locations['datetime'] = pd.to_datetime(
    locations['datetime'],
    format="%d/%m/%Y %H:%M:%S",
)
locations['datetime'].head()

0   2014-10-15 22:10:41
1   2014-10-15 21:09:28
2   2014-10-15 13:01:33
3   2014-10-15 07:07:51
4   2014-10-15 12:12:09
Name: datetime, dtype: datetime64[ns]

### 按给定时间段resample并填充

对于每个用户
1. 合并30分钟之内的duration并求和
2. 如果某个30分钟时间段内没有任何记录，该时间段在结果中会是 NaN，或者根本不会出现；对这些时间段统一填充 0

In [39]:
# Pick the records of a single user to demonstrate on.
sub = locations.groupby('phone').get_group('000A8481F9274EEA3F994376973B23CE')

# resample() needs a DatetimeIndex, so index by timestamp first, then total
# the durations inside each 30-minute bin.
# '30min' is used instead of the '30T' alias, which is deprecated in recent
# pandas releases; '30min' is accepted by old and new versions alike.
# NOTE: recent pandas versions return 0 (not NaN) for bins that contain no rows.
samples = sub.set_index('datetime')['duration'].resample('30min').sum()
samples

datetime
2014-10-15 16:00:00    122.0
2014-10-15 16:30:00      NaN
2014-10-15 17:00:00      NaN
2014-10-15 17:30:00      NaN
2014-10-15 18:00:00    113.0
2014-10-15 18:30:00      NaN
2014-10-15 19:00:00    441.0
2014-10-15 19:30:00      NaN
2014-10-15 20:00:00    411.0
Freq: 30T, Name: duration, dtype: float64

1. 结果里面有NaN值
2. 时间段不完整，只有16:00 到 20:00
3. 需要填充一天的所有时间段

In [42]:
# All 48 half-hour slots of 2014-10-15. The start date is written in ISO form
# so it is not subject to day-first / month-first guessing, and '30min'
# replaces the deprecated '30T' alias.
ranges = pd.date_range('2014-10-15', freq='30min', periods=48)

# Align to the full day; slots that were NaN or absent altogether become 0.
# A single fillna after reindex covers both cases.
samples = samples.reindex(ranges).fillna(0)
samples

2014-10-15 00:00:00      0.0
2014-10-15 00:30:00      0.0
2014-10-15 01:00:00      0.0
2014-10-15 01:30:00      0.0
2014-10-15 02:00:00      0.0
2014-10-15 02:30:00      0.0
2014-10-15 03:00:00      0.0
2014-10-15 03:30:00      0.0
2014-10-15 04:00:00      0.0
2014-10-15 04:30:00      0.0
2014-10-15 05:00:00      0.0
2014-10-15 05:30:00      0.0
2014-10-15 06:00:00      0.0
2014-10-15 06:30:00      0.0
2014-10-15 07:00:00      0.0
2014-10-15 07:30:00      0.0
2014-10-15 08:00:00      0.0
2014-10-15 08:30:00      0.0
2014-10-15 09:00:00      0.0
2014-10-15 09:30:00      0.0
2014-10-15 10:00:00      0.0
2014-10-15 10:30:00      0.0
2014-10-15 11:00:00      0.0
2014-10-15 11:30:00      0.0
2014-10-15 12:00:00      0.0
2014-10-15 12:30:00      0.0
2014-10-15 13:00:00      0.0
2014-10-15 13:30:00      0.0
2014-10-15 14:00:00      0.0
2014-10-15 14:30:00      0.0
2014-10-15 15:00:00      0.0
2014-10-15 15:30:00      0.0
2014-10-15 16:00:00    122.0
2014-10-15 16:30:00      0.0
2014-10-15 17:

### 获得数据里面的全部日期

In [48]:
# Distinct calendar dates in the data, via the built-in .dt accessor rather
# than a per-element Python lambda (requires the column to be datetime64).
locations['datetime'].dt.date.unique()

array([datetime.date(2014, 10, 15)], dtype=object)

### 数据过滤的手段

In [50]:
# Step 1: discard rows whose duration is missing.
locations = locations.loc[locations['duration'].notna()]

# Step 2: keep only rows with a duration strictly greater than 10.
locations_filtered = locations.loc[locations['duration'] > 10]
locations_filtered.head()

Unnamed: 0,phone,latitude,longitude,datetime,duration
0,00050C1891C1DBF6C9ECA6ADED2BE6DE,19.289,-99.005997,2014-10-15 22:10:41,48
1,00050C1891C1DBF6C9ECA6ADED2BE6DE,19.289,-99.005997,2014-10-15 21:09:28,78
2,0006EBC091C7E79FA9BF9164528929F8,19.463314,-99.146431,2014-10-15 13:01:33,22
3,0006EBC091C7E79FA9BF9164528929F8,19.471701,-99.140297,2014-10-15 07:07:51,29
4,0006EBC091C7E79FA9BF9164528929F8,19.463314,-99.146431,2014-10-15 12:12:09,26
