# 经纬度及API调用

* 经度(longitude):东西方向为经，范围是[-180,180]
* 纬度(latitude)：南北方向为纬，范围是[-90,90]

## 经纬度编码格式转换&求距离

[坐标系参考](https://www.jianshu.com/p/01f1bc49ba97)


* **WGS－84原始坐标系**:一般用国际GPS纪录仪记录下来的经纬度，通过GPS定位拿到的原始经纬度，Google和高德地图定位的的经纬度（国外）都是基于WGS－84坐标系的；
    
    但是在国内是不允许直接用WGS84坐标系标注的，必须经过加密后才能使用；


* **GCJ－02坐标系**:又名“火星坐标系”，是我国国测局独创的坐标体系，由**WGS－84**加密而成。在国内必须使用GCJ－02坐标系或者使用GCJ－02加密后再进行加密成 **bd-09** 的坐标系，如百度坐标系。

    高德和Google在国内都是使用GCJ－02坐标系，可以说GCJ－02是国内最广泛使用的坐标系；


* **百度坐标系(bd-09)**:百度坐标系是在GCJ－02坐标系的基础上再次加密偏移后形成的坐标系，只适用于百度地图。

    目前百度API提供了从其它坐标系转换为百度坐标系的API，但却没有从百度坐标系转为其他坐标系的API。

下面是坐标系转换代码，包含：

|原坐标系X|转换函数|转换后的坐标系Y|Y类坐标系|
| :--    |:--    |:--         |:--      |
|WGS-84  |wgs_gcj_encrypt|GCJ-02|高德 和 Google 坐标系|
|GCJ-02  |gcj_bd_encrypt|bd-09|百度坐标系|
|WGS-84  |wgs_baidu_encrypt|bd-09|百度坐标系|

In [2]:
# -*- coding: utf-8 -*-
# 我们拿到的TBOX的经纬度有一点点偏差，大概十几二十米的样子。
# 用在一般的分析上不影响，但如果画到地图上，会出现像车子在黄浦江里的情景
# 调用wgs_baidu_encrypt方法，传入一个latitude,longitude，以list返回纠偏后的lat和lon。

import math
x_pi = math.pi * 3000.0 / 180.0
AA = 6378245.0
EE = 0.00669342162296594323
r = 6371
# ------------- basic functions -------------------


def distance_of_two_points(point_1, point_2):  
    '''
    计算两个经纬度点之间的距离
    type(point) = list
    point = [lat, lon]

    '''
    lat1, lon1, lat2, lon2 = map(math.radians, point_1+point_2)

    diff_lon = lon2 - lon1
    diff_lat = lat2 - lat1
    a = math.sin(diff_lat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(diff_lon / 2) ** 2
    c = 2 * math.asin(math.sqrt(a))
    return c * r * 1000

'''
#  * 经纬度纠偏
#  * wgs_lat 纬度
#  * wgs_lon 经度
#  * GCJ-02转换BD-09
#  * 高德和Google地图经纬度转百度地图经纬度
'''


def wgs_baidu_encrypt(wgs_lat, wgs_lon):
    tmp_lat_lon = wgs_gcj_encrypts(wgLat=wgs_lat, wgLon=wgs_lon)
    return gcj_bd_encrypt(tmp_lat_lon[0], tmp_lat_lon[1])

        
'''
#  * gg_lat 纬度
#  * gg_lon 经度
#  * GCJ-02转换BD-09
#  * 高德和Google地图经纬度转百度地图经纬度
'''


def gcj_bd_encrypt(gg_lat, gg_lon):
    x = gg_lon
    y = gg_lat
    z = math.sqrt(x * x + y * y) + 0.00002 * math.sin(y * x_pi)
    theta = math.atan2(y, x) + 0.000003 * math.cos(x * x_pi)
    bd_lon = z * math.cos(theta) + 0.0065
    bd_lat = z * math.sin(theta) + 0.006
    return [round(bd_lat, 4), round(bd_lon, 4)]


'''
#  * wgLat 纬度
#  * wgLon 经度
#  * BD-09转换GCJ-02
#  * 百度转google
'''
def bd_google_encrypt(bd_lat, bd_lon):
    x = bd_lon - 0.0065
    y = bd_lat - 0.006;  
    z = math.sqrt(x * x + y * y) - 0.00002 * math.sin(y * x_pi)  
    theta = math.atan2(y, x) - 0.000003 * math.cos(x * x_pi) 
    gg_lon = z * math.cos(theta)  
    gg_lat = z * math.sin(theta) 
    return gg_lat, gg_lon


'''
#  * wgLat 纬度
#  * wgLon 经度
#  * WGS-84 到 GCJ-02 的转换（即 GPS 加偏）
'''
def wgs_gcj_encrypts(wgLat, wgLon):
    if outOfChina(wgLat, wgLon):
        return wgLat, wgLon
        
    dLat = transformLat(wgLon - 105.0, wgLat - 35.0)
    dLon = transformLon(wgLon - 105.0, wgLat - 35.0)
    radLat = wgLat / 180.0 * math.pi
    magic = math.sin(radLat)
    magic = 1 - EE * magic * magic
    sqrtMagic = math.sqrt(magic)
    dLat = (dLat * 180.0) / ((AA * (1 - EE)) / (magic * sqrtMagic) * math.pi)
    dLon = (dLon * 180.0) / (AA / sqrtMagic * math.cos(radLat) * math.pi)
    lat = wgLat + dLat
    lon = wgLon + dLon
    return lat, lon



def outOfChina(lat, lon):  
    if lon < 72.004 or lon > 137.8347:
        return True
    if lat < 0.8293 or lat > 55.8271:
        return True
    return False


def transformLat(x, y):
    ret = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
    ret = ret + (20.0 * math.sin(6.0 * x * math.pi) + 20.0 * math.sin(2.0 * x * math.pi)) * 2.0 / 3.0
    ret = ret + (20.0 * math.sin(y * math.pi) + 40.0 * math.sin(y / 3.0 * math.pi)) * 2.0 / 3.0
    ret = ret + (160.0 * math.sin(y / 12.0 * math.pi) + 320 * math.sin(y * math.pi / 30.0)) * 2.0/3.0
    return ret


def transformLon(x, y):
    ret = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
    ret = ret + (20.0 * math.sin(6.0 * x * math.pi) + 20.0 * math.sin(2.0 * x * math.pi)) * 2.0 / 3.0
    ret = ret + (20.0 * math.sin(x * math.pi) + 40.0 * math.sin(x / 3.0 * math.pi)) * 2.0 / 3.0
    ret = ret + (150.0 * math.sin(x / 12.0 * math.pi) + 300.0 * math.sin(x / 30.0 * math.pi)) * 2.0/3.0
    return ret


In [3]:
wgs_baidu_encrypt(wgs_lat=121.325621, wgs_lon=31.056428)

[121.3317, 31.0628]

In [4]:
distance_of_two_points(point_1=[121.3256,31.0564], point_2=[121.2814, 31.0245])

5248.999011991885

## 区域划分网格

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Thu May 16 2018

@author: bofeng chen
"""
import pandas as pd
import itertools
import random
error_sign = '-1'
SH_latitude_range = (30.65, 31.89)
SH_longitude_range = (120.85, 122.04)


def area_split_num(left_point, right_point, diameter):
    """
    输入区域的lat/lon的范围和半径，输出网格数量
    比例尺度 0.01 == 1000miles
    :param left_point: lat/lon边的左端点
    :param right_point: lat/lon边的右端点
    :param diameter: int, 一个网格的半径
    :return: 网格的lat/lon边的分割网格数量
    """
    # 取area的长边、宽边的各自两个点
    lat_lon_len = diameter/float(1000)*0.01
    return int((right_point-left_point)/lat_lon_len)


def area_basic_para(latitude_tuple, longitude_tuple, diameter):
    """
    输入area范围和网格直径，输出网格基本信息
    :param latitude_tuple: tuple, area的纬度范围
    :param longitude_tuple: tuple, area的经度范围
    :param diameter: int, 直径
    :return: 区域lat/lon的基本参数，包括area的范围，对应的网格数量，网格单位长度；编码位数
    """
    # 网格的四个角
    area_left = longitude_tuple[0]
    area_right = longitude_tuple[1]
    area_down = latitude_tuple[0]
    area_up = latitude_tuple[1]

    split_num_list = map(lambda x: area_split_num(left_point=x[0], right_point=x[1], diameter=diameter),
                         [[area_left, area_right], [area_down, area_up]])

    encode_len = max([len(str(i)) for i in split_num_list])

    unit_len_list = [(longitude_tuple[1] - longitude_tuple[0])/split_num_list[0], (latitude_tuple[1] - latitude_tuple[0])/split_num_list[1]]
    longitude_dict = {'range': longitude_tuple, 'split_num': split_num_list[0], 'unit_len': unit_len_list[0]}
    latitude_dict = {'range': latitude_tuple, 'split_num': split_num_list[1], 'unit_len': unit_len_list[1]}

    return longitude_dict, latitude_dict, encode_len


def create_a_grid_table(latitude_dict, longitude_dict, encode_len):
    """
    输入区域基本参数，输出区域的网格编码表
    :param latitude_dict: dict, 区域单边的基本参数
    :param longitude_dict: dict, 区域单边的基本参数
    :param encode_len: 编码长度
    :return: DataFrame, 网格编码表
    """
    lat_num = latitude_dict['split_num']
    lon_num = longitude_dict['split_num']

    grid_info_list = []
    for lon_i in xrange(lon_num):
        lon_center = longitude_dict['range'][0] + lon_i*longitude_dict['unit_len'] + longitude_dict['unit_len']/2
        for lat_j in xrange(lat_num):
            encode_id = ''.join(map(lambda x: x.zfill(encode_len), [str(lon_i), str(lat_j)]))
            lat_center = latitude_dict['range'][0] + lat_j * longitude_dict['unit_len'] + latitude_dict['unit_len'] / 2
            grid_info_list.append([encode_id, str(lon_i), str(lat_j), lon_center, lat_center])
    grid_info_pd = pd.DataFrame(grid_info_list, columns=['grid_id', 'lon_cooridate', 'lat_cooridate', 'lon_center', 'lat_center'])
    return grid_info_pd


def area_side_encode_to_coordinate(side_para_dict, side_tude):
    """
    给出相应lat/lon对应的网格坐标
    :param side_para_dict: dict, 区域单边的基本参数
    :param side_tude: float, latitude or longitude
    :return: side_tude对应的坐标
    """

    side_range = side_para_dict['range']
    unit_len = side_para_dict['unit_len']
    if side_range[0] <= side_tude <= side_range[1]:
        # 进行编码
        side_coordinate = int((side_tude - side_range[0])/unit_len)
        if side_coordinate == side_para_dict['split_num']:
            side_coordinate -= 1
        return str(side_coordinate)
    else:
        return error_sign


def area_encode_to_id(latitude_dict, longitude_dict, encode_len, latitude, longitude):
    """
    输入区域的基本参数和一对经纬度，输出经纬度对应的网格id
    :param latitude_dict: dict, 区域单边的基本参数
    :param longitude_dict: dict, 区域单边的基本参数
    :param encode_len: 编码长度
    :param latitude: 纬度
    :param longitude: 经度
    :return: 经纬度对应的网格id
    """
    lat_cooridate = area_side_encode_to_coordinate(side_para_dict=latitude_dict, side_tude=latitude)
    lon_cooridate = area_side_encode_to_coordinate(side_para_dict=longitude_dict, side_tude=longitude)

    if lat_cooridate != error_sign and lon_cooridate != error_sign:
        encode_id = ''.join(map(lambda x: x.zfill(encode_len), [lon_cooridate, lat_cooridate]))
    else:
        encode_id = error_sign
    return encode_id


def random_choose_grid(grid_news_path, need_points_num, percent_list, random_times):
    """
    输入候选所需网格数量、网格数据在不同分位数下的分布，输出是比赛所需的网格id及相关基础信息
    :param grid_news_path: str，根据500米/1000米为半径的网格编码方式得到的不同分位数的网格以及9~22点之间相应周平均车流量数据csv路径
    :param need_points_num: int,比赛一共需要的网格数量
    :param percent_list: list, 各个分位数所需网格数量的百分比
    :param random_times: int,循环次数
    :return: DataFrame，选取到的网格id信息以及对应的中心点经纬度以及其他字段数据
    """

    grid_news = pd.read_csv(grid_news_path, dtype={'grid_id': str})
    need_news = grid_news[['percent_level', 'grid_id']].drop_duplicates()

    sum_0 = 0
    id_list = 0
    for x in xrange(random_times):
        print sum_0
        grid_list = []
        for label, label_df in need_news.groupby('percent_level'):
            num = int(percent_list[int(label)]*need_points_num)
            label_id_list = list(label_df['grid_id'])
            random.shuffle(label_id_list)
            grid_list.extend(label_id_list[0:num])

        combin_list = list(itertools.combinations(grid_list, 2))

        def distiance_two_int_str(str_tuple):
            int_list = map(lambda x: [int(x[0:3]), int(x[3:])], str_tuple)
            return abs(int_list[0][0] - int_list[1][0]) + abs(int_list[0][1] - int_list[1][1])

        sum_1 = sum(map(lambda x: distiance_two_int_str(x), combin_list))
        if sum_1 > sum_0:
            sum_0 = sum_1
            id_list = grid_list

    choose_grid_df = grid_news[grid_news.grid_id.isin(id_list)]
    df = choose_grid_df[['grid_id', 'lat_center', 'lon_center']].drop_duplicates()
    return choose_grid_df, df


def game_50_grid_basic_information(id_info_df, grid_type):
    """
    输入是网格id和中心经纬度，输出是网格encode后的id以及网格的边界信息
    :param id_info_df: DataFrame，字段包含网格id以及对应的中心点经纬度
    :param grid_type: int, 可选500 或 1000
    :return: DataFrame, 网格id和对应网格的lat上下边界、lon左右边界以及id相应的encode后的id
    """
    assert (grid_type == 500 or grid_type == 1000)
    diameter = grid_type/100000.0
    id_info_df['down_lat'] = id_info_df[['lat_center']].apply(lambda x: x-diameter/2)
    id_info_df['up_lat'] = id_info_df[['lat_center']].apply(lambda x: x + diameter / 2)
    id_info_df['left_lon'] = id_info_df[['lon_center']].apply(lambda x: x - diameter / 2)
    id_info_df['right_lon'] = id_info_df[['lon_center']].apply(lambda x: x + diameter / 2)
    id_info_df['encode_grid_id'] = range(id_info_df.shape[0])
    return id_info_df

# 要写入sql脚本的经纬度归属网格的计算公式
# 当半径定位500米时，
# 网格id的计算公式是：
# lon_cooridate = int((longitude - 120.85)/0.005)
# lat_cooridate = int((latitude - 30.65)/0.005)
# 对应的网格中心计算公式：
# center_longtitude = 120.85 + lon_coordinate * 0.005 +0.005 / 2
# center_latitude = 30.65 + lat_coordinate * 0.005 + 0.005 / 2
#
# 当半径定位1000米时，
# 网格id的计算公式是：
# lon_cooridate = int((longitude - 120.85)/0.01)
# lat_cooridate = int((latitude - 30.65)/0.01)
# 对应的网格中心计算公式：
# center_longtitude = 120.85 + lon_coordinate * 0.01 +0.01 / 2
# center_latitude = 30.65 + lat_coordinate * 0.01 + 0.01 / 2


# 本地测试
# if __name__ == "__main__":
 
#     # 测试网格编码表生成和经纬度所属网格id
#     longitude_dict, latitude_dict, encode_len = area_basic_para(latitude_tuple=SH_latitude_range, longitude_tuple=SH_longitude_range, diameter=1000)
#     grid_info_pd = create_a_grid_table(latitude_dict=latitude_dict, longitude_dict=longitude_dict, encode_len=encode_len)
#     # encode_id = area_encode_to_id(latitude_dict=latitude_dict, longitude_dict=longitude_dict, encode_len=encode_len, latitude=31.89, longitude=121.85)

#     # # 根据候选网格选取比赛所需网格
#     # choose_grid_df, df = random_choose_grid(
#     #     grid_news_path='input/hour_percent_grid_data_500miles_1000vins_e550_20170102_20170108.csv',
#     #     need_points_num=50, percent_list=[0.2, 0.2, 0.3, 0.3], random_times=100000)
#     # choose_grid_df.to_csv('output/50_grid_heat.csv')
#     game_50_grid_basic_information(id_info_df=grid_info_pd, grid_type=500).to_csv('output/encode_50_grid_basic_info.csv', index=False)


## 求站点之间距离和时间

In [3]:
import urllib2
import csv
import json


#读取XA站点和其经纬度
def read_csv_file_for_order_location(path):
    input_file = csv.reader(file(path))
    location_dict = dict()
    for i in input_file:
        location_dict[i[0]] = [float(i[1]), float(i[2])]
    return location_dict


#输入：站点的经纬度
#输出：站点之间的驾车模式的路线距离和时间
#主要通过调用Route Matrix API v2.0来得到输出
def get_distance_and_time_matrix(location_dict):
    distance_data = []

    for ID_1 in location_dict.keys():
        for ID_2 in location_dict.keys():
            if ID_2 != ID_1:
                Ori_Des = (location_dict[ID_1][0], location_dict[ID_1][1], location_dict[ID_2][0], location_dict[ID_2][1],
                           location_dict[ID_1][0], location_dict[ID_1][1], location_dict[ID_2][0], location_dict[ID_2][1])
                http = 'http://api.map.baidu.com/routematrix/v2/driving?output=json&origins=%2.6f,%3.6f|%2.6f,%3.6f&destinations=%2.6f,%3.6f|%2.6f,%3.6f&ak=Mv58QwKzsPcV43H9sMtUDjIe0D84FwDz' % Ori_Des
                request = urllib2.Request(http)
                f = urllib2.urlopen(request)
                data = f.read()
                js = json.loads(data)
                distance_time = ['TRUE',
                                 float(js[u'result'][1][u'distance'][u'value'])/1000,
                                 float(js[u'result'][1][u'duration'][u'value'])/3600,
                                 'TRUE',
                                 float(js[u'result'][2][u'distance'][u'value'])/1000,
                                 float(js[u'result'][2][u'duration'][u'value'])/3600]
                ID = [ID_1, ID_2]
                ID.extend(distance_time)
                distance_data.append(ID)

    csv_file = open('distance.csv', 'wb')
    writer = csv.writer(csv_file)
    writer.writerows(distance_data)
    csv_file.close()
    return distance_data

location_dict = read_csv_file_for_order_location('XA_Location.csv')
distance_data = get_distance_and_time_matrix(location_dict)

IOError: [Errno 2] No such file or directory: 'XA_Location.csv'

## [逆地理编码](http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-geocoding-abroad)

**逆地理编码服务**：用户可通过该功能，将位置坐标解析成对应的行政区划数据以及周边高权重地标地点分布情况，整体描述坐标所在的位置。

In [2]:
# coding:utf-8
import urllib2
import json
import pandas as pd
from tqdm import tqdm
def desc_getpois(api_input_dict, add_columns_list):
    
    if 'extensions_poi' in api_input_dict and api_input_dict['extensions_poi'] is 'null':
        assert api_input_dict['pois'] == 0
    
    lat = api_input_dict['location']['latitude']
    lon = api_input_dict['location']['longitude']
    param_str_list = []
    for key, values in api_input_dict.items():
        if key is not 'location':
            param_str_list.append(key + '=' + str(values))
        else:
            param_str_list.append(key + '=' + str(lat) + ',' + str(lon))
     
    param_str = '&'.join(param_str_list)

    desc_geocoder_api = 'http://api.map.baidu.com/geocoder/v2/?{0}'.format(param_str)
    info_dict = json.loads(urllib2.urlopen(desc_geocoder_api, timeout=3).read())
    if info_dict['status'] == 0:
        return {key: info_dict['result']['addressComponent'][key] for key in ['province', 'city', 'district']}
    else:
        return {key: 'call_api_failed' for key in ['province', 'city', 'district']}

    
   
def df_call_baidu_geocoder_api(ak_df, lat_lon_df, latlon_columns_name_list, add_columns_list):
    
    lat_lon_df['province'] = ''
    lat_lon_df['city'] = ''
    lat_lon_df['district'] = ''
    
    lat_lon_df.reset_index(drop=True, inplace=True)
    for index in tqdm(lat_lon_df.index):
        ak_index = index% ak_df.shape[0]
        ak = ak_df.iloc[ak_index, 1]
        lat = lat_lon_df.loc[index, latlon_columns_name_list[0]]
        lon = lat_lon_df.loc[index, latlon_columns_name_list[1]]
        api_input_dict = {'location': {'latitude': lat, 'longitude': lon}, 
                    'pois':0,'ak':ak, 'output': 'json', 'extensions_poi': 'null', 'language': 'en'}
        info_dict = desc_getpois(api_input_dict=api_input_dict, add_columns_list=add_columns_list)
        lat_lon_df.loc[index, add_columns_list] = [info_dict[col] for col in add_columns_list]
    return lat_lon_df

In [None]:
api_input_dict = {'location': {'latitude': lat, 'longitude': lon}, 
                    'pois':0,'ak':ak, 'output': 'json', 'extensions_poi': 'null', 'language': 'en'}


## [地理编码](http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-geocoding)

**地理编码服务**:用户可通过该功能，将结构化地址（省/市/区/街道/门牌号）解析为对应的位置坐标。地址结构越完整，地址内容越准确，解析的坐标精度越高。

`http://api.map.baidu.com/geocoder/v2/?address=北京市海淀区上地十街10号&output=json&ak=您的ak&callback=showLocation //GET请求`

In [93]:
# coding:utf-8
import requests
import json
import pandas as pd
from tqdm import tqdm

output_type = 'json'
ak = 'joEZrGxLI7hH0Bggf7rVBUPwB9hgooyx'
city = '上海市'
def getpois(address):
    
    getpois_api = 'http://api.map.baidu.com/geocoder/v2/?address={0}&output={1}&ak={2}&city={3}'.format(address,output_type,ak,city)
    info_dict = requests.get(getpois_api).json()
    
    precise = info_dict['result']['precise']
    confidence = info_dict['result']['confidence']
    comprehension = info_dict['result']['comprehension']
    level = info_dict['result']['level']
    lng = info_dict['result']['location']['lng']
    lat = info_dict['result']['location']['lat']
    return [address , lat, lng, precise, confidence, comprehension]


def df_add_poi(in_path,out_path):
    info_list = []
    old_df = pd.read_csv(path)
    for i in old_df['store_address']:
        info_list.append(getpois(address=i))
        info_df = pd.DataFrame(info_list,columns=['store_address', 'baidu_lat', 'baidu_lng','precise', 'confidence','comprehension'])
        new_df = old_df.merge(info_df,on='store_address',how='inner')
        new_df.to_csv(out_path,index=False,encoding='utf-8')
    return 

In [94]:
info_list = getpois(address='上海市杨浦区吉浦路3号')
print(info_list)

['上海市杨浦区吉浦路3号', 31.30492357706552, 121.49789529042646, 1, 80, 100]


## 返回指定线路交通态势

In [31]:
def get_load_info_rectangle(ak, left_lon, left_lat, right_lon, right_lat):
    '''
    返回矩形区域交通态势
    ak:密钥
    left_lon:矩形左下角的longitude
    left_lat: 矩形左下角的latitude
    right_lon:矩形右上角的longitude
    right_lat:矩阵右上角的latitude
    Note:矩阵对角线要小于10km
    '''
    if distance_of_two_points([left_lat, left_lon], [right_lat, right_lon]) > 10000:
        print "The diagonal of the rectangle should be less than  10km"
        return None
    url = "https://restapi.amap.com/v3/traffic/status/rectangle?rectangle=" + \
    "".join([str(left_lon), ",", str(left_lat), ";", \
             str(right_lon), ",", str(right_lat), "&key=", str(ak)])
    try:
        info = urllib.urlopen(url).read()
        return info
    except:
        return None


def get_load_info_circle(ak, center_lon, center_lat, radius):
    '''
    返回圆形区域交通态势
    ak:密钥
    center_lon:该圆形区域中心点longitude
    center_lat:该圆形区域中心的latitude
    radius:该圆形区域对应的半径
    Note:radius要小于5000米
    '''
    if radius > 5000:
        print "radius should be less than 5000"
        return None
    url = "https://restapi.amap.com/v3/traffic/status/circle?location=" + \
    "".join([str(center_lon), ",", str(center_lat), "&radius=", str(radius), "&key=", str(ak)])
    try:
        info = urllib.urlopen(url).read()
        return info
    except:
        return None


def get_load_info_road(ak, name, city=None, adcode=None):
    '''
    返回指定线路交通态势
    ak:密钥
    name:道路名
    city:城市名
    adcode:城市编码
    Note:city和adcode要填其中一个即可
    '''
    if city is None and adcode is None:
        print "please specify city or adcode"
        return None
    else:            
        if city is not None:
            url = "https://restapi.amap.com/v3/traffic/status/road?name="+\
            "".join([name,"&city=",city,"&key=",str(ak)])
        else:
            url = "https://restapi.amap.com/v3/traffic/status/road?name="+\
            "".join([name,"&adcode=",str(adcode),"&key=",str(ak)])  
        try:
            info = urllib.urlopen(url).read()
            return info    
        except:
            return None

# 爬虫类代码

## BeautifulSoup4.2.0文档练习

In [2]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc)
# 按照标准的缩进格式的结构输出
print soup.prettify()

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [5]:
# Tag
tag = soup.body
print type(tag)

# Name:每个tag都有自己的名字,通过 .name 来获取
print tag.name

<class 'bs4.element.Tag'>
body


### Attributes:一个tag可能有很多个属性

In [8]:
# tag 有一个 “class” 的属性,值为 “boldest” . tag的属性的操作方法与字典相同.
tag_p = tag.p
print tag_p['class']

#也可以直接”点”取属性, 比如: .attrs
print tag_p.attrs

#多值属性：最常见的多值的属性是 class (一个tag可以有多个CSS的class). 在Beautiful Soup中多值属性的返回类型是list。
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
print css_soup.p['class']

['title']
{'class': ['title']}
['body', 'strikeout']


In [9]:
id_soup = BeautifulSoup('<p  class="my id">ss</p>')
id_soup_1 = BeautifulSoup('<p  id="my id">ss</p>')
print id_soup.p.name
print id_soup.p.attrs
print id_soup.p['class']
print id_soup.p.string
print id_soup_1.p.attrs

p
{'class': ['my', 'id']}
['my', 'id']
ss
{'id': 'my id'}


In [11]:
# 可以遍历的字符串
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag_e = soup.b
print tag_e.string
print type(tag_e.string)

Extremely bold
<class 'bs4.element.NavigableString'>


一个 NavigableString 字符串与Python中的Unicode字符串相同,并且还支持包含在 遍历文档树 和 搜索文档树 中的一些特性. # 通过 unicode() 方法可以直接将 NavigableString 对象转换成Unicode字符串

In [13]:
unicode_str = unicode(tag_e.string)
print unicode_str

#tag中包含的字符串不能编辑,但是可以被替换成其它的字符串,用 replace_with() 方法
tag_e.string.replace_with('today is not a good day')
tag_e

Extremely bold


<b class="boldest">today is not a good day</b>

**注释及特殊字符串**

Tag , NavigableString , BeautifulSoup 几乎覆盖了html和xml中的所有内容,但是还有一些特殊对象.容易让人担心的内容是文档的注释部分

In [23]:
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup)
comment = soup.b.string
print type(comment)
print comment
print soup.b.prettify()

<class 'bs4.element.Comment'>
Hey, buddy. Want to buy a used parser?
<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>


In [24]:
soup = BeautifulSoup(html_doc)
print soup.prettify()
# tag的名字：操作文档树最简单的方法就是告诉它你想获取的tag的name.如果想获取 <head> 标签,只要用 soup.head 
print soup.head

print soup.title

#可以在文档树的tag中多次调用这个方法
print soup.body.b
print soup.b

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
<head><title>The Dormouse's story</title></head>
<title>The Dormouse's story</title>
<b>The Dormouse's story</b>
<b>The Dormouse's story</b>


从结果来看，似乎可以直接从根节点访问嵌套tag

In [25]:
# 通过点取属性的方式只能获得当前名字的第一个tag
print soup.a

# 如果想要得到所有的<a>标签,或是通过名字得到比一个tag更多的内容的时候,就需要用到 Searching the tree 中描述的方法,比如: find_all()
print soup.find_all('a')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


### contents 和 .children

In [26]:
# tag的 .contents 属性可以将tag的子节点以列表的方式输出
head_tag = soup.head
print head_tag

title_tag = head_tag.contents[0]
print title_tag
print title_tag.contents
print '--------------------'
for i in soup.body.contents:
    print i

<head><title>The Dormouse's story</title></head>
<title>The Dormouse's story</title>
[u"The Dormouse's story"]
--------------------


<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>




字符串没有 .contents 属性,因为字符串没有子节点


通过tag的 .children 生成器,可以对tag的子节点进行循环

In [27]:
for child in soup.body.children:
    print child



<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>




## 爬取[中国证监会机构部基金注册状态](http://ndes.csrc.gov.cn/alappl/home/gongshi)

In [29]:
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import sys
import re
from tqdm import tqdm
import pandas as pd

reload(sys)
sys.setdefaultencoding('utf-8')

'''发送连接网页的请求'''


def get_soup(url, timeout):

    request = urllib2.Request(url)
    response = urllib2.urlopen(url=request, timeout=timeout)
    contents = response.read()
    soup = BeautifulSoup(contents, "html.parser", from_encoding='utf-8')
    return soup

'''解析文本信息'''


def from_text_to_news(text, recode_dict):
    company = re.compile('^关于(.*?)的').findall(text)[0]
    key_new = re.compile('.*《(.*)》').findall(text)[0].split('—')
    event = key_new[0]
    if len(key_new) == 2:
        fund_name = key_new[1]
        recode_dict['基金名称'] = fund_name
    elif len(key_new) == 3 and key_new[1] == '':
        fund_name = key_new[2]
        recode_dict['基金名称'] = fund_name

    recode_dict['公司名称'] = company
    recode_dict['申请注册事项'] = event

    return recode_dict


''' 获取一页的所有信息 '''


def page_news(useful_url, page, timeout, page_news_list, recode_list):
    useful_url_page = useful_url + '&' + 'pageNo=' + str(page)
    soup_1 = get_soup(url=useful_url_page, timeout=timeout)

    for child in soup_1.find_all('div', attrs={'style': 'margin-top: -11px; margin-left: 12px;'}):
        recode_dict = {i: '' for i in recode_list}

        recode_dict['page_seq'] = page
        text = str(child.find('div', attrs={'class': 'titleshow'}).string[:])

        recode_dict = from_text_to_news(text, recode_dict=recode_dict)

        for child_0 in child.table.find_all('tr'):
            if child_0.attrs == {}:
                child_set = child_0.find_all('td')
                process_name = str([i for i in child_set[0].stripped_strings][0])
                process_time = [i for i in child_set[1].stripped_strings][0]
                recode_dict[process_name] = process_time
        page_news_list.append(recode_dict)
    return page_news_list

'''获取网站上所有页码的信息'''


def get_useful_news(url, timeout, part_name, recode_list):
    soup = get_soup(url=url, timeout=timeout)
    useful_url = soup.find('div', text=part_name).parent['href']
    # 获取总页数
    page_soup = get_soup(url=useful_url, timeout=timeout)
    page_num = int(re.findall(r"\d+\.?\d*", page_soup.find('span', attrs={'class': 'jump_text'}).string)[0])

    news_list = []
    for page in tqdm(xrange(1, page_num+1)):
        page_news_list = []
        page_news_list = page_news(useful_url=useful_url, page=page, timeout=timeout, page_news_list=page_news_list, recode_list=recode_list)
        news_list.extend(page_news_list)
    return news_list

'''初始化，全量爬虫'''


def initialization_information(recode_list, url, initialization_path):
    news_list = get_useful_news(url=url, timeout=100, part_name='机构部', recode_list=recode_list)
    news_pd = pd.DataFrame(map(lambda x: [x[i] for i in recode_list], news_list), columns=recode_list)
    news_pd.to_csv(initialization_path)
    return initialization_path

###########################################
# 主程序
# import time
# import pandas as pd
# start =time.time()
# if __name__ == '__main__':
#     recode_list = ['公司名称', '申请注册事项', '基金名称', '接收材料', '补正通知', '受理通知', '一次书面反馈', '行政许可决定书', 'page_seq']

#     url = "http://ndes.csrc.gov.cn/alappl/home/gongshi"

#     '''路径存放关键字为当前日期'''
#     initialization_path = 'news_' + time.strftime("%Y-%d-%m") + '.csv'
#     '''开始全量爬虫'''
#     initialization_information(recode_list=recode_list, url=url, initialization_path=initialization_path)

#     end = time.time()
#     time_diff = (end - start)/3600
#     print('Running time: %f Hours'%time_diff)

## 爬取市政局[高架封路数据](http://www.highway.sh.cn/#/website/public/closeRoad.html)和[夜间施工数据](http://www.highway.sh.cn/#/website/public/nightRecord.html)

In [30]:
# -*- coding: utf-8 -*-
import time
import sys
import requests, json
from datetime import datetime
from dateutil.parser import parse
import pandas as pd

reload(sys)
sys.setdefaultencoding('utf-8')

'''发送连接网页的请求'''


def get_night_record_data(url, pageSize, currentPage):
    assert pageSize == 10 or pageSize == 30
    res = requests.post(url, data={"currentPage": currentPage, "pageSize": pageSize}).text
    data = json.loads(res)
    return data


def get_gaojia_fenglu_data(url, date):
    res = requests.post(url=url, data={"CLOSEDATE": date}).text
    return json.loads(res)



'''解析文本信息'''


def resolve_night_json(json_data, recode_dict):
    values_dict = {recode_dict[i]: [] for i in recode_dict}
    for slice_news in json_data['rows']:
        for j in recode_dict:
            if j != 'begin_date_time' and j != 'end_date_time':
                values_dict[recode_dict[j]].append(slice_news[j])
            elif j is 'begin_date_time':
                values_dict[recode_dict[j]].append(parse('-'.join([str(slice_news['begindate']).split(' ')[0], str(slice_news['begintime'])])))
            elif j is 'end_date_time':
                values_dict[recode_dict[j]].append(parse('-'.join([str(slice_news['enddate']).split(' ')[0], str(slice_news['endtime'])])))
    slice_df = pd.DataFrame(values_dict)
    return slice_df


def night_initialization_information(night_url, pageSize, night_recode_dict, night_path):

    json_news_len = 1
    json_news_list = []
    page_seq = 1

    while json_news_len != 0:
        print page_seq
        json_data = get_night_record_data(url=night_url, pageSize=pageSize, currentPage=page_seq)
        json_news_len = len(json_data['rows'])
        slice_df = resolve_night_json(json_data=json_data, recode_dict=night_recode_dict)
        slice_df['page_seq'] = page_seq
        json_news_list.append(slice_df)
        page_seq += 1
    night_road_df = pd.concat(json_news_list, axis=0)
    night_road_df.to_csv(night_path, index=False)
    return night_road_df


def resolve_gaojia_json(json_data, recode_dict, key_columns):
    values_dict = {recode_dict[i]: [] for i in recode_dict}
    for slice_news in json_data:
        bool_list = map(lambda x: False if x not in slice_news else (slice_news[x] in values_dict[recode_dict[x]]), key_columns)
        if not len(bool_list) == sum(bool_list):
            for j in recode_dict:
                values_dict[recode_dict[j]].append(None if j not in slice_news else slice_news[j])
    slice_df = pd.DataFrame(values_dict)
    return slice_df


def gaojia_initialization_information(gaojia_url, gaojia_recode_dict, begin_date, end_date, gaojia_path, key_columns):
    date_list = [datetime.strftime(i, '%Y-%m-%d') for i in pd.date_range(start=begin_date, end=end_date)]
    gaojia_news_list = []
    for i in date_list:
        print i
        data = get_gaojia_fenglu_data(url=gaojia_url, date=i)
        slice_df = resolve_gaojia_json(json_data=data, recode_dict=gaojia_recode_dict, key_columns=key_columns)
        gaojia_news_list.append(slice_df)
    gaojia_road_df = pd.concat(gaojia_news_list, axis=0)
    gaojia_road_df.to_csv(gaojia_path, index=False)
    return gaojia_road_df

'''
if __name__=='__main__':
    gaojia_url = "http://www.highway.sh.cn/web/gjfl/search.action"
    gaojia_recode_dict = {'CLOSE_AREA': '封路区域', 'CLOSE_CM': '方向', 'CLOSE_CONTENT': '路段区域',
                         'CLOSE_DATE': '日期', 'CLOSE_END_TIME': '起始封路时间', 'CLOSE_START_TIME': '结束封路时间',
                         'ROAD_NAME': '高架名称', 'IMAGE_URL': '示意图网址'}
    key_columns = ['CLOSE_DATE', 'CLOSE_CONTENT']
    gaojia_path = night_road_path = 'output/nightRecord_' + time.strftime("%Y%m%d") + '.csv'
    gaojia_initialization_information(gaojia_url=gaojia_url, gaojia_recode_dict=gaojia_recode_dict, begin_date='20180101',
                                      end_date=time.strftime("%Y%m%d"), gaojia_path=gaojia_path, key_columns=key_columns)
'''

In [37]:
# -*- coding: utf-8 -*-
import time
import sys
import requests, json
from datetime import datetime
from dateutil.parser import parse
import pandas as pd

reload(sys)
sys.setdefaultencoding('utf-8')

'''发送连接网页的请求'''

res = requests.post(url='http://www.ks121.com/').text

## python 画图

**循环画子图**

`import matplotlib.pyplot as plt
    for i in range(len(stationID_list)):
        if i%4 == 0:
            fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(2, 2, i%4+1)
        id_df = merge_data[merge_data['stationID'] == stationID_list[i]].reset_index()
        l = ax.plot(id_df.index, id_df['inNums_{}'.format(day1)], 'green', id_df.index, id_df['inNums_{}'.format(day2)],'red')
        plt.xlabel("time_of_id_{}".format(stationID_list[i]))
        plt.ylabel('{}Day_to_{}Day'.format(tianqi_dict[day1], tianqi_dict[day2]))
        plt.legend(handles=l, labels=[str(day1), str(day2)], loc='best')
        if i%4 == 3 or i == len(stationID_list)-1:
            fig.savefig('figure/inNumTrend_{}Day_to_{}Day_endStation{}'.format(tianqi_dict[day1], tianqi_dict[day2],i))
    plt.close()
    return 0`