# Finish the search problem

## Demonstration of search agent

In [94]:
search('沙河高教园站','芍药居站')

沙河高教园站（北京地铁昌平线）
沙河站
巩华城站
朱辛庄站（换乘：北京地铁8号线）
育知路站
平西府站
回龙观东大街站
霍营站（换乘：北京地铁13号线）
立水桥站
北苑站
望京西站
芍药居站


In [1]:
import re
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import time
import random
import pickle
import sys
import networkx as nx

In [2]:
headers = {"User-Agent" : 
           "User-Agent:Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
idx = 'https://baike.baidu.com'

In [3]:
# url of all lines
path = idx + '/item/北京地铁/408485'
r = requests.get(path, 
                 allow_redirects=False,headers=headers,verify=False)
r.encoding='utf-8'
line_soup = BeautifulSoup(r.text)
lines = line_soup.find_all(href = re.compile("item"), text=re.compile("北京地铁"))
line_dict = {}
for line in lines:
    line_dict[line.string] = line['href']

In [11]:
#url for stations of each line
station_dict = {}
for line_name in line_dict:
    path = idx + line_dict[line_name]
    r = requests.get(path, 
                 allow_redirects=False,headers=headers,verify=False)
    r.encoding='utf-8'
    station_soup = BeautifulSoup(r.text)
    stations = station_soup.find_all(href = re.compile("item"), text=re.compile("站"))
    station_dict[line_name] = [[station.text, station['href']] for station in stations]

In [12]:
station_dict

{'北京地铁1号线': [['古城站', '/item/%E5%8F%A4%E5%9F%8E%E7%AB%99/2662893'],
  ['北京站', '/item/%E5%8C%97%E4%BA%AC%E7%AB%99/3021116'],
  ['古城站', '/item/%E5%8F%A4%E5%9F%8E%E7%AB%99/2662893'],
  ['苹果园站', '/item/%E8%8B%B9%E6%9E%9C%E5%9B%AD%E7%AB%99/6486355'],
  ['苹果园站', '/item/%E8%8B%B9%E6%9E%9C%E5%9B%AD%E7%AB%99/6486355'],
  ['复兴门站', '/item/%E5%A4%8D%E5%85%B4%E9%97%A8%E7%AB%99/2371198'],
  ['西单站', '/item/%E8%A5%BF%E5%8D%95%E7%AB%99/2482263'],
  ['天安门西站', '/item/%E5%A4%A9%E5%AE%89%E9%97%A8%E8%A5%BF%E7%AB%99/1230082'],
  ['福寿岭站', '/item/%E7%A6%8F%E5%AF%BF%E5%B2%AD%E7%AB%99/1827061'],
  ['四惠东站', '/item/%E5%9B%9B%E6%83%A0%E4%B8%9C%E7%AB%99/1229653'],
  ['北京站', '/item/%E5%8C%97%E4%BA%AC%E7%AB%99/18844'],
  ['北京站', '/item/%E5%8C%97%E4%BA%AC%E7%AB%99/3021116'],
  ['南礼士路站', '/item/%E5%8D%97%E7%A4%BC%E5%A3%AB%E8%B7%AF%E7%AB%99'],
  ['复兴门站', '/item/%E5%A4%8D%E5%85%B4%E9%97%A8%E7%AB%99/2371198'],
  ['长椿街站', '/item/%E9%95%BF%E6%A4%BF%E8%A1%97%E7%AB%99'],
  ['长椿街站', '/item/%E9%95%BF%E6%A4%BF%E8%A1%97%E7%AB%99'],

In [13]:
def get_response(url):#get response from a url
    r = requests.get(url,headers=headers,verify=False)
    r.encoding='utf-8'
    if r.status_code is not 200:
        for i in range(3):#访问失败时重试3次
            time.sleep(0.5)
            r = get_response(url)
            if r.status_code == 200:
                break
    return r

In [14]:
def get_connections(station_url):#get connections of a station
    r = get_response(station_url)
    if r.status_code is not 200: 
        print(r.status_code, station_url)
        return '无法访问：' + station_url, []
    soup = BeautifulSoup(r.text)
    print(r.status_code, soup.h1, station_url)
    connections = []
    if soup.h1:
        current_station = soup.h1.text
        for result in soup.find_all(text = re.compile("(^\w{1,8}站$)|(站）$)")):
            result_clean = re.findall(r"(\w+站)）", result)
            if re.search(r"(下一站)|(下站)", str(result.parent.parent)) and result is not current_station:
                if result.parent.has_attr('href'):
                    connections.append([re.findall(r"(\w+站)）", result) if re.findall(r"(\w+站)）", result) else result
                                       , result.parent['href']])
                else:
                    connections.append([re.findall(r"(\w+站)）", result) if re.findall(r"(\w+站)）", result) else result
                                        , ''])
        return current_station, connections
    else:
        connections.append(['非地铁站', ''])
        return '非地铁站', connections

In [15]:
def station_crawler(idx, start_path, seen = set()):# crawl Beijing railway stations and their connections starting with one of them
    
    station_connections = {}
    visited = [start_path]

    while visited:
        time.sleep(0.5)
        current_station = 'default'
        connections = []
        current_url = idx + visited.pop()
        if current_url in seen: continue
        current_station, connections = get_connections(current_url)
        station_connections[current_station] = connections
        for connection in connections:
            if connection[1] in seen: continue
            visited = [connection[1]] + visited
        seen.add(current_url)
    return station_connections

In [18]:
# 从各条线路爬下来的所有线路页面取得所有（有自己页面的）地铁站
# 包括地铁站的path、所属的line（可能多个）
# '复兴门站': {'path': '/item/%E5%A4%8D%E5%85%B4%E9%97%A8%E7%AB%99/2371198','line': ['北京地铁1号线', '北京地铁2号线']}

all_stations = dict()
for line_name in station_dict:
    stations = station_dict[line_name]
    for station in stations:
        if station[0] in all_stations.keys():
            if line_name not in all_stations[station[0]]['line']:
                all_stations[station[0]]['line'].append(line_name)
        else:
            all_stations[station[0]] = {'path': station[1] , 'line': [line_name]}

In [19]:
# 借助总的地铁站字典，接下来尝试从不同起点开始爬（防止有的地铁站的页面里没有正确的上一站/下一站）
station_connections = {}
seen = set()
unvisited_stations = set(all_stations.keys())

In [23]:
# 直到station_connections里边有了所有all_stations中的地铁站
while len(unvisited_stations) is not 0:
    start_path = all_stations[random.choice(list(unvisited_stations))]['path']
    station_connections = dict(station_connections, **station_crawler(idx, start_path, seen))
    unvisited_stations = set(all_stations.keys())-set(station_connections.keys())
    for key in station_connections:
        for connection in station_connections[key]:
            seen.add(connection[1])
    print('----------num of unvisited stations: '+ str(len(unvisited_stations)) + '----------')

200 <h1>达官营站</h1> https://baike.baidu.com/item/%E8%BE%BE%E5%AE%98%E8%90%A5%E7%AB%99
200 <h1>湾子站</h1> https://baike.baidu.com/item/%E6%B9%BE%E5%AD%90%E7%AB%99
200 <h1>广安门内站</h1> https://baike.baidu.com/item/%E5%B9%BF%E5%AE%89%E9%97%A8%E5%86%85%E7%AB%99
200 <h1>红莲南路站</h1> https://baike.baidu.com/item/%E7%BA%A2%E8%8E%B2%E5%8D%97%E8%B7%AF%E7%AB%99/18521577
200 <h1>木樨地站</h1> https://baike.baidu.com/item/%E6%9C%A8%E6%A8%A8%E5%9C%B0%E7%AB%99
200 <h1>北京西站</h1> https://baike.baidu.com/item/%E5%8C%97%E4%BA%AC%E8%A5%BF%E7%AB%99/17646705
200 <h1>菜市口站</h1> https://baike.baidu.com/item/%E8%8F%9C%E5%B8%82%E5%8F%A3%E7%AB%99
200 <h1>南礼士路站</h1> https://baike.baidu.com/item/%E5%8D%97%E7%A4%BC%E5%A3%AB%E8%B7%AF%E7%AB%99
200 <h1>玉渊潭东门站</h1> https://baike.baidu.com/item/%E7%8E%89%E6%B8%8A%E6%BD%AD%E4%B8%9C%E9%97%A8%E7%AB%99


KeyboardInterrupt: 

In [68]:
unvisited_stations #经人肉验证 剩下的这些不需要爬 爬取完成

{'中国美术馆站、珠市口站-瀛海站',
 '京张高铁（出站）',
 '北京市郊铁路S2线（出站）',
 '岛式站台',
 '清华东站',
 '西客站',
 '金鱼胡同站-前门站'}

In [108]:
# 清洗一下，且去除URL
station_connections_clean = {}
for key in station_connections:
    connections = station_connections[key]
    if key in all_stations.keys() and connections: # 去除错误访问加进来的地铁站、无连接（没开通）的地铁站
        station_connections_clean[key] = []
        for connection in connections:
            if type(connection[0]) is list: connection = connection[0]
            if connection[0] != key and connection[0] in all_stations.keys(): # 去除误加进来的地铁站本身
                station_connections_clean[key].append(connection[0])

In [112]:
station_connections_clean

{'良乡大学城站': ['良乡大学城北站', '良乡大学城西站'],
 '良乡大学城北站': ['广阳城站', '良乡大学城站'],
 '良乡大学城西站': ['良乡大学城站', '良乡南关站'],
 '广阳城站': ['篱笆房站', '良乡大学城北站'],
 '良乡南关站': ['良乡大学城西站', '苏庄站'],
 '篱笆房站': ['长阳站', '广阳城站'],
 '长阳站': ['稻田站', '篱笆房站'],
 '稻田站': ['大葆台站'],
 '大葆台站': ['郭公庄站', '稻田站'],
 '丽泽商务区站': ['终点站', '草桥站'],
 '顾册站': ['老城区站'],
 '瀛海站': ['德茂站'],
 '德茂站': ['五福堂站', '瀛海站'],
 '五福堂站': ['火箭万源站', '德茂站'],
 '火箭万源站': ['东高地站', '五福堂站'],
 '东高地站': ['和义站', '火箭万源站'],
 '和义站': ['珠市口站', '大红门南站', '瀛海站', '东高地站', '大红门南站'],
 '珠市口站': ['虎坊桥站', '桥湾站', '天桥站', '前门站'],
 '大红门南站': ['大红门站', '和义站', '大红门站', '和义站'],
 '虎坊桥站': ['菜市口站', '珠市口站'],
 '桥湾站': ['珠市口站', '磁器口站'],
 '天桥站': ['永定门外站', '珠市口站'],
 '前门站': ['和平门站', '崇文门站', '珠市口站', '王府井站'],
 '六营门站': ['五福堂站'],
 '菜市口站': ['陶然亭站', '宣武门站', '广安门内站', '虎坊桥站'],
 '永定门外站': ['瀛海站', '木樨园站', '陶然桥站'],
 '和平门站': ['宣武门站', '前门站'],
 '崇文门站': ['北京站', '前门站', '前门站', '北京站', '磁器口站', '东单站'],
 '王府井站': ['天安门东站', '东单站', '前门站', '金鱼胡同站'],
 '陶然亭站': ['北京南站', '菜市口站'],
 '宣武门站': ['长椿街站', '和平门站', '菜市口站', '西单站'],
 '广安门内站': ['达官营站', '菜市口站'],
 '木

In [111]:
# 检查缺失connection的车站
for key in station_connections_clean.keys():
    if not station_connections_clean[key]:
        print(key)

草桥站
小马庄站
云景东路站
黄厂村站
万盛南街西口站
侧式站台
菜户营站
老城区站
黑庄户站


In [45]:
# 没办法了，手动提取一下……缺失原因是“上一站”/“下一站”的格式与其他页面不同，提取失败
station_connections_clean['草桥站'] = ['纪家庙站','角门西站','景风门站','新发地站','丽泽商务区站']
station_connections_clean['小马庄站'] = ['高楼金站','云景东路站']
station_connections_clean['云景东路站'] = ['小马庄站','万盛南街西口站']
station_connections_clean['万盛南街西口站'] = ['小马庄站','黑庄户站']
station_connections_clean['黑庄户站'] = ['万盛南街西口站','大稿站','郎辛庄站']
station_connections_clean['菜户营站'] = ['西铁营站','丽泽商务区站']
station_connections_clean['老城区站'] = ['顾册站','饶乐府站']
station_connections_clean['黄厂村站'] = ['豆各庄站','焦化厂站']
# 16号线部分站未开通，没有前后连接
station_connections_clean['苏州桥站'] = ['万寿寺站','苏州街站']
station_connections_clean['万寿寺站'] = ['国家图书馆站','苏州桥站']
station_connections_clean['万泉河桥站'] = ['苏州街站','西苑站']
# 12号线部分也未开通，没有前后连接
station_connections_clean['蓟门桥站'] = ['大钟寺站','北太平庄站']
# 歧义词条进入后需要二次跳转，这个逻辑我没有写
station_connections_clean['西土城站'] = ['牡丹园站','蓟门桥站','知春路站']
# 不知道为啥丢了
station_connections_clean['北京西站'] = ['六里桥东站','军事博物馆站','湾子站']


In [59]:
from collections import defaultdict
scc = defaultdict(list)
for key in station_connections_clean.keys():
    if type(station_connections_clean[key]) is None:
        print (key)

In [67]:
with open ('./station_connections_clean.txt' , 'w') as f:
    f.write(str(station_connections_clean))
with open ('./station_line_paths.txt' , 'w') as f:
    f.write(str(all_stations))

In [65]:
# 默认的递归深度只有1000，无法保存文件，先改大
sys.setrecursionlimit(2000000)
# 保存成果……
with open ('./station_connections_clean' , 'wb') as f:
    pickle.dump(station_connections_clean, f)

with open ('./station_line_paths' , 'wb') as f:
    pickle.dump(all_stations, f)

D2: <dict object at 0x7fc5d0744360>
T4: <class 'bs4.element.NavigableString'>
# T4
D2: <dict object at 0x7fc5d071c630>
T4: <class 'bs4.element.Tag'>
# T4
D2: <dict object at 0x7fc5d0894b88>
T4: <class 'bs4.BeautifulSoup'>
# T4
T1: <class 'set'>
F2: <function _load_type at 0x7fc5d268b0d0>
# F2
# T1
D2: <dict object at 0x7fc5d071c678>
# D2
D2: <dict object at 0x7fc5d0894b40>
D2: <dict object at 0x7fc5d071c1b0>
# D2
D2: <dict object at 0x7fc5d071c5a0>
# D2
D2: <dict object at 0x7fc5d07304c8>
D2: <dict object at 0x7fc5d0730828>
D2: <dict object at 0x7fc5d0894af8>
D2: <dict object at 0x7fc5d07306c0>
# D2
D2: <dict object at 0x7fc5d0744a20>
D2: <dict object at 0x7fc5d073f7e0>
# D2
D2: <dict object at 0x7fc5d073f828>
D2: <dict object at 0x7fc5d073f9d8>
D2: <dict object at 0x7fc5d073f870>
# D2
D2: <dict object at 0x7fc5d073f948>
D2: <dict object at 0x7fc5d073f900>
# D2
# D2
# D2
# D2
# D2
D2: <dict object at 0x7fc5d0894a68>
D2: <dict object at 0x7fc5d0744ab0>
# D2
D2: <dict object at 0x7fc5d07

D2: <dict object at 0x7fc5d07afca8>
D2: <dict object at 0x7fc5f0a0c4c8>
# D2
D2: <dict object at 0x7fc5f0a0cca8>
D2: <dict object at 0x7fc5d07afc60>
D2: <dict object at 0x7fc5d071cf78>
# D2
D2: <dict object at 0x7fc5d073fea0>
D2: <dict object at 0x7fc5d07afc18>
D2: <dict object at 0x7fc5d0744090>
# D2
D2: <dict object at 0x7fc5f0bf5ea0>
D2: <dict object at 0x7fc5d075bb88>
D2: <dict object at 0x7fc5d075bc18>
D2: <dict object at 0x7fc5d07afbd0>
D2: <dict object at 0x7fc5d075bcf0>
# D2
D2: <dict object at 0x7fc5d075bd38>
D2: <dict object at 0x7fc5d07afb88>
D2: <dict object at 0x7fc5d075be10>
# D2
D2: <dict object at 0x7fc5d075be58>
D2: <dict object at 0x7fc5d07afb40>
D2: <dict object at 0x7fc5d075bf30>
# D2
D2: <dict object at 0x7fc5d075bf78>
D2: <dict object at 0x7fc5d07afaf8>
D2: <dict object at 0x7fc5d0760090>
# D2
D2: <dict object at 0x7fc5d0760120>
D2: <dict object at 0x7fc5d07afab0>
D2: <dict object at 0x7fc5d07601f8>
# D2
D2: <dict object at 0x7fc5d0760288>
D2: <dict object at 0x7f

# D2
D2: <dict object at 0x7fc5d07768b8>
D2: <dict object at 0x7fc5d0776948>
D2: <dict object at 0x7fc5d07acb88>
D2: <dict object at 0x7fc5d0776a20>
# D2
D2: <dict object at 0x7fc5d0776a68>
D2: <dict object at 0x7fc5d0776af8>
D2: <dict object at 0x7fc5d07acb40>
D2: <dict object at 0x7fc5d0776bd0>
# D2
D2: <dict object at 0x7fc5d0776c18>
D2: <dict object at 0x7fc5d0776ca8>
D2: <dict object at 0x7fc5d07acaf8>
D2: <dict object at 0x7fc5d0776d80>
# D2
D2: <dict object at 0x7fc5d0776dc8>
D2: <dict object at 0x7fc5d0776e58>
D2: <dict object at 0x7fc5d07acab0>
D2: <dict object at 0x7fc5d0776f30>
# D2
D2: <dict object at 0x7fc5d0776f78>
D2: <dict object at 0x7fc5d077c048>
D2: <dict object at 0x7fc5d077c0d8>
D2: <dict object at 0x7fc5d07aca68>
D2: <dict object at 0x7fc5d077c1b0>
# D2
D2: <dict object at 0x7fc5d077c240>
D2: <dict object at 0x7fc5d07aca20>
D2: <dict object at 0x7fc5d077c318>
# D2
D2: <dict object at 0x7fc5d077c360>
D2: <dict object at 0x7fc5d07ac9d8>
D2: <dict object at 0x7fc5d07

D2: <dict object at 0x7fc5d07ab948>
D2: <dict object at 0x7fc5d0793360>
# D2
D2: <dict object at 0x7fc5d07933a8>
D2: <dict object at 0x7fc5d07ab900>
D2: <dict object at 0x7fc5d0793480>
# D2
D2: <dict object at 0x7fc5d07934c8>
D2: <dict object at 0x7fc5d0793558>
D2: <dict object at 0x7fc5d07ab8b8>
D2: <dict object at 0x7fc5d0793630>
# D2
D2: <dict object at 0x7fc5d0793678>
D2: <dict object at 0x7fc5d0793708>
D2: <dict object at 0x7fc5d07ab870>
D2: <dict object at 0x7fc5d07937e0>
# D2
D2: <dict object at 0x7fc5d0793828>
D2: <dict object at 0x7fc5d07ab828>
D2: <dict object at 0x7fc5d0793900>
# D2
D2: <dict object at 0x7fc5d0793948>
D2: <dict object at 0x7fc5d07ab7e0>
D2: <dict object at 0x7fc5d0793a20>
# D2
D2: <dict object at 0x7fc5d0793a68>
D2: <dict object at 0x7fc5d07ab798>
D2: <dict object at 0x7fc5d0793b40>
# D2
D2: <dict object at 0x7fc5d0793b88>
D2: <dict object at 0x7fc5d07ab750>
D2: <dict object at 0x7fc5d0793c60>
# D2
D2: <dict object at 0x7fc5d0793ca8>
D2: <dict object at 0x7f

# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
# D2
D2: <dict object at 0x7fc5d0893ea0>
D2: <dict object at 0x7fc5d07b31f8>
# D2
D2: <dict object at 0x7fc5d07b3240>
D2: <dict object at 0x7fc5d0893e58>
D2: <dict object at 0x7fc5d07b3318>
# D2
D2: <

# D2
D2: <dict object at 0x7fc5d07c34c8>
D2: <dict object at 0x7fc5d0892e58>
D2: <dict object at 0x7fc5d07c35a0>
# D2
D2: <dict object at 0x7fc5d07c35e8>
D2: <dict object at 0x7fc5d0892e10>
D2: <dict object at 0x7fc5d07c36c0>
# D2
D2: <dict object at 0x7fc5d07c3708>
D2: <dict object at 0x7fc5d07c3798>
D2: <dict object at 0x7fc5d0892dc8>
D2: <dict object at 0x7fc5d07c3870>
# D2
D2: <dict object at 0x7fc5d07c38b8>
D2: <dict object at 0x7fc5d0892d80>
D2: <dict object at 0x7fc5d07c3990>
# D2
D2: <dict object at 0x7fc5d0892d38>
D2: <dict object at 0x7fc5d07c3a20>
# D2
D2: <dict object at 0x7fc5d07c3a68>
D2: <dict object at 0x7fc5d07c3af8>
D2: <dict object at 0x7fc5d07c3b88>
D2: <dict object at 0x7fc5d07c3c18>
D2: <dict object at 0x7fc5d07c3ca8>
D2: <dict object at 0x7fc5d07c3d38>
D2: <dict object at 0x7fc5d0892cf0>
D2: <dict object at 0x7fc5d07c3e10>
# D2
D2: <dict object at 0x7fc5d07c3ea0>
D2: <dict object at 0x7fc5d0892ca8>
D2: <dict object at 0x7fc5d07c3f78>
# D2
D2: <dict object at 0x7f

# D2
D2: <dict object at 0x7fc5d0890dc8>
D2: <dict object at 0x7fc5d07de168>
# D2
D2: <dict object at 0x7fc5d07de1b0>
D2: <dict object at 0x7fc5d07de3f0>
D2: <dict object at 0x7fc5d07de288>
# D2
D2: <dict object at 0x7fc5d07de2d0>
D2: <dict object at 0x7fc5d07de360>
# D2
# D2
# D2
# D2
D2: <dict object at 0x7fc5d07de438>
D2: <dict object at 0x7fc5d0890d80>
D2: <dict object at 0x7fc5d07de510>
# D2
D2: <dict object at 0x7fc5d07de558>
D2: <dict object at 0x7fc5d0890d38>
D2: <dict object at 0x7fc5d07de630>
# D2
D2: <dict object at 0x7fc5d07de6c0>
D2: <dict object at 0x7fc5d0890cf0>
D2: <dict object at 0x7fc5d07de798>
# D2
D2: <dict object at 0x7fc5d07de7e0>
D2: <dict object at 0x7fc5d0890ca8>
D2: <dict object at 0x7fc5d07de8b8>
# D2
D2: <dict object at 0x7fc5d07de948>
D2: <dict object at 0x7fc5d07de9d8>
D2: <dict object at 0x7fc5d0890c60>
D2: <dict object at 0x7fc5d07deab0>
# D2
D2: <dict object at 0x7fc5d07deaf8>
D2: <dict object at 0x7fc5d07deb88>
D2: <dict object at 0x7fc5d0890c18>
D2: 

# D2
D2: <dict object at 0x7fc5d07f91f8>
D2: <dict object at 0x7fc5d08365e8>
D2: <dict object at 0x7fc5d07f92d0>
# D2
D2: <dict object at 0x7fc5d07f9318>
D2: <dict object at 0x7fc5d08365a0>
D2: <dict object at 0x7fc5d07f93f0>
# D2
D2: <dict object at 0x7fc5d07f9438>
D2: <dict object at 0x7fc5d0836558>
D2: <dict object at 0x7fc5d07f9510>
# D2
D2: <dict object at 0x7fc5d07f95a0>
D2: <dict object at 0x7fc5d0836510>
D2: <dict object at 0x7fc5d07f9678>
# D2
D2: <dict object at 0x7fc5d07f96c0>
D2: <dict object at 0x7fc5d08364c8>
D2: <dict object at 0x7fc5d07f9798>
# D2
D2: <dict object at 0x7fc5d0836480>
D2: <dict object at 0x7fc5d07f9828>
# D2
D2: <dict object at 0x7fc5d07f9870>
D2: <dict object at 0x7fc5d07f9900>
D2: <dict object at 0x7fc5d0836438>
D2: <dict object at 0x7fc5d07f99d8>
# D2
D2: <dict object at 0x7fc5d07f9a20>
D2: <dict object at 0x7fc5d08363f0>
D2: <dict object at 0x7fc5d07f9af8>
# D2
D2: <dict object at 0x7fc5d08363a8>
D2: <dict object at 0x7fc5d07f9b88>
# D2
D2: <dict obje

D2: <dict object at 0x7fc5d0828678>
D2: <dict object at 0x7fc5d0828708>
D2: <dict object at 0x7fc5d08351b0>
D2: <dict object at 0x7fc5d08287e0>
# D2
D2: <dict object at 0x7fc5d0828828>
D2: <dict object at 0x7fc5d0835168>
D2: <dict object at 0x7fc5d0828900>
# D2
D2: <dict object at 0x7fc5d0835120>
D2: <dict object at 0x7fc5d0828990>
# D2
D2: <dict object at 0x7fc5d08289d8>
D2: <dict object at 0x7fc5d0828a68>
D2: <dict object at 0x7fc5d08350d8>
D2: <dict object at 0x7fc5d0828b40>
# D2
D2: <dict object at 0x7fc5d0828b88>
D2: <dict object at 0x7fc5d0835090>
D2: <dict object at 0x7fc5d0828c60>
# D2
D2: <dict object at 0x7fc5d0835048>
D2: <dict object at 0x7fc5d0828cf0>
# D2
D2: <dict object at 0x7fc5d0828d38>
D2: <dict object at 0x7fc5d0828dc8>
D2: <dict object at 0x7fc5d0830fc0>
D2: <dict object at 0x7fc5d0828ea0>
# D2
D2: <dict object at 0x7fc5d0830f78>
D2: <dict object at 0x7fc5d0828f30>
# D2
D2: <dict object at 0x7fc5d0828f78>
D2: <dict object at 0x7fc5d082e048>
D2: <dict object at 0x7f

AttributeError: 'NoneType' object has no attribute 'picklable'

In [49]:
all_stations

{'古城站': {'path': '/item/%E5%8F%A4%E5%9F%8E%E7%AB%99/2662893',
  'line': ['北京地铁1号线']},
 '北京站': {'path': '/item/%E5%8C%97%E4%BA%AC%E7%AB%99/3021116',
  'line': ['北京地铁1号线', '北京地铁14号线', '北京地铁2号线']},
 '苹果园站': {'path': '/item/%E8%8B%B9%E6%9E%9C%E5%9B%AD%E7%AB%99/6486355',
  'line': ['北京地铁1号线', '北京地铁6号线', '北京地铁S1线']},
 '复兴门站': {'path': '/item/%E5%A4%8D%E5%85%B4%E9%97%A8%E7%AB%99/2371198',
  'line': ['北京地铁1号线', '北京地铁2号线']},
 '西单站': {'path': '/item/%E8%A5%BF%E5%8D%95%E7%AB%99/2482263',
  'line': ['北京地铁1号线', '北京地铁4号线', '北京地铁大兴线']},
 '天安门西站': {'path': '/item/%E5%A4%A9%E5%AE%89%E9%97%A8%E8%A5%BF%E7%AB%99/1230082',
  'line': ['北京地铁1号线']},
 '福寿岭站': {'path': '/item/%E7%A6%8F%E5%AF%BF%E5%B2%AD%E7%AB%99/1827061',
  'line': ['北京地铁1号线']},
 '四惠东站': {'path': '/item/%E5%9B%9B%E6%83%A0%E4%B8%9C%E7%AB%99/1229653',
  'line': ['北京地铁1号线', '北京地铁八通线']},
 '南礼士路站': {'path': '/item/%E5%8D%97%E7%A4%BC%E5%A3%AB%E8%B7%AF%E7%AB%99',
  'line': ['北京地铁1号线', '北京地铁2号线']},
 '长椿街站': {'path': '/item/%E9%95%BF%E6%A4%BF%E8%A1%97%E

In [2]:
# 2,000 years later……
# 读取之前的结果……
with open ('./station_connections_clean' , 'rb') as f:
    station_connections_clean = pickle.load(f)

with open ('./station_line_paths' , 'rb') as f:
    all_stations = pickle.load(f)

In [69]:
from collections import defaultdict
scc_default = defaultdict(list)
for key in station_connections_clean.keys():
    scc_default[key] = station_connections_clean[key]

In [95]:
def search(start, end, station_connections = scc_default, all_stations = all_stations): 
    # 用bfs产出路径表
    paths = [[start]]
    route = []
    found = False
    while not found:
        path = paths.pop(0)
        frontier = path[-1]
        for successor in station_connections[frontier]:
            if successor in path: continue #不往回走
            new_path = path + [successor]
            if successor == end: 
                route = new_path
                found = True
                break
            paths.append(new_path)
    # 根据路径和地铁站所属的线，得出是否要换乘以及换乘详情（假设前后两站地铁属于且仅属于同一条线，不会同属于两条不同的线）
    common_lines = list(set(all_stations[route[0]]['line']) & set(all_stations[route[1]]['line']))
    is_transfer = [False for i in range(len(route))]


    for i, station in enumerate(route):
        if i is not 0:
            common_line = set(all_stations[route[i]]['line']) & set(all_stations[route[i-1]]['line'])
            common_lines.append('.'.join(common_line))
            if common_lines[i-1] != '.'.join(common_line):

                is_transfer[i-1] = '换乘：' + '.'.join(common_line)
                common_lines[i-1] = '.'.join(common_line)

    print(route[0] + '（' + common_lines[0] + '）')
    for i, info in enumerate(zip(route, common_lines, is_transfer)):
        if i is not 0:
            if info[2]:
                print(info[0] + '（' + info[2] + '）')
            else:
                print(info[0])

In [96]:
search('沙河高教园站','望京站')

沙河高教园站（北京地铁昌平线）
沙河站
巩华城站
朱辛庄站（换乘：北京地铁8号线）
育知路站
平西府站
回龙观东大街站
霍营站（换乘：北京地铁13号线）
立水桥站
北苑站
望京西站（换乘：北京地铁15号线）
望京站
