from urllib.request import urlopen  # Python 3; the original Python 2 code used urllib2
from bs4 import BeautifulSoup
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/ width=1000 height=2000></iframe>')
url = "http://www.hprc.org.cn/wxzl/wxysl/lczf/" #这是网址的前半部分
content = urllib2.urlopen(url).read().decode('gb18030') #将网站保存为中文文本!!!
soup = BeautifulSoup(content, 'html.parser') #使用html解析
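# Aside (a sketch, not part of the original tutorial): the same fetch with the
# third-party requests library, assuming it is installed. Setting .encoding
# explicitly plays the role of decode('gb18030') above.
import requests
page = requests.get(url)
page.encoding = 'gb18030'
soup = BeautifulSoup(page.text, 'html.parser')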
links = soup.select('td.bl > a')  # every report link sits in a td.bl table cell
print(links[0]['href'])
print(len(links))
print(links[0])
print(links[0]['href'].split('./'))
print(links[0]['href'].split('./')[1])
print(url + links[0]['href'].split('./')[1])
hyperlinks = [url + i['href'].split('./')[1] for i in links]  # full URLs of all the reports
hyperlinks[:5]
hyperlinks[9]
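# A more robust way to build the same list (a sketch, not in the original
# tutorial): urllib.parse.urljoin resolves the relative './...' hrefs against
# the base URL without any string surgery, and gives an identical result here.
from urllib.parse import urljoin
hyperlinks_alt = [urljoin(url, i['href']) for i in links]
hyperlinks_alt[:5]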
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html \
width=1000 height=500></iframe>')
# The 2007 report is split across several pages
url_i = 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'
content = urlopen(url_i).read().decode('gb18030')
soup = BeautifulSoup(content, 'html.parser')
scripts = soup.select('td script')[0]  # the pagination info lives in a script tag
scripts
print(scripts.text)
countPage = int(scripts.text.split('countPage = ')[1].split('//')[0])  # extract the page count from the inline JavaScript
countPage
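# A sturdier extraction (a sketch, not in the original): a regular expression
# tolerates whitespace variations around the '=' that the split() chain would miss.
import re
countPage = int(re.search(r'countPage\s*=\s*(\d+)', scripts.text).group(1))
countPage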
def crawler(url_i):  # wrap the steps above into a single function
    content = urlopen(url_i).read().decode('gb18030')
    soup = BeautifulSoup(content, 'html.parser')
    year = soup.find('span', {'class': 'huang16c'}).text[:4]  # the year is the first four characters of the title
    year = int(year)
    report = ''.join(s.text for s in soup('p'))
    # find the pagination info, which lives in a script tag
    scripts = soup.find_all('script')
    countPage = int(scripts[1].text.split('countPage = ')[1].split('//')[0])
    if countPage > 1:
        # the follow-on pages append _1, _2, ... to the base file name
        for i in range(1, countPage):
            url_child = url_i.split('.html')[0] + '_' + str(i) + '.html'
            content = urlopen(url_child).read().decode('gb18030')
            soup = BeautifulSoup(content, 'html.parser')
            report_child = ''.join(s.text for s in soup('p'))  # text of each sub-page
            report = report + report_child
    return year, report
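# A politeness wrapper (a sketch, not part of the original tutorial): pauses
# between requests and retries on a transient network error, so the long
# scrape below is less likely to hammer the server or die halfway through.
import time

def crawler_politely(link, wait=1.0, retries=2):
    for attempt in range(retries + 1):
        try:
            return crawler(link)  # delegate to the function defined above
        except OSError:  # urllib's URLError is a subclass of OSError
            if attempt == retries:
                raise
            time.sleep(5)  # back off before retrying
        finally:
            time.sleep(wait)  # rest between requests either way
# usage: year, report = crawler_politely(link)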
# scrape the contents of 47 years of government work reports
reports = {}
for link in hyperlinks:
    year, report = crawler(link)
    print(year)
    reports[year] = report
with open('/Users/xiha/github/cjc-gh-pages/data/gov_reports1954-2017.txt', 'wb') as f:
    for r in reports:
        line = str(r) + '\t' + reports[r].replace('\n', '\t') + '\n'
        f.write(line.encode('utf-8'))
import pandas as pd  # load the reports back as a structured table
df = pd.read_table('/Users/xiha/github/cjc-gh-pages/data/gov_reports1954-2017.txt',
                   names=['year', 'report'])
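# Quick sanity check (an assumed usage example, not in the original): sort by
# year and preview the first rows.
df = df.sort_values('year').reset_index(drop=True)
df.head()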