Data scraping:

Scraping 47 years of government work reports



In [1]:
import urllib2
from bs4 import BeautifulSoup
In [4]:
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/ width=1000 height=2000></iframe>')
Out[4]:
In [9]:
url = "http://www.hprc.org.cn/wxzl/wxysl/lczf/" # index page that lists all the reports
content = urllib2.urlopen(url).read().decode('gb18030') # decode the page as Chinese text (GB18030)
soup = BeautifulSoup(content, 'html.parser') # parse the HTML

links = soup.select('td.bl > a')
print links[0]['href']
./d12qgrdzfbg/201703/t20170317_389845.html
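The notebook runs under Python 2 (urllib2, print statements). On Python 3 the same fetch works with the standard library's urllib.request; a minimal equivalent sketch:

from urllib.request import urlopen
from bs4 import BeautifulSoup

content = urlopen("http://www.hprc.org.cn/wxzl/wxysl/lczf/").read().decode('gb18030')
soup = BeautifulSoup(content, 'html.parser')
links = soup.select('td.bl > a')
print(links[0]['href'])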
In [10]:
print len(links)
48
In [11]:
print links[0]
<a href="./d12qgrdzfbg/201703/t20170317_389845.html" target="_blank" title="2017年政府工作报告">2017年政府工作报告</a>
In [12]:
print links[0]['href'].split('./')
[u'', u'd12qgrdzfbg/201703/t20170317_389845.html']
In [13]:
print links[0]['href'].split('./')[1]
d12qgrdzfbg/201703/t20170317_389845.html
In [14]:
print url + links[0]['href'].split('./')[1]
http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201703/t20170317_389845.html
In [15]:
hyperlinks = [url + i['href'].split('./')[1] for i in links] # full URLs for all the reports
hyperlinks[:5]
Out[15]:
[u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201703/t20170317_389845.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201603/t20160318_369509.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201503/t20150318_319434.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201403/t20140315_270863.html',
 u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201402/t20140214_266528.html']
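Instead of splitting the relative paths on './', urljoin can resolve them against the base URL directly. A small alternative sketch (urlparse is Python 2's module; in Python 3 it lives in urllib.parse):

from urlparse import urljoin
hyperlinks = [urljoin(url, i['href']) for i in links]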
In [16]:
hyperlinks[9] 
Out[16]:
u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/200908/t20090817_27495.html'
In [17]:
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html \
width=1000 height=500></iframe>')
# the 2007 report is split across multiple pages
Out[17]:
In [21]:
url_i = 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'
content = urllib2.urlopen(url_i).read().decode('gb18030')  
soup = BeautifulSoup(content, 'html.parser')
scripts = soup.select('td script')[0] # the <script> that builds the pagination links
scripts
Out[21]:
<script>\n\tvar currentPage = 0;//\u6240\u5728\u9875\u4ece0\u5f00\u59cb\n\tvar prevPage = currentPage-1//\u4e0a\u4e00\u9875\n\tvar \u4e0b\u4e00\u9875Page = currentPage+1//\u4e0b\u4e00\u9875\n\tvar countPage = 4//\u5171\u591a\u5c11\u9875\n\t//document.write("\u5171"+countPage+"\u9875&nbsp;&nbsp;");\n\t\n\t//\u5faa\u73af\n\tvar num = 17;\n\tfor(var i=0+(currentPage-1-(currentPage-1)%num) ; i<=(num+(currentPage-1-(currentPage-1)%num))&&(i<countPage) ; i++){\n\t\tif(countPage >1){\n\t\t\tif(currentPage==i)\n\t\t\t\tdocument.write("\u3010<span style=\\"color:#FF0000;\\" class=\\"hui14_30_h\\">"+(i+1)+"</span>\u3011&nbsp;");\n\t\t\telse if(i==0)\n\t\t\t\tdocument.write("<a href=\\"t20090818_27775.html\\" class=\\"hui14_30_h\\">\u3010"+(i+1)+"\u3011</a>&nbsp;");\n\t\t\telse\n\t\t\t\tdocument.write("<a href=\\"t20090818_27775"+"_" + i + "."+"html\\" class=\\"hui14_30_h\\">\u3010"+(i+1)+"\u3011</a>&nbsp;");\n\t\t}\t\n\t}\n\t\n\tdocument.write("<br><br>");\n\t//\u8bbe\u7f6e\u4e0a\u4e00\u9875\u4ee3\u7801\n\tif(countPage>1&&currentPage!=0&&currentPage!=1)\n\t\tdocument.write("<a href=\\"t20090818_27775"+"_" + prevPage + "."+"html\\"><span style=\\"color:#0033FF;font-weight:bold\\">\u4e0a\u4e00\u9875</span></a>&nbsp;");\n\telse if(countPage>1&&currentPage!=0&&currentPage==1)\n\t\tdocument.write("<a href=\\"t20090818_27775.html\\"><span style=\\"color:#0033FF;font-weight:bold\\">\u4e0a\u4e00\u9875</span></a>&nbsp;");\n\t//else\n\t//\tdocument.write("\u4e0a\u4e00\u9875 &nbsp;");\n\t\n\t\n\t//\u8bbe\u7f6e\u4e0b\u4e00\u9875\u4ee3\u7801 \n\tif(countPage>1&&currentPage!=(countPage-1))\n\t\tdocument.write("<a href=\\"t20090818_27775"+"_" + \u4e0b\u4e00\u9875Page + "."+"html\\" ><span style=\\"color:#0033FF;font-weight:bold\\">\u4e0b\u4e00\u9875</span></a> &nbsp;");\n\t//else\n\t//\tdocument.write("\u4e0b\u4e00\u9875 &nbsp;");\n\t\t\t\t\t \n\t</script>
In [22]:
print scripts.text
	var currentPage = 0;//所在页从0开始
	var prevPage = currentPage-1//上一页
	var 下一页Page = currentPage+1//下一页
	var countPage = 4//共多少页
	//document.write("共"+countPage+"页&nbsp;&nbsp;");
	
	//循环
	var num = 17;
	for(var i=0+(currentPage-1-(currentPage-1)%num) ; i<=(num+(currentPage-1-(currentPage-1)%num))&&(i<countPage) ; i++){
		if(countPage >1){
			if(currentPage==i)
				document.write("【<span style=\"color:#FF0000;\" class=\"hui14_30_h\">"+(i+1)+"</span>】&nbsp;");
			else if(i==0)
				document.write("<a href=\"t20090818_27775.html\" class=\"hui14_30_h\">【"+(i+1)+"】</a>&nbsp;");
			else
				document.write("<a href=\"t20090818_27775"+"_" + i + "."+"html\" class=\"hui14_30_h\">【"+(i+1)+"】</a>&nbsp;");
		}	
	}
	
	document.write("<br><br>");
	//设置上一页代码
	if(countPage>1&&currentPage!=0&&currentPage!=1)
		document.write("<a href=\"t20090818_27775"+"_" + prevPage + "."+"html\"><span style=\"color:#0033FF;font-weight:bold\">上一页</span></a>&nbsp;");
	else if(countPage>1&&currentPage!=0&&currentPage==1)
		document.write("<a href=\"t20090818_27775.html\"><span style=\"color:#0033FF;font-weight:bold\">上一页</span></a>&nbsp;");
	//else
	//	document.write("上一页 &nbsp;");
	
	
	//设置下一页代码 
	if(countPage>1&&currentPage!=(countPage-1))
		document.write("<a href=\"t20090818_27775"+"_" + 下一页Page + "."+"html\" ><span style=\"color:#0033FF;font-weight:bold\">下一页</span></a> &nbsp;");
	//else
	//	document.write("下一页 &nbsp;");
					 
	
In [23]:
countPage = int(''.join(scripts).split('countPage = ')[1].split('//')[0]) # extract the total number of pages
countPage
Out[23]:
4
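The string splitting above assumes the script always contains the literal "countPage = ". A slightly more defensive sketch using a regular expression (the pattern is an assumption about this site's script, not guaranteed):

import re
m = re.search(r'countPage\s*=\s*(\d+)', scripts.text)
countPage = int(m.group(1)) if m else 1  # fall back to a single page if the pattern is missing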
In [24]:
def crawler(url_i): # wrap the scraping logic into a function
    content = urllib2.urlopen(url_i).read().decode('gb18030')  
    soup = BeautifulSoup(content, 'html.parser') 
    year = soup.find('span', {'class': 'huang16c'}).text[:4]
    year = int(year)
    report = ''.join(s.text for s in soup('p'))
    # find the pagination info, which lives in a <script> tag
    scripts = soup.find_all('script')
    countPage = int(''.join(scripts[1]).split('countPage = ')[1].split('//')[0])
    if countPage == 1:
        pass
    else:
        for i in range(1, countPage):
            url_child = url_i.split('.html')[0] +'_'+str(i)+'.html'
            content = urllib2.urlopen(url_child).read().decode('gb18030') 
            soup = BeautifulSoup(content, 'html.parser') 
            report_child = ''.join(s.text for s in soup('p')) # text of each sub-page
            report = report + report_child
    return year, report
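crawler() will raise if the network hiccups or a page's structure differs. A hypothetical wrapper (not part of the original notebook) that pauses between requests and retries transient errors:

import time

def crawler_with_retry(url_i, retries=3, pause=2):
    # hypothetical helper: be polite to the server and retry transient failures
    for attempt in range(retries):
        try:
            time.sleep(pause)
            return crawler(url_i)
        except urllib2.URLError as e:
            print 'attempt', attempt + 1, 'failed for', url_i, e
    raise RuntimeError('could not fetch ' + url_i)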
In [25]:
# scrape the content of 47 years of government work reports
reports = {}
for link in hyperlinks:
    year, report = crawler(link)
    print year
    reports[year] = report 
2017
2016
2015
2014
2013
2012
2011
2010
2009
2008
2007
2006
2005
2004
2003
2002
2001
2000
1999
1998
1997
1996
1995
1994
1993
1992
1991
1990
1989
1988
1987
1986
1985
1984
1983
1982
1981
1980
1979
1978
1975
1964
1959
1960
1957
1956
1955
1954
In [ ]:
 
In [33]:
with open('/Users/xiha/github/cjc-gh-pages/data/gov_reports1954-2017.txt', 'wb') as f:
    for r in reports:
        line = str(r)+'\t'+reports[r].replace('\n', '\t') +'\n'
        f.write(line.encode('utf-8'))
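Iterating over a plain dict writes the reports in arbitrary key order. If a chronologically ordered file is preferred, a small variation (same format, just sorted keys):

with open('/Users/xiha/github/cjc-gh-pages/data/gov_reports1954-2017.txt', 'wb') as f:
    for year in sorted(reports):
        line = str(year) + '\t' + reports[year].replace('\n', '\t') + '\n'
        f.write(line.encode('utf-8'))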
In [34]:
import pandas as pd # load the scraped reports into a structured DataFrame
df = pd.read_table('/Users/xiha/github/cjc-gh-pages/data/\
gov_reports1954-2017.txt', names = ['year', 'report'])
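A quick sanity check on the loaded DataFrame (a usage sketch, assuming a reasonably recent pandas; column names follow the read_table call above):

print df.shape                 # one row per report year, two columns
df = df.sort_values('year')    # order the reports chronologically
df.head()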
In [ ]: