#### 2020-8-11 爬数据与入库, by DJun

In [2]:
# http客户端
import requests

# html解析器
from lxml import html

In [3]:
# Fetch the ranking page over HTTP (raw server HTML, not rendered by JavaScript).
# The timeout keeps this cell from hanging forever if the server never answers,
# and raise_for_status() stops the notebook on an HTTP error status instead of
# letting the cells below parse an error page.
req = requests.get("https://news.163.com/rank/", timeout=10)
req.raise_for_status()

In [4]:
# Set the encoding used when the response body is decoded as HTML text
req.encoding = 'gbk'

In [5]:
# Preview the first 1000 characters of the decoded response text
req.text[:1000]

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta name="model_url" content="http://news.163.com/special/0001386F/index_rank.html" />\n    <meta http-equiv="Content-Type" content="text/html; charset=gb2312" />\n    <meta http-equiv="Content-Language" content="zh-CN" />\n    <title>新闻排行榜_网易新闻</title>\n    <base target="_blank" />\n    <meta name="keywords" content="" />\n    <meta name="description" content="" />\n    <meta name="author" content="网易" />\n    <meta name="Copyright" content="网易版权所有" />\n  <script>\nif (window.location.protocol == \'http:\') {\n    window.location.href = window.location.href.replace(\'http\', \'https\')\n }\n  </script>\n<script type="text/javascript" data-type="touch/index">!function e(t,n,r){function o(c,a){if(!n[c]){if(!t[c]){var u="function"==typeof require&&require;if(!a&&u)return u(c,!0);if(i)return i(c,!0);var s=new Err

In [6]:
# Parse the HTML text into a tree object that can be queried with XPath
et = html.fromstring(req.text)

In [7]:
# On the original page, press F12 to open the developer tools, find the
# distinguishing features of the elements to scrape, and write an XPath expression
# XPath primer: https://www.runoob.com/xpath/xpath-tutorial.html

# Example: grab the text of every link button after the "快速跳转" (quick jump) label
btn_text_list = et.xpath("//div[contains(@class, 'subNav')]/a/text()")
btn_text_list

['新闻',
 '娱乐',
 '体育',
 '财经',
 '科技',
 '汽车',
 '女人',
 '房产',
 '游戏',
 '旅游',
 '教育',
 '全站',
 '图集排行榜']

In [8]:
# Extract the category titles and the link area under each category title
# Only the "点击榜" (click ranking) areas are extracted here
titles = et.xpath("//div[@class='titleBar']/h2/text()")
area_left_content_list = et.xpath("//div[@class='area-half left']")

titles

['全站', '新闻', '娱乐', '体育', '财经', '科技', '汽车', '女人', '房产', '游戏', '旅游', '教育']

In [9]:
# Check that the number of category titles matches the number of link areas.
# The extraction loop pairs the two lists with zip(), which silently drops the
# extras on a mismatch, so fail loudly here instead of relying on a visual check.
assert len(titles) == len(area_left_content_list), "category titles and link areas are out of sync"
len(titles), len(area_left_content_list)

(12, 12)

In [14]:
# Column names for the scraped rows: category, title, click count, link
headers = ["分类", "标题", "点击数", "链接"]

# Build one row per ranked entry: [category, title, click count, link].
# Each category title is paired with its link area; within an area, only table
# rows that have a <td> containing an <a> are treated as entries.
data = [
    [
        category,
        entry.xpath(".//td[1]/a/text()")[0],
        entry.xpath(".//td[2]/text()")[0],
        entry.xpath(".//td[1]/a/@href")[0],
    ]
    for category, area in zip(titles, area_left_content_list)
    for entry in area.xpath(".//div[@class='tabContents']//tr[td/a]")
]

headers, data

(['分类', '标题', '点击数', '链接'],
 [['全站',
   '沙宝亮出轨?与戴笑盈牵手回酒店 女方团队:不知情',
   '227749',
   'https://ent.163.com/20/0810/14/FJM3MQOL00038FO9.html'],
  ['全站',
   '北京25万/平复式豪宅 家里限量版椅子价值100',
   '213839',
   'https://home.163.com/20/0804/07/FJ5TC798001081EI.html'],
  ['全站',
   '恭喜！ 陈宝国闫妮获得白玉兰奖视帝后',
   '202889',
   'https://ent.163.com/20/0807/20/FJF3IUN000038FO9.html'],
  ['全站',
   '广州650平豪宅堪称国内最壕 朝江的客厅就长达2',
   '186965',
   'https://home.163.com/20/0806/07/FJB2NUPT001081EI.html'],
  ['全站',
   '50岁张嘉译又改名 新名字曝光被猜是为了健康',
   '182740',
   'https://ent.163.com/20/0807/07/FJDNJES200038FO9.html'],
  ['全站',
   '网曝潘玮柏老婆夜店视频 穿泳衣打碟和半裸男狂欢',
   '181911',
   'https://ent.163.com/20/0806/08/FJB8OP8800038FO9.html'],
  ['全站',
   '香港50岁夫妻每月赚9万 却乐意花90万元去装修',
   '178707',
   'https://home.163.com/20/0808/07/FJG6V1S000108GL2.html'],
  ['全站',
   '大妈花1万买下波音飞机 爆改成三室一厅乐坏孙子',
   '169861',
   'https://home.163.com/20/0809/07/FJIPJ6K100108GL2.html'],
  ['全站',
   '逆转14分！北京胜广东总分1-1 书豪25分尤度',
   '160752',
   'https://sports.16

#### 输出抓取好的数据，到文件，到数据库，到……

In [15]:
# Example: export to a CSV file. CSV is structured text data, similar to an Excel
# sheet: by default columns are separated by commas and rows by line breaks.

# Once scraped, the data can be stored in a CSV file under the fields
# "分类 标题 点击数 链接" (category, title, click count, link)
# Usage of Python's built-in csv module: https://docs.python.org/3.6/library/csv.html

# Write to the CSV file
import csv

with open("data.csv", "w", encoding="gb18030", newline='') as csv_file:
    # Header row first, followed by every scraped row, in a single call
    csv.writer(csv_file).writerows([headers, *data])

In [16]:
# Store the data in a SQLite database: open the database file and get a cursor
import sqlite3

db = sqlite3.connect("data.db")
cursor = db.cursor()

In [17]:
# Create the table (skipped when it already exists, thanks to "if not exists")
cursor.execute("""create table if not exists my_data (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  category TEXT,
  title TEXT,
  count INTEGER,
  link TEXT
)""")

<sqlite3.Cursor at 0x213bd343ce0>

In [18]:
# Insert every scraped row in one batch with executemany().
# Using the connection as a context manager commits when the whole batch
# succeeds and rolls back if any row fails, so a half-inserted batch is never
# left pending on the connection.
# NOTE: re-running this cell appends the same rows again.
with db:
    cursor.executemany(
        """INSERT INTO my_data (category, title, count, link) values (?, ?, ?, ?) """,
        data,
    )

In [19]:
# Close the cursor and the connection once they are no longer needed
cursor.close()
db.close()