Skip to content

Commit

Permalink
Merge pull request #577 from myshero/feature-post-to-custom-api
Browse files Browse the repository at this point in the history
issues_feature_post_api_576 实现通过POST方式将数据推送到自定义接口
  • Loading branch information
dataabc committed Apr 28, 2024
2 parents d7de931 + c0c7525 commit d676cd2
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 4 deletions.
12 changes: 12 additions & 0 deletions docs/settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,3 +239,15 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为
- **publish_tool**:存储微博的发布工具。

</details>

## 设置API接口POST联动(可选)

本部分是可选部分,如果不需要将爬取信息通过POST请求发送到指定API接口,可跳过这一步

请求数据格式为 `content-type : application/json`,接口响应返回也需要是 `content-type : application/json`,HTTP状态码为 `200`

数据主体与 `write_mode` 配置的 `json` 输出格式一致,是整页获取数据json,每页POST发送一次

`api_url` 为指定的API接口地址

`api_token` 为接口鉴权TOKEN,将在 Request Headers 中添加 `api-token` 字段,根据需要配置
4 changes: 4 additions & 0 deletions weibo_spider/config_sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,9 @@
"connection_string": "mongodb://admin:password@localhost:27017/weibo",
"dba_name": "",
"dba_password": ""
},
"post_config": {
"api_url": "",
"api_token": ""
}
}
4 changes: 2 additions & 2 deletions weibo_spider/config_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,14 @@ def validate_config(config):
sys.exit()

# 验证write_mode
write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka']
write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka','post']
if not isinstance(config['write_mode'], list):
logger.warning(u'write_mode值应为list类型')
sys.exit()
for mode in config['write_mode']:
if mode not in write_mode:
logger.warning(
u'%s为无效模式,请从txt、csv、json、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode',
u'%s为无效模式,请从txt、csv、json、post、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode',
mode)
sys.exit()

Expand Down
2 changes: 1 addition & 1 deletion weibo_spider/parser/comment_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_long_weibo(self):
# 3. 去掉所有 HTML 标签,但保留标签内的有效文本
new_content = fromstring(html_string).text_content()
# 4. 替换多个连续的 \n 为一个 \n
new_content = re.sub(r'\n+', '\n', new_content)
new_content = re.sub(r'\n+\s*', '\n', new_content)
weibo_content = handle_garbled(new_content)
if weibo_content is not None:
return weibo_content
Expand Down
6 changes: 6 additions & 0 deletions weibo_spider/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def __init__(self, config):
self.sqlite_config = config.get('sqlite_config')
self.kafka_config = config.get('kafka_config')
self.mongo_config = config.get('mongo_config')
self.post_config = config.get('post_config')
self.user_config_file_path = ''
user_id_list = config['user_id_list']
if FLAGS.user_id_list:
Expand Down Expand Up @@ -284,6 +285,11 @@ def initialize_info(self, user_config):

self.writers.append(KafkaWriter(self.kafka_config))

if 'post' in self.write_mode:
from .writer import PostWriter

self.writers.append(PostWriter(self.post_config))

self.downloaders = []
if self.pic_download == 1:
from .downloader import (OriginPictureDownloader,
Expand Down
3 changes: 2 additions & 1 deletion weibo_spider/writer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
from .txt_writer import TxtWriter
from .sqlite_writer import SqliteWriter
from .kafka_writer import KafkaWriter
from .post_writer import PostWriter

__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter, KafkaWriter]
__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter, KafkaWriter, PostWriter]
59 changes: 59 additions & 0 deletions weibo_spider/writer/post_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import codecs
import json
import logging
import os
import requests

from .writer import Writer
from time import sleep
from requests.exceptions import RequestException

logger = logging.getLogger('spider.post_writer')

class PostWriter(Writer):
def __init__(self, post_config):
self.post_config = post_config
self.api_url = post_config['api_url']
self.api_token = post_config.get('api_token', None)
self.dba_password = post_config.get('dba_password', None)

def write_user(self, user):
self.user = user

def _update_json_data(self, data, weibo_info):
"""将获取到的微博数据转换为json输出模式一致"""
data['user'] = self.user.__dict__
if data.get('weibo'):
data['weibo'] += weibo_info
else:
data['weibo'] = weibo_info
return data

def send_post_request_with_token(self, url, data, token, max_retries, backoff_factor):
headers = {
'Content-Type': 'application/json',
'api-token': f'{token}',
}
for attempt in range(max_retries + 1):
try:
response = requests.post(url, json=data, headers=headers)
if response.status_code == requests.codes.ok:
return response.json()
else:
raise RequestException(f"Unexpected response status: {response.status_code}")
except RequestException as e:
if attempt < max_retries:
sleep(backoff_factor * (attempt + 1)) # 逐步增加等待时间,避免频繁重试
continue
else:
logger.error(f"在尝试{max_retries}次发出POST连接后,请求失败:{e}")

def write_weibo(self, weibos):
"""将爬到的信息POST到API"""
data = {}
data = self._update_json_data(data, [w.__dict__ for w in weibos])
if data:
self.send_post_request_with_token(self.api_url, data, self.api_token, 3, 2)
logger.info(u'%d条微博通过POST发送到 %s', len(weibos), self.api_url)
else:
logger.info(u'没有获取到微博,略过API POST')

0 comments on commit d676cd2

Please sign in to comment.