Permalink
Browse files

Fix NetEase news crawling and comment fetching

  • Loading branch information...
fanfank committed Nov 26, 2016
1 parent 2976222 commit 7b555b12bc2427d44d1b1c2e580e12bd61dbc302
Showing with 26 additions and 9 deletions.
  1. +5 −0 .gitignore
  2. +15 −3 reconstruction/crawler/crawler.php
  3. +6 −6 reconstruction/website/actions/commentsAction.php
View
@@ -0,0 +1,5 @@
*.swp
*.swo
reconstruction/category/abstraction_service
reconstruction/category/categorize
reconstruction/category/lastmtime
@@ -297,7 +297,19 @@ public function parseNeteaseNews($page) {
// Get the boardId
preg_match('#boardId = "(.*)"#', $page->getContent(), $matches_board_id);
//preg_match('#boardId = "(.*)"#', $page->getContent(), $matches_board_id);
preg_match('#"productKey" \: "(.*)"#', $page->getContent(), $product_key_matches);
$content_info = json_decode(
file_get_contents(
'http://sdk.comment.163.com/api/v1/products/'
. $product_key_matches[1]
. '/threads/'
. strval($matches[4])
. '?ibc=jssdk&callback=&_='
. strval(time())
),
true
);
$strTitle = trim($page->sel('//h1[@id=\'h1title\']', 0)->plaintext);
$arrOutput = array(
@@ -319,12 +331,12 @@ public function parseNeteaseNews($page) {
'source_name' => 'netease',
'content' => trim($strContent),
'source_news_link' => $strUrl,
'source_comment_link' => "http://comment.news.163.com/{$matches_board_id[1]}/{$matches[4]}.html",
'source_comment_link' => "http://comment.news.163.com/{$content_info['boardId']}/{$matches[4]}.html",
'source_news_id' => strval($matches[4]),
'source_comment_id' => strval($matches[4]),
'abstract_id' => 0,
'timestamp' => $timestamp,
'ext' => serialize(array('board_id' => $matches_board_id[1])),
'ext' => serialize(array('board_id' => $content_info['boardId'], 'product_key' => $product_key_matches[1])),
),
'pic_list' => $arrContent['pic_list'],
);
@@ -68,7 +68,7 @@ public function setCurlConf(&$arrCurlConf, $arrInfo) {
$arrExt = $arrInfo['entry']['ext'];
$strDomain = 'http://comment.news.163.com';
$strReq = "/data/{$arrExt['board_id']}/df/{$arrInfo['entry']['source_comment_id']}_1.html";
$strReq = "/api/v1/products/{$arrExt['product_key']}/threads/{$arrInfo['entry']['source_comment_id']}/comments/hotTopList?offset=0&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc";
break;
case 'sina':
@@ -139,16 +139,16 @@ public function getComments($strContent, $arrInfo) {
break;
case 'netease':
$res = preg_match('#^var \w+=({.*});$#', $strContent, $matches);
$res = preg_match("#getData\(\s+({.*})\);$#", $strContent, $matches);
$data = json_decode($matches[1], true);
$data = $data['hotPosts'];
$data = $data['comments'];
$arrPathDict = array(
'source' => 'netease',
'user' => array('1', 'n'),
'time' => array('1', 't'),
'content' => array('1', 'b'),
'user' => array('user', 'nickname'),
'time' => array('createTime'),
'content' => array('content'),
);
break;
case 'sina':

0 comments on commit 7b555b1

Please sign in to comment.