In [1]:
import logging
import requests
import sys
import urllib

from bs4 import BeautifulSoup
from collections import OrderedDict
from urllib.parse import urlencode

def get_list(comp, page, timeout=10):
    """Fetch one Sina news-search result page and parse it into a dictionary.

    Args:
        comp: Query fragment inserted verbatim right after ``?`` in the
            search URL. It must already be URL-encoded (for example
            ``q=%B0%A2...``); this function does no encoding itself.
        page: The page number.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up. Defaults to 10.

    Returns:
        newsData: An ``OrderedDict`` keyed by news title. Each value is the
            list ``[date, source, abstract, url]``. When several results
            share a title, the details of the last one win.

    """
    newsData = OrderedDict()
    # Example of a full query URL with the other parameters the site accepts:
    # http://search.sina.com.cn/?q=%C1%F5%C7%BF%B6%AB&c=news&from=index&col=&range=&source=&country=&size=&time=&a=&page=7&pf=0&ps=0&dpc=1
    href = 'http://search.sina.com.cn/?{comp1}&range=all&c=news&sort=time&page={page1}'.format(comp1=comp, page1=page)
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
        # NOTE(review): hard-coded browser session cookie. It is kept unchanged
        # so the request stays identical, but it should be moved to
        # configuration (or dropped if the site does not need it).
        "Cookie":"U_TRS1=00000059.b9a84d9b.5bc211a5.e8aed6c6; UOR=www.baidu.com,blog.sina.com.cn,; SINAGLOBAL=106.38.124.49_1539445361.606329; SCF=At08xTg6WL3a0dkY2OxT4PSkJwKJX3lMc8RLmkis-9zi3fwSSCMJ2jo9-4-gQXegKxrUP4ktZd953rYXOL8wPn8.; sso_info=v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLaMg5i0jKOgtY2DoLeJp5WpmYO0toyDmLSMo6C1jYOgtw==; vjuids=7ebce9e79.1674b308bb4.0.f3728c2f0b4b8; SUB=_2AkMrM2XSf8NxqwJRmPkdz2nqZIl-ywnEieKdb5QJJRMyHRl-yD83qh0StRB6ALNLPZLw-O-dKMOhESbVmhOYmlKwSM69; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5vzRG1LHM4CGq0TjdTurMK; vjlast=1545488586.1555950666.10; mYSeArcH=%u54C8%u54C8%u54C8%7CsEaRchHIS%7C%u963F%u91CC%u5DF4%u5DF4; U_TRS2=0000008a.453f4ea8.5cdbdabb.dbd49745; Apache=123.124.19.138_1557912252.971237; ULV=1557912265125:13:4:2:123.124.19.138_1557912252.971237:1557912252570"
    }
    print(href)
    # Without a timeout a stalled connection would block this call forever.
    html = requests.get(href, headers=headers, timeout=timeout)

    # Parsing html; the response bytes are decoded as GBK.
    soup = BeautifulSoup(html.content, 'html.parser', from_encoding='gbk')
    for div in soup.find_all('div', {"class": "box-result clearfix"}):
        parsed = _parse_result(div)
        if parsed is None:
            # One malformed result must not abort the rest of the page.
            logging.warning("Skipping search result without a title link on %s", href)
            continue
        title, details = parsed
        newsData[title] = details
    return newsData


def _parse_result(div):
    """Return ``(title, [date, source, abstract, url])`` for one result ``div``, or ``None`` if it has no title link."""
    head = div.find('h2')
    titleinfo = head.find('a') if head is not None else None
    if titleinfo is None:
        return None
    # News title
    title = titleinfo.get_text()
    # News url (empty when the anchor carries no href)
    url = titleinfo.get('href', '')
    # Other info: the span text is split into source and date.
    meta = head.find('span', {"class": "fgray_time"})
    source, date = _split_source_date(meta.get_text() if meta is not None else '')
    # News abstract (empty when the result has no content paragraph)
    content = div.find('p', {"class": "content"})
    abstract = content.get_text() if content is not None else ''
    return title, [date, source, abstract, url]


def _split_source_date(otherinfo):
    """Split meta text laid out as ``<source> <date> <time>`` into ``(source, date)``."""
    # Split from the right so a source name containing whitespace stays whole.
    fields = otherinfo.strip().rsplit(None, 2)
    if len(fields) == 3:
        return fields[0], fields[1]
    # Unexpected layout: keep the raw text as the source rather than guess
    # which token is the date.
    return otherinfo.strip(), ''



if __name__ == "__main__":
    # Company to search for.
    compRawStr = '阿里巴巴'
    # Dealing with character encoding: turn the name into GBK bytes, then
    # percent-encode those bytes into a ready-to-use "q=..." query fragment.
    comp = compRawStr.encode('gbk')
    pname = urlencode({'q': comp})
    # Scrape result pages 1 through 3 and print each hit on its own line.
    for page in range(1, 4):
        newsData = get_list(pname, page)
        for title, details in newsData.items():
            # "\001" as separator between title, date, source, abstract, url
            print('\001'.join([title] + details))

http://search.sina.com.cn/?q=%B0%A2%C0%EF%B0%CD%B0%CD&range=all&c=news&sort=time&page=1
历经研究1600千米真人头发，戴森头发科学关怀你的发丝健康2019-05-15商讯 　　过去六年里 戴森头发科学实验室斥资近1亿英镑（1）研发戴森Supersonic吹风机和Airwrap美发造型器 旨在利用强劲可控的气流 减少对吹风机和造型器过多热量的依赖 实现快速干发与健康造型 防止过热损伤问题  http://nx.sina.com.cn/finance/2019-05-15/detail-ihvhiqax8913596.shtml
沪指高开高走涨1.9% 做多科技股再成最拥挤交易2019-05-15新浪财经_原创DIA ）涨约0.8% 标普500（ SPY ）涨约0.8% 纳指100（ QQQ ）涨约1.15% 阿里巴巴（ BABA ）周三盘前将发布财报    　　隔夜 美股三大指数收高  市场基准标普500指数 上涨0.8% 道指收高0.82% 成分股波音（ BA ）反弹1.68% 周一 创出年内最大单日跌幅 的纳指反弹1.14%    　　周三  在岸人民币 （https://finance.sina.com.cn/stock/usstock/clues/2019-05-15/doc-ihvhiews2001084.shtml
亚洲美食节启幕 口碑饿了么宣布将在200城打造本地生活“数字化一条街”2019-05-15新快报  5月15日 亚洲美食节正式开幕 在天猫618前夕 作为战略合作伙伴 阿里巴巴集团合伙人、阿里本地生活服务公司总裁王磊宣布 口碑饿了么所引领的餐饮行业全链路数字化体系已经成型   王磊同时强调 口碑饿了么将以亚洲美食节为契机http://mp.sina.cn/article/2019-05-15/detail-i-ihvhiqax8914332.d.html
2019全球新经济年会倒计时30天 来自中美德产业智能高峰论坛的邀请函2019-05-15新浪游戏   　　“伯乐”的眼光看人工智能   　　关于机遇 阿里巴巴创始人马云用四个词描述错失的原因：   　　 “看不见 看不起 看不懂 来不及”     　　稍纵即逝的商机是投资人眼