In [None]:
import os
from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

# Switch the working directory to the project folder on Drive so that
# relative file names used later (e.g. 'papers.db') resolve inside it.
path = "/content/drive/My Drive/LDA"
os.chdir(path)

Mounted at /content/drive


In [None]:
import requests, time, random
from bs4 import BeautifulSoup, Comment
import sqlite3, argparse
import multiprocessing as mp


def create():
    """Create the ``Paper`` table in ``papers.db`` if it is not there yet.

    The database file is opened relative to the current working directory.
    Calling this more than once is harmless because the statement uses
    ``CREATE TABLE IF NOT EXISTS``.
    """
    schema_sql = '''CREATE TABLE IF NOT EXISTS Paper
                  (arxiv_id INTEGER PRIMARY KEY, authors TEXT, title TEXT, abstract TEXT, submit_time TEXT, source TEXT)'''

    db = sqlite3.connect('papers.db')
    db.execute(schema_sql)
    db.commit()
    db.close()


def clear():
    """Delete every row of the ``Paper`` table and then drop the table.

    Safe to call on a database that does not contain ``Paper`` yet (for
    example on the very first run, before ``create()``): in that case the
    function reports that there is nothing to remove instead of raising
    ``sqlite3.OperationalError``.

    The connection is always closed, even if a statement fails.
    """
    conn = sqlite3.connect('papers.db')
    try:
        c = conn.cursor()

        # Look the table up in the schema catalogue first so a missing table
        # is handled explicitly rather than by swallowing an exception.
        c.execute("SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'Paper'")
        if c.fetchone() is None:
            print('Table Paper does not exist; nothing to delete.')
            return

        c.execute('DELETE FROM Paper;')
        print('We have deleted', c.rowcount, 'records from the table.')
        conn.commit()

        c.execute("DROP TABLE IF EXISTS Paper")
        print('We have deleted table Paper.')
        conn.commit()
    finally:
        conn.close()


def insert(paper_info):
    """Insert one record into the ``Paper`` table of ``papers.db``.

    Args:
        paper_info: sequence of values in column order
            ``(arxiv_id, authors, title, abstract, submit_time, source)``.

    The values are passed to SQLite as bound parameters, so quotes,
    backslashes or newlines inside scraped text are stored verbatim and
    cannot break (or inject into) the SQL statement.

    Database errors (duplicate primary key, wrong number of values, ...)
    are reported on stdout and do not propagate, so one bad record does not
    abort a crawl. The connection is always closed.
    """
    conn = sqlite3.connect('papers.db')
    print('=' * 20)
    print("Opened database successfully")
    print(paper_info)
    try:
        values = tuple(paper_info)
        # One "?" per supplied value: a record with the wrong number of
        # fields is rejected by SQLite and reported below.
        placeholders = ", ".join("?" for _ in values)
        conn.execute(
            "INSERT INTO Paper ("
            "arxiv_id,"
            "authors,"
            "title,"
            "abstract,"
            "submit_time,"
            "source) "
            "VALUES ({0})".format(placeholders),
            values,
        )
        conn.commit()
        print("Records created successfully")
    except sqlite3.Error as err:
        print('Query Failed: \nError: %s' % (str(err)))
    finally:
        conn.close()


def filter(txt):
    """Return the cleaned-up text of a parsed node.

    ``txt`` is expected to offer ``get_text()``. Newlines and the
    "△ Less" marker are removed, single quotes become ``-`` and
    surrounding whitespace is trimmed. When ``txt`` is ``None`` the literal
    string ``"None"`` is returned.
    """
    if txt is None:
        return "None"

    cleaned = txt.get_text()
    for old, new in (("\n", ""), ("△ Less", ""), ("\'", "-")):
        cleaned = cleaned.replace(old, new)
    return cleaned.strip()

def get_page(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: address to fetch with an HTTP GET.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The decoded response body.

    Raises:
        AssertionError: if the response status is not 200. Raised explicitly
            (not via ``assert``) so the check also runs under ``python -O``.
    """
    res = requests.get(url, timeout=timeout)
    # print(url, " | ", res.status_code)
    if res.status_code != 200:
        raise AssertionError("GET {} returned HTTP {}".format(url, res.status_code))
    return res.text


def get_abstract(result, kw):
    """Return the text of the first element with class ``kw`` under ``result``.

    Every ``:`` and ``;`` character is stripped from the text. An empty
    string is returned when no such element exists.
    """
    node = result.find(class_=kw)
    if node is None:
        return ""
    # Deleting both characters in one pass is equivalent to two replace() calls.
    return node.getText().translate(str.maketrans("", "", ":;"))


def get_authors(result, kw):
    """Collect author names from the element with class ``kw`` under ``result``.

    Each ``<a href=...>`` inside that element contributes one name. Single
    quotes and the literal character sequences ``\\x00``, ``\\x02`` and
    ``\\x03`` are removed from every name.

    Returns:
        A list of names (callers format it themselves). An empty list is
        returned when the author element is missing, matching how the other
        extractors in this file return an empty value for absent fields
        instead of raising ``AttributeError``.
    """
    container = result.find(class_=kw)
    if container is None:
        return []

    return [
        link.getText().replace("\'", "").replace('\\x00', '').replace('\\x02', '').replace('\\x03', '')
        for link in container.find_all(name='a', href=True)
    ]


def get_time(result):
    """Return the first four characters of the publication-year text.

    The value is read from the ``kw_main`` element nested inside the
    ``year_wr`` element of ``result``. An empty string is returned when
    ``result`` is ``None`` or either element cannot be found.
    """
    if result is None:
        return ""

    year_block = result.find(class_='year_wr')
    if not year_block:
        return ""

    year_node = year_block.find(class_='kw_main')
    if not year_node:
        return ""

    return year_node.getText().strip()[:4]


def get_time_patent(result):
    """Return the announcement date shown on a patent detail page.

    Scans every ``common_wr`` row under ``result`` for one whose
    ``label_l`` text contains "公告日期" and returns the stripped text of
    that row's ``kw_main_l`` element.

    Returns:
        The date text, or ``""`` when no row carries that label (including
        when there are no rows at all).
    """
    for row in result.find_all(class_='common_wr'):
        label = row.find(class_='label_l')
        if label is None or "公告日期" not in label.getText():
            # Not the row we want: keep looking instead of giving up after
            # the first row.
            continue
        value = row.find(class_='kw_main_l')
        if value is not None:
            return value.getText().strip()
    return ""


def is_patent(soup):
    """Tell whether a parsed page is a patent page.

    The page counts as a patent page when at least one of its HTML comment
    nodes contains the text "专利".
    """
    html_comments = soup.find_all(string=lambda node: isinstance(node, Comment))
    return any("专利" in remark for remark in html_comments)


def spider_baidu_by_pages(i):
    """Crawl result page ``i`` of a Baidu Scholar search and store each hit.

    For every result on the page the detail page is downloaded, the
    abstract, authors, title and date are extracted (with different CSS
    classes for patent and non-patent pages) and the record is handed to
    ``insert``.

    Args:
        i: zero-based result page index; the request offset is ``i * 10``.

    NOTE(review): the URL interpolates a name ``query`` that is not defined
    anywhere in the visible code — presumably a module-level search string;
    confirm it exists before calling this function.
    """
    # Short random pause (< 1 s) before the request.
    sleep_time = random.random()
    time.sleep(sleep_time)
    print("=" * 10, "Start Set a Spider on Page {}".format(i), "=" * 10)
    base_url = f"https://xueshu.baidu.com/s?wd={query}&pn={i * 10}&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&sc_hit=1"
    print(base_url)
    text = get_page(base_url)
    soup = BeautifulSoup(text, features='html.parser')
    results = soup.find_all(class_='result sc_default_result xpath-log')
    # NOTE(review): `count` is never read after this assignment.
    count = 0

    for j, r in enumerate(results):
        # id = r['id']
        # Synthetic id built from the page index and the position on the page.
        id = i * 10 + j
        x = r.find(class_='sc_content').find(class_="t c_font").find(name='a', href=True)
        href = x['href']
        # print(href)

        # From here on `text` and `soup` refer to the detail page, not the
        # result list.
        text = get_page(href)
        soup = BeautifulSoup(text, features='html.parser')
        rst = soup.find(class_='main-info')

        # Detail pages without a 'main-info' element are skipped.
        if rst is not None:
            if is_patent(soup):
                abstract = get_abstract(rst, 'abstract kw_main_l')
                date = get_time_patent(rst)
                authors = get_authors(rst, 'author_wr')
                title = x.getText()
                flag = "T"
            else:
                abstract = get_abstract(rst, 'abstract')
                authors = get_authors(rst, 'author_text')
                title = x.getText()
                date = get_time(rst)
                flag = "F"

            # NOTE(review): seven values are passed here, while the INSERT
            # statement in `insert` names six columns — this looks like it
            # would make every insert from this function fail; confirm
            # whether `flag` needs its own column.
            insert([id, "{}".format(authors), title, abstract, date, "", flag])
    print("=" * 10, "A Spider End on Page {}".format(i), "=" * 10)



def spider_baidu_by_step_by_stpe_pages(p):
    """Crawl Baidu Scholar results for one (year, index level) pair.

    The fixed query "energy management strategy vehicle" is filtered by
    publication year and ``sc_level``. Result pages are walked one by one
    (with long random pauses between requests); each hit's detail page is
    parsed and stored through ``insert`` with the level's label as
    ``source``.

    Args:
        p: pair ``[year, sc]`` where ``sc`` indexes ``sc_description`` /
            ``sc_max`` below.

    The crawl stops early when a result page is empty or when it starts
    with the same link as the previous page (treated as "no more results").
    Detail pages without a ``main-info`` element are skipped.
    """
    # if sc = 1 => SCI, sc=2 => EI, sc=3 => ESCI
    # NOTE(review): the label stored for sc=3 is 'SCIE' while the comment
    # above says ESCI — confirm which one is intended.
    sc_description = ["NULL", 'SCI', 'EI', 'SCIE']
    # Maximum number of result pages to visit per level.
    sc_max = [0, 10, 16, 13]
    year, sc = p[0], p[1]

    # Only level 3 is crawled at the moment; every other pair is a no-op.
    if sc != 3:
        return

    def search_url(offset):
        """Build the filtered search URL for the given result offset."""
        return f"https://xueshu.baidu.com/s?wd=energy%20management%20strategy%20vehicle&pn={offset}&tn=SE_baiduxueshu_c1gjeupa&sc_hit=1&bcp=2&ie=utf-8&filter=sc_year%3D%7B{year}%2C{year}%7D%28sc_level%3A%3D%7B{sc}%7D%29/"

    sleep_time = random.randint(10, 30)
    # sleep_time = 1
    time.sleep(sleep_time)
    base_url = search_url(0)
    text = get_page(base_url)
    soup = BeautifulSoup(text, features='html.parser')
    nums_ = filter(soup.find(class_='nums')).replace("条相关结果", "").replace("找到", "").replace("约", "").replace(",", "")
    # `filter` yields the string "None" when the counter element is missing.
    # Compare by value: identity (`is`) on strings is not reliable.
    if nums_ == "None":
        print(base_url)
        return
    nums = int(nums_)
    latest_r0_text = None

    for i in range(0, min(sc_max[sc], nums // 10 + 1)):
        sleep_time = random.randint(15, 75)
        time.sleep(sleep_time)
        print("=" * 10, "Start Set a Spider on Page {}".format(i), "=" * 10)
        print(year, sc, sc_description[sc])
        base_url = search_url(i * 10)
        print(base_url)
        text = get_page(base_url)
        soup = BeautifulSoup(text, features='html.parser')
        results = soup.find_all(class_='result sc_default_result xpath-log')
        if not results:
            # An empty page means there is nothing further to crawl; without
            # this guard `results[0]` would raise IndexError.
            break

        # A page that starts with the same link as the previous one is a
        # repeat of the previous page: stop.
        r0 = results[0].find(class_='sc_content').find(class_="t c_font").find(name='a', href=True)['href']
        if latest_r0_text == r0:
            break
        latest_r0_text = r0

        for r in results:
            sleep_time = random.randint(35, 75)
            time.sleep(sleep_time)
            paper_id = r['id']
            x = r.find(class_='sc_content').find(class_="t c_font").find(name='a', href=True)
            href = x['href']
            # print(href)
            detail_soup = BeautifulSoup(get_page(href), features='html.parser')
            rst = detail_soup.find(class_='main-info')
            if rst is None:
                # Same policy as spider_baidu_by_pages: skip detail pages
                # that lack the main info block instead of crashing.
                continue
            # TODO: also record SC_Level and the journal source; if Baidu
            # Scholar exposes country information, scrape that too,
            # otherwise another approach is needed.
            if is_patent(detail_soup):
                abstract = get_abstract(rst, 'abstract kw_main_l')
                date = get_time_patent(rst)
                authors = get_authors(rst, 'author_wr')
                title = x.getText()
            else:
                abstract = get_abstract(rst, 'abstract')
                authors = get_authors(rst, 'author_text')
                title = x.getText()
                date = get_time(rst)

            # print(paper_id, authors, title, date)
            insert([paper_id, "{}".format(authors), title, abstract, date, sc_description[sc]])
        print("=" * 10, "A Spider End on Page {}".format(i), "=" * 10)



if __name__ == '__main__':
    # Raw string: the ASCII art is full of backslashes. In a normal string
    # "\\" collapses to one backslash and a backslash at the end of a line
    # joins it with the next one, which garbles the printed banner (and the
    # other sequences are invalid escapes that newer Pythons warn about).
    banner = r"""
     ____            __              ___                        ____                    __
    /\  _`\         /\ \            /\_ \                      /\  _`\           __    /\ \
    \ \,\L\_\    ___\ \ \___     ___\//\ \      __     _ __    \ \,\L\_\  _____ /\_\   \_\ \     __   _ __
     \/_\__ \   /'___\ \  _ `\  / __`\\ \ \   /'__`\  /\`'__\   \/_\__ \ /\ '__`\/\ \  /'_` \  /'__`\/\`'__\
       /\ \L\ \/\ \__/\ \ \ \ \/\ \L\ \\_\ \_/\ \L\.\_\ \ \/      /\ \L\ \ \ \L\ \ \ \/\ \L\ \/\  __/\ \ \/
       \ `\____\ \____\\ \_\ \_\ \____//\____\ \__/.\_\\ \_\      \ `\____\ \ ,__/\ \_\ \___,_\ \____\\ \_\
        \/_____/\/____/ \/_/\/_/\/___/ \/____/\/__/\/_/ \/_/       \/_____/\ \ \/  \/_/\/__,_ /\/____/ \/_/
                                                                            \ \_\
                                                                             \/_/
    """
    print(banner)

    on_Colab = True
    # One [year, sc] job per publication year 2016-2021 and index level 1-3.
    params = [[year, sc] for sc in [1, 2, 3] for year in range(2016, 2022)]
    # Start from an empty table: wipe whatever is stored, then recreate it.
    clear()
    create()

    # The crawl itself is currently disabled; the pool-based driver is kept
    # below for reference.
    # if not on_Colab:
    #     parser = argparse.ArgumentParser(
    #         description='Collect abstractions of papers containing specific keywords from Baidu Academic and Google Academic.')
    #     parser.add_argument('--num_process', default=10, type=int,
    #                         help='Experiment in different settings.')
    #     args = parser.parse_args()
    #     num_process = mp.cpu_count() - 1 if args.num_process >= mp.cpu_count() else args.num_process
    # else:
    #     num_process = 10
    #     num_process = mp.cpu_count() - 1 if num_process >= mp.cpu_count() else num_process

    # pool = mp.Pool(num_process)
    # # pool_outputs = pool.map(spider_baidu_by_pages, inputs)
    # pool_outputs = pool.map(spider_baidu_by_step_by_stpe_pages, params)
    # pool.close()
    # pool.join()



     ____            __              ___                        ____                    __                  
    /\  _`\         /\ \            /\_ \                      /\  _`\           __    /\ \                 
    \ \,\L\_\    ___\ \ \___     ___\//\ \      __     _ __    \ \,\L\_\  _____ /\_\   \_\ \     __   _ __  
     \/_\__ \   /'___\ \  _ `\  / __`\ \ \   /'__`\  /\`'__\   \/_\__ \ /\ '__`\/\ \  /'_` \  /'__`\/\`'__       /\ \L\ \/\ \__/\ \ \ \ \/\ \L\ \_\ \_/\ \L\.\_\ \ \/      /\ \L\ \ \ \L\ \ \ \/\ \L\ \/\  __/\ \ \/ 
       \ `\____\ \____\ \_\ \_\ \____//\____\ \__/.\_\ \_\      \ `\____\ \ ,__/\ \_\ \___,_\ \____\ \_\ 
        \/_____/\/____/ \/_/\/_/\/___/ \/____/\/__/\/_/ \/_/       \/_____/\ \ \/  \/_/\/__,_ /\/____/ \/_/ 
                                                                            \ \_\                           
                                                                             \/_/                           
    
We have deleted 3330

In [None]:
import sqlite3
import pandas as pd

# Export the whole Paper table to a CSV file.
path = 'papers.db'
conn = sqlite3.connect(path)
try:
    c = conn.cursor()

    # Column names come from the table definition (field 1 of each
    # PRAGMA table_info row is the column name).
    header = [column[1] for column in c.execute('PRAGMA table_info("Paper")')]
    print(header)

    # Build the frame once from all rows. Appending row by row is quadratic
    # and relies on DataFrame.append, which was removed in pandas 2.0.
    rows = c.execute('SELECT * FROM Paper').fetchall()
    df = pd.DataFrame(rows, columns=header)
finally:
    conn.close()

df.to_csv('abstract0404.csv')

['arxiv_id', 'authors', 'title', 'abstract', 'submit_time', 'comment']


In [None]:
# Show the exported DataFrame as this cell's rich output.
df

Unnamed: 0,arxiv_id,authors,title,abstract,submit_time,source
0,1,"['Sabri, M. F. M', 'KA Danapalasingam', 'MF Ra...",A review on hybrid electric vehicles architect...,Faced with environmental issues caused by foss...,2016,EI
1,2,"['K Ettihir', 'L Boulon', 'K Agbossou']",Optimization-based energy management strategy ...,This paper addresses the energy management str...,2016,EI
2,3,"['BV Padmarajan', 'A Mcgordon', 'PA Jennings']",Blended Rule-Based Energy Management for PHEV:...,This paper proposes a blended rule-based energ...,2016,EI
3,4,"['TH Pham', 'JTBA Kessels', 'PPJVD Bosch', 'RG...",Analytical Solution to Energy Management Guara...,This paper considers a parallel hybrid electri...,2016,EI
4,5,"['Maobing', 'Hui', 'Weimin', 'Liu', 'Yin', 'Fa...",The structure and control method of hybrid pow...,"In this paper, an electric vehicle powertrain ...",2016,EI
...,...,...,...,...,...,...
221,223,"['J Chen', 'X Kong', 'P Li']",Analysis of the control strategy of range exte...,,2020,ESCI
222,224,"['C Shi', 'C Chen', 'Z Yu', 'R Li']",Control Strategy of Hybrid Electric Vehicle ba...,,2020,ESCI
223,225,"['Q Li', 'G Min', 'P Chen', 'Y Liu', 'W Zhang']",Computer vision-based techniques and path plan...,Unmanned aerial vehicle is a typical field rob...,2020,ESCI
224,226,"['X Wang', 'Q Li', 'H Zha', 'B Wang']",Integrated active steering control strategy fo...,An integrated active steering control strategy...,2020,ESCI
