In [15]:
# -*- encoding: utf-8 -*-
# from fake_useragent import UserAgent
# ua = UserAgent()
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

from bs4 import BeautifulSoup
import datetime
from snowballstemmer import EnglishStemmer
import time, sys
import os, errno
import shutil
import pickle

timeout = 40

def check_page(search_name, page_required, driver):
    """Return how many result pages should be crawled for a query.

    Opens the first IEEE Xplore result page for ``search_name``, clicks the
    last-page navigation button and reads ``pageNumber`` from the browser's
    URL to learn how many result pages exist.

    Args:
        search_name: Query text substituted into the search URL.
        page_required: Number of pages the caller wants to crawl.
        driver: Selenium WebDriver used to load and drive the page.

    Returns:
        ``page_required`` capped at the total number of result pages.

    Raises:
        NoSuchElementException: If the last-page button is not on the page.
    """
    search_url = "http://ieeexplore.ieee.org/search/searchresult.jsp?queryText=%s&pageNumber=1" % (search_name)
    driver.get(search_url)

    # Wait and scroll down page (3 times for 25 results/page)
    for i in range(3):
        try:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "h2")))
        except TimeoutException:
            print("MSG: Timeout for scrolling %d" % i)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for the last-page arrow to appear, then click it.
    last_page_selector = ".last .ng-binding"
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, last_page_selector)))
    except TimeoutException:
        print("MSG: Timeout for last-page button")

    # find_element(By..., ...) is available in both Selenium 3 and 4; it raises
    # NoSuchElementException when the button never showed up.
    driver.find_element(By.CSS_SELECTOR, last_page_selector).click()

    def current_page_number():
        # Only keep the value of pageNumber, even if other query parameters
        # follow it in the URL.
        return int(driver.current_url.split("pageNumber=")[1].split("&")[0].strip())

    # The URL updates a moment after the click. Poll for the new page number,
    # but give up after `timeout` seconds so that a query with a single result
    # page cannot spin forever.
    deadline = time.monotonic() + timeout
    last_page_num = current_page_number()
    while last_page_num == 1 and time.monotonic() < deadline:
        time.sleep(0.5)
        last_page_num = current_page_number()

    print("MSG: search word: ", search_name)
    print("MSG: page required: ", page_required)
    print("MSG: total pages: ", last_page_num)

    return min(last_page_num, page_required)

def get_url(search_name, page_required, driver, project_id, db_urls):
    """Collect article URLs and titles from the search result pages.

    Args:
        search_name: Query text substituted into the search URL.
        page_required: Number of result pages requested; capped by
            ``check_page`` at the number of pages that exist.
        driver: Selenium WebDriver used to load the result pages.
        project_id: Not used by this function.
        db_urls: Not used by this function.

    Returns:
        dict mapping a ``"./IEEE-TMP/<n>"`` key to ``"<url>\\n<title>\\n"``,
        one entry per distinct article URL found.
    """
    per_page = 25
    max_attempts = 3  # reloads allowed per page before moving on
    basic_url = "http://ieeexplore.ieee.org"
    url_set = set()
    paper_urls = dict()

    page_required = check_page(search_name, page_required, driver)

    # Check all urls be fetched
    for page_number in range(1, page_required + 1):

        attempts = 0
        while len(paper_urls) < page_number * per_page:

            # A page can legitimately hold fewer than `per_page` new links
            # (short last page, duplicates), so bound the retries instead of
            # reloading the same page forever.
            if attempts >= max_attempts:
                print("MSG: Page %d still incomplete after %d attempts, moving on"
                      % (page_number, max_attempts))
                break
            attempts += 1

            search_url = "http://ieeexplore.ieee.org/search/searchresult.jsp?queryText=%s&pageNumber=%d" \
                         % (search_name, page_number)
            driver.get(search_url)

            # Wait and scroll down page (3 times for 25 results/page)
            for i in range(3):
                try:
                    WebDriverWait(driver, timeout).until(
                        EC.presence_of_element_located((By.TAG_NAME, "h2")))
                except TimeoutException:
                    print("MSG: Timeout for scrolling %d" % i)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            soup = BeautifulSoup(driver.page_source, 'lxml')

            for index, item in enumerate(soup.select("h2")):

                anchor = item.find('a')
                href = anchor.get('href') if anchor is not None else None
                if not href:
                    # Heading without a link: not a search result entry.
                    continue

                redirect_url = basic_url + href
                title = anchor.text.replace(" ", "_").replace("/", "")

                # NOTE(review): keys are derived from the heading index, so two
                # pages could in principle produce the same key if a page has
                # more than `per_page` headings -- confirm before relying on
                # the keys being unique.
                filename = "./IEEE-TMP/" + str(index + page_number * per_page)

                # Collect all papers' filename and output(with url)
                if redirect_url not in url_set:
                    url_set.add(redirect_url)
                    paper_urls[filename] = redirect_url + '\n' + title + '\n'

    print("MSG: Finish article url collection, total %d urls" % len(paper_urls))
    return paper_urls

def _extract_full_text(soup_content):
    """Join the abstract and the full-text sections of a parsed article page."""
    parts = []

    # Extract Abstract
    for block in soup_content.select(".abstract-text"):
        parts.append("Abstract: \n%s\nContent:\n" % block.text)

    # Extract contents
    for section in soup_content.select("#full-text-section"):
        for block in section.select(".section"):
            # Article header
            for item in block.select(".article-hdr"):
                parts.append("%s\n" % item.text.strip())
            # Article content
            for item in block.select("p"):
                parts.append("%s\n" % item.text)

    return "".join(parts)


def download_article(paper_urls, driver, data_out, project_id):
    """Fetch each article page and append its text to the pickle at ``data_out``.

    Articles that are PDF-only, whose content section never loads, or whose
    extracted text is shorter than 20 characters are skipped.

    Args:
        paper_urls: dict whose values are ``"<url>\\n<title>\\n"`` strings, as
            returned by ``get_url``; the keys are only used in log messages.
        driver: Selenium WebDriver used to load the article pages.
        data_out: Path of the pickle file holding a list of article dicts.
        project_id: Value stored under ``'project_id'`` in every record.

    Returns:
        None. Progress is reported with ``print``.
    """
    success_counter = 0

    if not os.path.exists('./data'):
        os.mkdir('./data')
    # Also make sure the directory that will actually receive the file exists.
    out_dir = os.path.dirname(data_out)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Records already stored in `data_out`; loaded lazily on the first save so
    # the file is read once per run instead of once per article.
    data = None

    for filename, output_value in paper_urls.items():

        url, title_text = output_value.split('\n')[:2]
        driver.get(url)

        soup_content = BeautifulSoup(driver.page_source, "lxml")
        pdf_only = any(
            "This article is only available in PDF" in section.text
            for section in soup_content.select("#full-text-section")
        )
        if pdf_only:
            continue

        try:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".section")))
        except TimeoutException:
            print("MSG: Content section is empty, passed ", url)
            continue

        print("MSG: Redirect to ", url)
        # Parse again: the content sections are only present after the wait.
        full_text = _extract_full_text(BeautifulSoup(driver.page_source, "lxml"))

        if len(full_text.strip()) < 20:
            continue

        record = {}
        record['project_id'] = project_id
        record['title'] = title_text
        record['full_text'] = full_text
        record['full_text_stem'] = getStemArticle(full_text)
        record['resource'] = 'ieee'
        record['get_time'] = datetime.datetime.now()
        record['url'] = url

        if data is None:
            if os.path.exists(data_out):
                # NOTE: pickle must only be loaded from trusted files; this
                # path is expected to be a file written by this function.
                with open(data_out, 'rb') as f:
                    data = pickle.load(f)
            else:
                data = []
        data.append(record)

        # Persist after every article so an interrupted run keeps its progress.
        with open(data_out, 'wb') as f:
            pickle.dump(data, f)

        success_counter += 1
        print("MSG: Download for : %s is done" % filename)

    print("MSG: Successful download %d articles" % success_counter)
    
    
def getStemArticle(full_text):
    """Return ``full_text`` as a space-separated string of stemmed words.

    Each whitespace-separated word is stemmed, lower-cased, and stripped of
    the characters ``, ; " . ' ?``.

    Args:
        full_text: Text to stem.

    Returns:
        The processed words joined by single spaces, with surrounding
        whitespace removed.
    """
    # One stemmer for the whole text instead of a new instance per word.
    stemmer = EnglishStemmer()
    # Single-pass removal of the punctuation the original chained replaces hit.
    strip_punct = str.maketrans('', '', ',;".\'?')

    # NOTE(review): the word is stemmed before it is lower-cased and stripped
    # of punctuation; that order is kept so results match records already
    # stored -- confirm whether normalising first is wanted.
    stems = [
        stemmer.stemWord(word).lower().translate(strip_punct)
        for word in full_text.split()
    ]
    return " ".join(stems).strip()

def main(project_id, search_name, page_required, db_urls,data_out):
    """Run one crawl: collect article URLs, then download and store the articles.

    Args:
        project_id: Identifier stored with every saved article.
        search_name: Query text to search for.
        page_required: Number of result pages to crawl.
        db_urls: Passed through to ``get_url``.
        data_out: Path of the pickle file the articles are appended to.
    """
    option = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}  # do not load image (for speed up)
    option.add_experimental_option("prefs", prefs)
    option.add_argument('headless')
    driver_path = os.getcwd() + "\\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=option)
    print("MSG: Setup for web driver")

    try:
        paper_urls = get_url(search_name, page_required, driver, project_id, db_urls)
        print('len(paper_urls) :',len(paper_urls))
        print(paper_urls)
        # Store the articles' information into the output file.
        download_article(paper_urls, driver, data_out, project_id)
    finally:
        # Always shut the browser down, even when crawling fails, so no
        # headless Chrome process is left behind.
        driver.quit()


# 實際納入爬蟲時有 "articleNull" 欄位用於判斷是否為第一次爬蟲  
# 等T3VM申請好，用T3 VM爬 x 頁看結果  
http://ieeexplore.ieee.org/search/searchresult.jsp?queryText="Silicon Photonics"&pageNumber=1  

search_url = "http://ieeexplore.ieee.org/search/searchresult.jsp?queryText=%s&pageNumber=1" % (search_name)




爬完文章檢查文章長短，當文章的長度小於20個字，不存此文章    
因 MIT 有些文章為 video，沒有合適的文章內容，所以最終存的文章數與一開始抓到的數量不同

In [16]:
# Settings for this crawl run.
search_name = "Silicon Photonics"
searchPages = 1
project_id = "A4BC1360-7B3D-4AE3-A1C8-8196FF878BA9"

# Output pickle file for the crawled articles.
# data_out = '/home/t200/python/t3/data/article/tmp.list'
data_out = "data/ieee.list"

# db_urls = ['http://www.zdnet.com/article/intel-launches-silicon-photonics/']
db_urls = list()
# sleep_time_min = 1
# sleep_time_max = 6

main(
    project_id=project_id,
    search_name=search_name,
    page_required=searchPages,
    db_urls=db_urls,
    data_out=data_out,
)


MSG: Setup for web driver
MSG: search word:  Silicon Photonics
MSG: page required:  1
MSG: total pages:  545
MSG: Finish article url collection, total 25 urls
len(paper_urls) : 25
{'./IEEE-TMP/25': 'http://ieeexplore.ieee.org/document/8354532/\nIntegrated_silicon_photonics_directional_couplers_for_WDM_applications\n', './IEEE-TMP/27': 'http://ieeexplore.ieee.org/document/8354600/\nOn-chip_Silicon_photonics_assisted_frequency_doubling_and_pulse_generation\n', './IEEE-TMP/29': 'http://ieeexplore.ieee.org/document/8354462/\nSilicon_photonics_for_microwave_photonics\n', './IEEE-TMP/31': 'http://ieeexplore.ieee.org/document/8346170/\nIntegrated_Silicon_Photonics\n', './IEEE-TMP/33': 'http://ieeexplore.ieee.org/document/8346057/\nEnd-to-End_Multi-Core_Fibre_Transmission_Link_Enabled_by_Silicon_Photonics_Transceiver_with_Grating_Coupler_Array\n', './IEEE-TMP/35': 'http://ieeexplore.ieee.org/document/8346132/\nSoftware-Defined_Networking_Control_Plane_for_Seamless_Integration_of_Silicon_Photon

In [17]:
# Read back the list of article records stored in the pickle at `data_out`.
# NOTE: pickle should only be loaded from files you trust.
with open(data_out,'rb') as f:
    data = pickle.load(f)

In [18]:
# Number of objects in the loaded `data`.
len(data)

171

In [19]:
# Display the element at index 1 of `data`.
data[1]

{'full_text': 'Abstract: \nWe present an end-to-end multi-core fibre transmission link where the 8-core fibre and the corresponding 200 Gb/s silicon photonics transceiver chip were co-designed. We demonstrate equivalent performance between the MCF and a parallel SMF transmission links.\nContent:\nIntroduction\nToday, high bandwidth (BW) optical interconnects play an important role in improving the performance of large-scale data centres by reducing the bottleneck in the switch I/O BW and the front panel BW1. The optical channel density can be improved by replacing pluggable modules with on-board optics, and ASIC integrated optics are being intensively studied. At the same time, the space-division multiplexing (SDM) over the multi-core fibre (MCF) has been intensively studied for high-density/-capacity transmission[2], [3]. The use of MCF could improve the BW density of both pluggable modules and ASIC I/O interfaces. To fully leverage the density improvement by the MCF, a fully-integrat