# import

In [None]:
import os
import csv
import locale
import re
import pandas as pd
from datetime import datetime

In [None]:
# Cap the number of columns pandas prints for a DataFrame
# (display.max_rows controls the number of rows in the same way).
pd.options.display.max_columns = 30

# memo

In [None]:
# Load yahoo_fundamental.csv from the scrapy_project directory into a DataFrame.
# NOTE: the path is absolute and specific to one Windows machine.
df = pd.read_csv(r"C:\Users\Really\GitHub\Stockyard\scrapy_project\yahoo_fundamental.csv")
df  # display the loaded frame

In [None]:
# Order the rows by date, then stock code, then the 'get' column
# (presumably the retrieval timestamp -- confirm against the CSV).
df = df.sort_values(by=['date', 'code', 'get'])
df

In [None]:
# Keep only the last row of every (date, code) pair, then renumber the
# index from zero, discarding the old index values.
df = (
    df.drop_duplicates(subset=['date', 'code'], keep='last')
    .reset_index(drop=True)
)
df

In [None]:
# Write the frame to test.csv in the current working directory, without the index column.
df.to_csv('test.csv', index=False)

In [None]:
# Read test.csv back in to inspect what was written.
test = pd.read_csv('test.csv')
test  # display the re-loaded frame

In [None]:
# Sample record used for experimenting with value clean-up; note that the
# EPS and BPS values start with a newline character.
d = {
    'code': '1380',
    '銘柄名': '(株)秋川牧園',
    'PER': '(連) 34.77',
    'PBR': '(連) 1.91',
    'EPS': '\n(連) 20.39',
    'BPS': '\n(連) 370.29',
}
d

In [None]:
# Remove every newline character from each value, updating d in place.
# The pattern is a plain literal, so str.replace does the same job as a regex.
for key in d:
    d[key] = d[key].replace('\n', '')
d

In [None]:
# Current local time (naive, no timezone) formatted as YYYY/MM/DD HH:MM:SS.
datetime.now().strftime("%Y/%m/%d %H:%M:%S")

# 新規プロジェクトの生成

In [None]:
# Shell escape: create a new Scrapy project named scrapy_project.
!scrapy startproject scrapy_project

In [None]:
# Shell escape: list the directory tree of the scrapy_project directory.
!tree scrapy_project

# プロジェクトのディレクトリに移動、新規spiderの生成

In [None]:
# Show the current working directory.
os.getcwd()

In [None]:
# Switch into the Scrapy project directory (absolute, machine-specific path).
# Alternative location kept for reference:
# os.chdir('/Users/Really/Stockyard/scrapy_project')
os.chdir(r"C:\Users\Really\GitHub\Stockyard\scrapy_project")
os.getcwd()  # confirm the new working directory

In [None]:
# Shell escape: generate a spider named yahoo_stock.
# NOTE(review): the last argument includes a URL path; Spider.allowed_domains
# expects bare domain names, so the generated value may need editing -- confirm.
!scrapy genspider yahoo_stock stocks.finance.yahoo.co.jp/stocks/qi/

# 個々のファイルの編集

In [None]:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyProjectItem(scrapy.Item):
    """Placeholder item that declares no fields."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class yahoo_fundamental(scrapy.Item):
    """
    Item holding the indicators of a single stock.
    """

    code = scrapy.Field()  # stock code
    symbol_name = scrapy.Field()  # stock (company) name
    per = scrapy.Field()  # PER
    pbr = scrapy.Field()  # PBR
    eps = scrapy.Field()  # EPS
    bps = scrapy.Field()  # BPS


In [None]:
# yahoo_stock.py

In [None]:
# -*- coding: utf-8 -*-
# エンコーディング宣言は Python2 用なので削除してもよい

import scrapy

from scrapy_project.items import yahoo_fundamental


class YahooStockSpider(scrapy.Spider):
    """Scrape stock indicators by following the links on the stock list page.

    ``parse`` follows every stock link found on the start page, and
    ``parse_fundamental`` turns each stock page into a ``yahoo_fundamental``
    item.
    """

    name = 'yahoo_stock'
    # allowed_domains takes bare domain names. An entry that contains a URL
    # path never matches a request's host, so every followed link would be
    # dropped as off-site unless the request bypassed filtering.
    allowed_domains = ['stocks.finance.yahoo.co.jp']
    start_urls = (
        'http://stocks.finance.yahoo.co.jp/stocks/qi/',
    )

    def parse(self, response):
        """
        Extract the link to each stock from the stock list page and follow it.

        Yields one ``scrapy.Request`` per link, handled by
        ``parse_fundamental``.
        """
        # Stock links sit at:
        # listTable > table > tbody > tr:nth-child(2) > td.center.yjM > a
        for url in response.css('td.center.yjM a::attr("href")').extract():
            # With a valid allowed_domains the request no longer needs
            # dont_filter=True, so duplicate links are de-duplicated again.
            yield scrapy.Request(response.urljoin(url), self.parse_fundamental)

    def parse_fundamental(self, response):
        """
        Build a ``yahoo_fundamental`` item from a single stock page.

        Yields the item, or nothing (after logging a warning) when the page
        has fewer indicator values than expected.
        """
        # Evaluate the indicator selector once instead of once per field.
        indicators = response.css('#rfindex strong').xpath('string()').extract()
        if len(indicators) < 8:
            # PER, PBR, EPS and BPS are read from positions 4-7; skip pages
            # that lack them rather than failing with an IndexError.
            self.logger.warning(
                'Expected at least 8 indicator values on %s, found %d; skipping',
                response.url, len(indicators),
            )
            return

        item = yahoo_fundamental()  # create the item to fill in
        item['code'] = response.css('#stockinf dt').xpath('string()').extract_first()  # stock code
        item['symbol_name'] = response.css('.symbol').xpath('string()').extract_first()  # stock name
        item['per'], item['pbr'], item['eps'], item['bps'] = indicators[4:8]
        yield item  # yield the item so Scrapy collects the extracted data

In [None]:
%%writefile yahoo_stock_crawl.py

# -*- coding: utf-8 -*-
# エンコーディング宣言は Python2 用なので削除してもよい

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from scrapy_project.items import yahoo_fundamental


class yahoo_stock_crawl_spider(CrawlSpider):
    """Crawl the paginated stock list and scrape indicators from each stock page.

    Link-following is driven by ``rules``; every stock detail page is handed
    to ``parse_fundamental``.
    """

    name = 'yahoo_stock_crawl'
    allowed_domains = ['stocks.finance.yahoo.co.jp']
    start_urls = (
        'http://stocks.finance.yahoo.co.jp/stocks/qi/',
    )

    # Rules that decide which links are followed.
    # Example detail URL: https://stocks.finance.yahoo.co.jp/stocks/detail/?code=1301
    rules = (
        # Experimentally limited to list pages 1-9. Changing the trailing
        # \d$ to \d+$ should also follow page 10 and beyond.
        Rule(LinkExtractor(allow=r'/stocks/qi/\?&p=\d$')),
        Rule(LinkExtractor(allow=r'/stocks/detail/\?code=\d+$'), callback='parse_fundamental'),
    )

    def parse_fundamental(self, response):
        """
        Build a ``yahoo_fundamental`` item from a single stock page.

        Yields the item, or nothing (after logging a warning) when the page
        has fewer indicator values than expected.
        """
        # Evaluate the indicator selector once instead of once per field.
        indicators = response.css('#rfindex strong').xpath('string()').extract()
        if len(indicators) < 8:
            # PER, PBR, EPS and BPS are read from positions 4-7; skip pages
            # that lack them rather than failing with an IndexError.
            self.logger.warning(
                'Expected at least 8 indicator values on %s, found %d; skipping',
                response.url, len(indicators),
            )
            return

        item = yahoo_fundamental()  # create the item to fill in
        item['code'] = response.css('#stockinf dt').xpath('string()').extract_first()  # stock code
        item['symbol_name'] = response.css('.symbol').xpath('string()').extract_first()  # stock name
        item['per'], item['pbr'], item['eps'], item['bps'] = indicators[4:8]
        yield item  # yield the item so Scrapy collects the extracted data