##### From the file "crawler4piotroski_fscore_selenium"

=> flipping stock codes

In [1]:
# importing packages 
from datetime import datetime

import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# sqlalchemy packages 
from sqlalchemy import create_engine
from sqlalchemy import Table, Column, Integer, Numeric, String, DateTime, ForeignKey
from sqlalchemy.orm import sessionmaker, relationship, backref
from sqlalchemy.ext.declarative import declarative_base 

In [2]:
# Headless Chrome
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Chrome's --window-size switch takes "width,height" (comma-separated);
# the earlier "1920x1080" spelling is not the documented format.
options.add_argument('--window-size=1920,1080')

In [3]:
# Start Chrome with Selenium
# Location of the chromedriver binary on this machine.
CHROMEDRIVER_PATH = '/Users/daesikkim/Downloads/chromedriver'

driver = webdriver.Chrome(CHROMEDRIVER_PATH, chrome_options=options)
# Session-wide implicit wait (seconds) applied to element lookups.
driver.implicitly_wait(3)

In [4]:
# Get the stock data from DB(postgresql)
# Connection URL: user "daesik" with an empty password, host localhost, database "db_piotroski".
db_string = "postgresql://daesik:@localhost/db_piotroski"

# echo=True makes the engine log every SQL statement it emits (the INFO lines shown in the cell outputs).
engine = create_engine(db_string, echo=True)

In [5]:
# Build a session factory bound to the engine, then open the single session used by the queries in this notebook.
Session = sessionmaker(bind=engine)
session = Session()

In [6]:
# Declarative base class; every mapped class below inherits from it and registers its table in Base.metadata.
Base = declarative_base()

In [7]:
# Mapping 
class Stock(Base):
    """ORM mapping for the ``stocks`` table (one row per stock code / company).

    Related rows are exposed through ``base_param`` (``BaseParam`` rows) and
    ``bookmarket_param`` (``BookMarketParam`` rows); both child tables point
    back here through a foreign key on ``stocks.stock_code``.
    """
    __tablename__ = 'stocks'

    # NOTE(review): stock_id and stock_code are both flagged primary_key, so
    # SQLAlchemy maps a composite primary key -- confirm this matches the
    # existing table before changing either flag.
    stock_id = Column(Integer, primary_key=True)
    stock_code = Column(String, unique=True, nullable=False, primary_key=True)
    company = Column(String, index=True, unique=True, nullable=False)
    market_type = Column(Integer)
    industry = Column(String)
    created_on = Column(DateTime, default=datetime.utcnow)
    # `default` only fires on INSERT; `onupdate` refreshes the timestamp on
    # every UPDATE so the column actually tracks the last modification.
    updated_on = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    base_param = relationship('BaseParam', back_populates='stock')
    bookmarket_param = relationship('BookMarketParam', back_populates='stock')

    def __repr__(self):
        """Return a one-line summary of the row for interactive display."""
        template = ("<Stock ==> id : {0}, code : {1}, company : {2}, "
                    "market_type : {3}, created : {4}, updated : {5}>")
        return template.format(self.stock_id,
                               self.stock_code,
                               self.company,
                               self.market_type,
                               self.created_on,
                               self.updated_on)
    

In [8]:
class BaseParam(Base): 
    """ORM mapping for the ``base_params`` table.

    Each row references a ``Stock`` through ``stock_code`` and is reachable
    from the stock side as ``Stock.base_param``.
    """
    __tablename__ = 'base_params'
    
    # Surrogate integer primary key.
    baseparam_id = Column(Integer, primary_key=True)
    # Foreign key to stocks.stock_code.
    stock_code = Column(String, ForeignKey('stocks.stock_code'))
    date = Column(DateTime)
    # Presumably the open/close/high/low prices for `date` -- TODO confirm against the crawler that fills this table.
    price_open = Column(Integer)
    price_close = Column(Integer) 
    price_high = Column(Integer)
    price_low = Column(Integer)
    # Presumably traded volume -- TODO confirm.
    quant = Column(Integer)
    # Presumably market capitalisation; the unit is not visible here -- TODO confirm it fits an Integer column.
    market_sum = Column(Integer)
    
    # Counterpart of Stock.base_param.
    stock = relationship("Stock", back_populates="base_param")
    

In [9]:
class BookMarketParam(Base): 
    """ORM mapping for the ``bookmarket_params`` table.

    Each row references a ``Stock`` through ``stock_code`` and is reachable
    from the stock side as ``Stock.bookmarket_param``.
    """
    __tablename__ = 'bookmarket_params'
    
    # Surrogate integer primary key.
    bookmarketparam_id = Column(Integer, primary_key=True)
    # Foreign key to stocks.stock_code.
    stock_code = Column(String, ForeignKey('stocks.stock_code'))
    listed_stock_cnt = Column(Integer)
    property_total = Column(Integer)
    debt_total = Column(Integer)
    # NOTE(review): if this is a price-to-book ratio, an Integer column will drop the fractional part -- confirm the intended type.
    pbr = Column(Integer)
    
    # Counterpart of Stock.bookmarket_param.
    stock = relationship('Stock', back_populates='bookmarket_param')

In [10]:
# Issue CREATE TABLE for every mapped table that does not exist yet; existing tables are left untouched.
Base.metadata.create_all(engine)

2017-10-12 20:12:42,271 INFO sqlalchemy.engine.base.Engine select version()
2017-10-12 20:12:42,272 INFO sqlalchemy.engine.base.Engine {}
2017-10-12 20:12:42,277 INFO sqlalchemy.engine.base.Engine select current_schema()
2017-10-12 20:12:42,278 INFO sqlalchemy.engine.base.Engine {}
2017-10-12 20:12:42,283 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2017-10-12 20:12:42,285 INFO sqlalchemy.engine.base.Engine {}
2017-10-12 20:12:42,293 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2017-10-12 20:12:42,294 INFO sqlalchemy.engine.base.Engine {}
2017-10-12 20:12:42,296 INFO sqlalchemy.engine.base.Engine show standard_conforming_strings
2017-10-12 20:12:42,297 INFO sqlalchemy.engine.base.Engine {}
2017-10-12 20:12:42,301 INFO sqlalchemy.engine.base.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
20

In [11]:
# Fetch every (stock_code, company) pair from the stocks table as a list of row tuples.
code_n_name = session.query(Stock.stock_code, Stock.company).all()

2017-10-12 20:12:43,184 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2017-10-12 20:12:43,186 INFO sqlalchemy.engine.base.Engine SELECT stocks.stock_code AS stocks_stock_code, stocks.company AS stocks_company 
FROM stocks
2017-10-12 20:12:43,188 INFO sqlalchemy.engine.base.Engine {}


In [12]:
# Display the fetched (stock_code, company) pairs.
code_n_name

[('005930', '삼성전자'),
 ('000660', 'SK하이닉스'),
 ('005935', '삼성전자우'),
 ('005380', '현대차'),
 ('005490', 'POSCO'),
 ('051910', 'LG화학'),
 ('035420', 'NAVER'),
 ('028260', '삼성물산'),
 ('015760', '한국전력'),
 ('055550', '신한지주'),
 ('105560', 'KB금융'),
 ('032830', '삼성생명'),
 ('207940', '삼성바이오로직스'),
 ('012330', '현대모비스'),
 ('017670', 'SK텔레콤'),
 ('034730', 'SK'),
 ('096770', 'SK이노베이션'),
 ('006400', '삼성SDI'),
 ('090430', '아모레퍼시픽'),
 ('033780', 'KT&amp;G'),
 ('066570', 'LG전자'),
 ('086790', '하나금융지주'),
 ('003550', 'LG'),
 ('051900', 'LG생활건강'),
 ('010950', 'S-Oil'),
 ('251270', '넷마블게임즈'),
 ('018260', '삼성에스디에스'),
 ('011170', '롯데케미칼'),
 ('000810', '삼성화재'),
 ('000270', '기아차'),
 ('000030', '우리은행'),
 ('034220', 'LG디스플레이'),
 ('002790', '아모레G'),
 ('036570', '엔씨소프트'),
 ('035720', '카카오'),
 ('010130', '고려아연'),
 ('009540', '현대중공업'),
 ('024110', '기업은행'),
 ('009150', '삼성전기'),
 ('030200', 'KT'),
 ('035250', '강원랜드'),
 ('161390', '한국타이어'),
 ('021240', '코웨이'),
 ('004020', '현대제철'),
 ('267250', '현대로보틱스'),
 ('006800', '미래에셋대우'),
 (

In [17]:
# A function to crawl a balance sheet
def crawl_balsheet(code):
    """Scrape selected balance-sheet rows from the page currently open in ``driver``.

    Clicks the ``#rpt_tab2`` tab, parses the rendered page source and reads
    the ``td.num`` cells of six rows addressed by their fixed position in the
    ``#table-content`` table.

    Parameters
    ----------
    code : str
        Stock code. Not used by the body: the function does not navigate, so
        the caller must already have the company's page loaded in ``driver``.

    Returns
    -------
    list of list of float
        Six lists, in the order total_asset, lt_debt, lt_borrowing,
        current_asset, current_liabilities, shareholder_equity. Each list
        holds at most the first five ``td.num`` values of its row (empty when
        the row is not found). Blank cells are returned as 0.0.

    Raises
    ------
    ValueError
        If a non-blank cell does not contain a parseable number.
    """
    def _to_float(cell):
        # Numbers use ',' as thousands separator. Blank cells contain only a
        # non-breaking space, which strip=True reduces to '' -> counted as 0.
        text = cell.get_text(strip=True).replace(',', '')
        return float(text) if text else 0.0

    # click Bal Sheet Tab -- click() must actually be called; the bare
    # attribute access used before never switched the tab.
    driver.find_element(By.CSS_SELECTOR, "#rpt_tab2").click()
    # NOTE(review): if the tab content loads asynchronously, an explicit wait
    # may be needed before reading page_source.

    # getting the page source and making the soup!
    soup_balsheet = BeautifulSoup(driver.page_source, 'lxml')

    # getting values from "Balance Sheet": (label, 1-based row position)
    row_selector = '#table-content > table > tbody > tr:nth-of-type({}) > td.num'
    rows = (
        ('total_asset', 1),
        ('lt_debt', 152),
        ('lt_borrowing', 158),
        ('current_asset', 2),
        ('current_liabilities', 110),
        ('shareholder_equity', 188),
    )

    # convert string --> float, keeping only the first five columns per row
    balsheet_vals = [
        [_to_float(cell) for cell in soup_balsheet.select(row_selector.format(position))[:5]]
        for _label, position in rows
    ]

    return balsheet_vals

In [19]:
# A function to crawl an income statement 
def crawl_incomestate(code):
    """Scrape selected income-statement rows from the page currently open in ``driver``.

    Clicks the ``#rpt_tab1`` tab, parses the rendered page source and reads
    the ``td.num`` cells of five rows addressed by their fixed position in
    the ``#table-content`` table.

    Parameters
    ----------
    code : str
        Stock code. Not used by the body: the function does not navigate, so
        the caller must already have the company's page loaded in ``driver``.

    Returns
    -------
    list of list of float
        Five lists, in the order operating_profit, extra_income, total_sales,
        gross_profit, cogs. Each list holds at most the first five ``td.num``
        values of its row (empty when the row is not found). Blank cells are
        returned as 0.0.

    Raises
    ------
    ValueError
        If a non-blank cell does not contain a parseable number.
    """
    def _to_float(cell):
        # Numbers use ',' as thousands separator. Blank cells contain only a
        # non-breaking space, which strip=True reduces to '' -> counted as 0.
        text = cell.get_text(strip=True).replace(',', '')
        return float(text) if text else 0.0

    # click Income Statement Tab -- click() must actually be called; the bare
    # attribute access used before never switched the tab.
    driver.find_element(By.CSS_SELECTOR, "#rpt_tab1").click()
    # NOTE(review): if the tab content loads asynchronously, an explicit wait
    # may be needed before reading page_source.

    # getting the page source and making the soup
    soup_incomestate = BeautifulSoup(driver.page_source, 'lxml')

    # getting values from "Income Statement": (label, 1-based row position)
    row_selector = '#table-content > table > tbody > tr:nth-of-type({}) > td.num'
    rows = (
        ('operating_profit', 59),
        ('extra_income', 144),
        ('total_sales', 1),
        ('gross_profit', 26),
        ('cogs', 15),
    )

    # convert string --> float, keeping only the first five columns per row
    incomestate_vals = [
        [_to_float(cell) for cell in soup_incomestate.select(row_selector.format(position))[:5]]
        for _label, position in rows
    ]

    return incomestate_vals

In [20]:
# A function to crawl a cash flow 
def crawl_cashflow(code):
    """Scrape the first cash-flow row from the page currently open in ``driver``.

    Clicks the ``#rpt_tab3`` tab, parses the rendered page source and reads
    the ``td.num`` cells of the first row of the ``#table-content`` table.

    Parameters
    ----------
    code : str
        Stock code. Not used by the body: the function does not navigate, so
        the caller must already have the company's page loaded in ``driver``.

    Returns
    -------
    list of list of float
        A one-element list holding the cf_operation row: at most the first
        five ``td.num`` values (empty when the row is not found). Blank cells
        are returned as 0.0.

    Raises
    ------
    ValueError
        If a non-blank cell does not contain a parseable number.
    """
    def _to_float(cell):
        # Numbers use ',' as thousands separator. Blank cells contain only a
        # non-breaking space, which strip=True reduces to '' -> counted as 0.
        text = cell.get_text(strip=True).replace(',', '')
        return float(text) if text else 0.0

    # click Cash Flow tab -- click() must actually be called; the bare
    # attribute access used before never switched the tab.
    driver.find_element(By.CSS_SELECTOR, "#rpt_tab3").click()
    # NOTE(review): implicitly_wait does not pause here; it resets the
    # session-wide element-lookup timeout to 1 second. Kept as in the original
    # -- confirm whether an explicit wait for the tab content was intended.
    driver.implicitly_wait(1)

    # getting the page source and making the soup
    soup_cashflow = BeautifulSoup(driver.page_source, 'lxml')

    # getting values from "Cash Flow": first row of the table
    cf_operation = soup_cashflow.select('#table-content > table > tbody > tr:nth-of-type(1) > td.num')

    # convert string --> float, keeping only the first five columns
    cashflow_vals = [[_to_float(cell) for cell in cf_operation[:5]]]

    return cashflow_vals

In [None]:
# One Big Loop through all company codes
for i in codes: 
    
# load the page! 
# move to the relevant frame 
# select the financial statement tab