# Raw Data Collector
## 1. Import Modules

In [1]:
from http.client import HTTPConnection, HTTPSConnection
import urllib.parse
import re
import datetime
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from DB_Setup import Base, FullMarketDailyTrade, SingleStockDaily, StockNameID
import datetime

## 2. Connection with Database

In [2]:
# Engine for the SQLite database file 'taiwanstock.db' (relative path).
engine = create_engine('sqlite:///taiwanstock.db')
# Bind the engine to the metadata of the Base class so that the
# declaratives can be accessed through a DBSession instance
Base.metadata.bind = engine
# Session factory bound to the engine created above.
DBSession = sessionmaker(bind=engine)
# A DBSession() instance establishes all conversations with the database
# and represents a "staging zone" for all the objects loaded into the
# database session object. Any change made against the objects in the
# session won't be persisted into the database until you call
# session.commit(). If you're not happy about the changes, you can
# revert all of them back to the last commit by calling
# session.rollback()
# `session` is the single module-level session that createFullMarket(),
# createSingleStock() and the query cells below all share.
session = DBSession()

## 3. Functions 
### a. Internet Data Query
* Uses the HTTP POST method to fetch data from the internet

In [3]:
def QueryDataPost(urlDomain, urlReq, bodyParams, headers=None, httpType='http'):
    """Send an HTTP(S) POST request and return the response body as text.

    Args:
        urlDomain: Host to connect to, as accepted by ``HTTPConnection``.
        urlReq: Request path on that host.
        bodyParams: Request body. It is sent as-is, so form data must
            already be URL-encoded by the caller.
        headers: Optional mapping of request headers. ``None`` (the
            default) sends no extra headers.
        httpType: ``'http'`` or ``'https'``.

    Returns:
        The response body decoded as UTF-8; undecodable bytes are dropped.

    Raises:
        ValueError: If ``httpType`` is neither ``'http'`` nor ``'https'``.
    """
    # Compare by value: `is` on strings only works when both happen to be
    # the same interned object, which is not guaranteed for computed values.
    if httpType == 'http':
        conn = HTTPConnection(urlDomain)
    elif httpType == 'https':
        conn = HTTPSConnection(urlDomain)
    else:
        raise ValueError("httpType must be 'http' or 'https', got %r" % (httpType,))

    requestHeaders = {} if headers is None else headers
    try:
        conn.request('POST', urlReq, bodyParams, requestHeaders)
        connResp = conn.getresponse()
        return connResp.read().decode('utf-8', 'ignore')
    finally:
        # Always release the socket, including when the request fails.
        conn.close()

### b. Easy List Printer

In [4]:
def printList(dataList):
    """Print every element of ``dataList`` on its own line."""
    for entry in dataList:
        print(entry)

### c. Daily Stock Data Extractor

In [5]:
def stockDayDataExtract(stockDataList):
    """Parse the daily data rows out of a downloaded CSV given as lines.

    Only lines that start with a quoted, date-like first field are kept;
    title and header lines are dropped. Each kept line is split on ``'","'``
    and every field is stripped of quotes, thousands separators, spaces and
    surrounding whitespace (e.g. a trailing ``'\\r'``), then converted:

    * integer text (optionally signed) -> ``int``
    * decimal text (optionally signed) -> ``float``
    * ``y/m/d`` -> ``datetime.date`` (the year is used exactly as given)
    * ``'X0.00'`` -> ``0``
    * anything else -> the cleaned string

    Args:
        stockDataList: Iterable of text lines.

    Returns:
        A list of rows, each row being a list of converted fields.

    Raises:
        ValueError: If a ``y/m/d`` field is not a valid calendar date.
    """
    rowRe = re.compile(r'^"\s*\d+/*\d+/*\d+"')
    intRe = re.compile(r'[+-]?\d+')
    floatRe = re.compile(r'[+-]?\d+\.\d+')
    dateRe = re.compile(r'\d+/\d+/\d+')
    specialData = 'X0.00'

    def convertField(rawField):
        # Remove CSV decoration first so the patterns see the bare value.
        text = rawField.replace('"', '').replace(',', '').replace(' ', '').strip()
        # fullmatch (not match) so a value with trailing junk is left as a
        # string instead of making int()/float() raise.
        if intRe.fullmatch(text):
            return int(text)
        if floatRe.fullmatch(text):
            return float(text)
        if dateRe.fullmatch(text):
            y, m, d = (int(part) for part in text.split('/'))
            return datetime.date(y, m, d)
        if text == specialData:
            return 0
        return text

    return [
        [convertField(field) for field in line.split('","')]
        for line in stockDataList
        if rowRe.match(line)
    ]

### d. Add StockID for SingleStock

In [6]:
def addStockID(data, ID):
    """Append ``ID`` to every row of ``data`` and return the rows in a new list.

    The rows themselves are modified in place; only the outer list is new.
    """
    def tagRow(row):
        row.append(ID)
        return row

    return [tagRow(row) for row in data]

### e. HTTP POST Body Parameters of SingleStock

In [7]:
def singleStockDayParams(year, month, stockNo):
    """Build the POST body that requests one stock's daily CSV for a month."""
    template = 'download=csv&query_year=%d&query_month=%d&CO_ID=%s'
    values = (year, month, stockNo)
    return template % values

### f. HTTP POST Body Parameters of FullMarket

In [8]:
def fulMarketTradeParams(year, month):
    """Build the POST body that requests the full-market CSV for a month."""
    template = 'download=csv&query_year=%d&query_month=%d'
    values = (year, month)
    return template % values
    

### g. Raw Data to Database of FullMarket
* tradeVolume is stored in thousands (raw value divided by 1000)
* tradeValue is stored in thousands (raw value divided by 1000)
* Bypass the empty data

In [9]:
def createFullMarket(data):
    """Store one parsed full-market row through the module-level ``session``.

    Args:
        data: One row laid out as ``[tradeDate, tradeVolume, tradeValue,
            transaction, TAIEX, change]``.

    Returns:
        None. Rows containing a zero in positions 1-4 are treated as empty
        and skipped. A database failure is reported on stdout and the row is
        dropped, so a bulk load can carry on with the next row.
    """
    # Bypass empty data: any zero among volume/value/transaction/TAIEX.
    if 0 in data[1:5]:
        return

    ratio = 1000  # volume and value are stored in thousands
    stockDay = FullMarketDailyTrade(tradeDate=data[0], tradeVolume=data[1] / ratio,
                                    tradeValue=data[2] / ratio, transaction=data[3],
                                    TAIEX=data[4], change=data[5])
    try:
        session.add(stockDay)
        session.commit()
    except Exception as err:
        # A failed commit leaves the session unusable until it is rolled
        # back; without this, every later row would fail as well.
        session.rollback()
        print('Error occurs in this item:')
        print(data)
        print(err)

### h. Raw Data to Database of SingleStock
* tradeVolume is stored in thousands (raw value divided by 1000)
* tradeValue is stored in thousands (raw value divided by 1000)
* Bypass the empty data

In [10]:
def createSingleStock(data):
    """Store one parsed single-stock row through the module-level ``session``.

    Args:
        data: One row laid out as ``[tradeDate, tradeVolume, tradeValue,
            openingprice, highestprice, lowestprice, closingprice, change,
            transaction, stockid]``.

    Returns:
        None. Rows containing a zero in positions 1-6 are treated as empty
        and skipped. A database failure is reported on stdout and the row is
        dropped, so a bulk load can carry on with the next row.
    """
    # Bypass empty data: any zero among volume/value/open/high/low/close.
    if 0 in data[1:7]:
        return

    ratio = 1000  # volume and value are stored in thousands
    stockDay = SingleStockDaily(tradeDate=data[0], tradeVolume=data[1] / ratio,
                                tradeValue=data[2] / ratio,
                                openingprice=data[3], highestprice=data[4],
                                lowestprice=data[5], closingtprice=data[6],
                                change=data[7], transaction=data[8], stockid=data[9])
    try:
        session.add(stockDay)
        session.commit()
    except Exception as err:
        # A failed commit leaves the session unusable until it is rolled
        # back; without this, every later row would fail as well.
        session.rollback()
        print('Error occurs in this item:')
        print(data)
        print(err)

## 4. Variables

In [11]:
# ---- Query settings: edit these to choose what to download ----
stockNo = '2454'
year = 2016
month = 12

# ---- Fixed settings ----
tesDomain = 'www.tse.com.tw'
downloadType = 'csv'
headers = {
    "Content-type": "application/x-www-form-urlencoded",
    "Accept": "text/plain",
}

# Column titles, in the order the columns appear in each parsed row.
fulMarketTradeTitle = [
    "Date",
    "Trade Volume",
    "Trade Value",
    "Transaction",
    "TAIEX",
    "Change",
]
singleStockDayTitle = [
    "Date",
    "Trade Volume",
    "Trade Value",
    "Opening Price",
    "Highest Price",
    "Lowest Price",
    "Closing Price",
    "Change",
    "Transaction",
]

# Request paths on tesDomain. The POST bodies are built per request by
# fulMarketTradeParams() and singleStockDayParams().
fulMarketTradeAPI = '/en/trading/exchange/FMTQIK/FMTQIK.php'
singleStocDaykAPI = '/en/trading/exchange/STOCK_DAY/STOCK_DAYMAIN.php'


# %%%Code Tester%%%

In [10]:
# Smoke test: print the stockid of the first 20 StockNameID rows returned
# by the query.
stocknamdidTable = session.query(StockNameID)[0:20]
for stock in stocknamdidTable:
    print(stock.stockid)

0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0061
006203
006204
006205
006206
006207
006208
00625K
00631L
00632R


In [12]:
# Fetch and print the parsed monthly data for the first StockNameID row
# only ([0:1]). Nothing is written to the database in this cell.
stocknamdidTable = session.query(StockNameID)[0:1]
startYear = 2016
stopYear = 2018
for stock in stocknamdidTable:
    # NOTE: this rebinds the module-level stockNo; cells run afterwards
    # that read stockNo will see this value.
    stockNo = stock.stockid
    #yearlySingleStock = list()
    # range() excludes stopYear, so the last year queried is stopYear - 1.
    for year in range(startYear,stopYear):
        for month in range(1,13):
            theStock = QueryDataPost(tesDomain, singleStocDaykAPI, singleStockDayParams(year, month, stockNo), headers)
            theStockList = theStock.split('\n')
            theStockData = stockDayDataExtract(theStockList)
            newData = addStockID(theStockData, stockNo)
            print(newData)
            #yearlySingleStock = yearlySingleStock + newData
            # NOTE(review): newData is a list of rows while createSingleStock()
            # indexes a single row, so re-enabling this line as written would
            # need a per-row loop.
            #createSingleStock(newData)
    # NOTE(review): printed even though the storing call above is commented out.
    print('%s\'s data from %d to %d is stored in database' % (stockNo, startYear, stopYear))

[[datetime.date(2016, 1, 4), 16928063, 1009176452, 60.8, 60.95, 59.05, 59.55, -1.2, 5730, '0050'], [datetime.date(2016, 1, 5), 21783969, 1291792220, 59.5, 59.75, 58.85, 59.05, -0.5, 6708, '0050'], [datetime.date(2016, 1, 6), 28939550, 1688039896, 59.05, 59.05, 57.9, 58.45, -0.6, 8406, '0050'], [datetime.date(2016, 1, 7), 39962301, 2289801488, 58.3, 58.3, 56.7, 57.35, -1.1, 11325, '0050'], [datetime.date(2016, 1, 8), 25450734, 1459152633, 57.0, 57.7, 56.85, 57.45, 0.1, 7024, '0050'], [datetime.date(2016, 1, 11), 24356269, 1375352256, 56.9, 56.95, 56.0, 56.55, -0.9, 7405, '0050'], [datetime.date(2016, 1, 12), 17649886, 998368462, 56.8, 57.0, 56.3, 56.5, -0.05, 5238, '0050'], [datetime.date(2016, 1, 13), 11848432, 677682867, 57.0, 57.5, 56.75, 57.15, 0.65, 5664, '0050'], [datetime.date(2016, 1, 14), 15487360, 871255829, 56.5, 56.7, 55.85, 56.7, -0.45, 5765, '0050'], [datetime.date(2016, 1, 15), 20847878, 1181841700, 57.1, 57.5, 56.25, 56.7, 0.0, 4936, '0050'], [datetime.date(2016, 1, 18),

In [13]:
#stockData = QueryDataPost(tesDomain, fulMarketTradeAPI, fulMarketTradeParams, headers)
#stockDataList = stockData.split('\n')
# Collect the parsed daily rows of one stock for every month of 2015-2017
# into a single list. The stock queried is whatever the module-level stockNo
# currently holds (set in the variables cell, or rebound by an earlier cell).
yearlySingleStock = list()
for year in range(2015,2018):
    for month in range(1,13):
        MediatekStock = QueryDataPost(tesDomain, singleStocDaykAPI, singleStockDayParams(year, month, stockNo), headers)
        MediatekStockList = MediatekStock.split('\n')
        fullstock = stockDayDataExtract(MediatekStockList)
        # Tag each row with the stock ID, then add the month's rows to the total.
        newData = addStockID(fullstock, stockNo)
        yearlySingleStock = yearlySingleStock + newData
    #printList(newData)
#printList(yearlySingleStock)

    

In [14]:
# Store every collected single-stock row; createSingleStock() adds and
# commits one row per call.
for data in yearlySingleStock:
    createSingleStock(data)

In [15]:
# Collect the parsed full-market daily rows for every month of 2015-2017
# into a single list.
yearlyFullStock = list()
for year in range(2015,2018):
    for month in range(1,13):
        fullMarket = QueryDataPost(tesDomain, fulMarketTradeAPI, fulMarketTradeParams(year, month), headers)
        fullMarketList = fullMarket.split('\n')
        # NOTE: reuses the name `fullstock` from the single-stock cell above.
        fullstock = stockDayDataExtract(fullMarketList)
        yearlyFullStock = yearlyFullStock + fullstock

In [16]:
# Store every collected full-market row; createFullMarket() adds and
# commits one row per call.
for data in yearlyFullStock:
    createFullMarket(data)

# Appendix

## Test Code

In [47]:
# Scratch test: POST directly to the '/ch/' variant of the full-market
# endpoint and print the raw response text.
url_req = '/ch/trading/exchange/FMTQIK/FMTQIK.php'
params = urllib.parse.urlencode({'download':'csv', 'query_year':2017, 'query_month':2})
# NOTE: rebinds the module-level `headers` to an equal dict.
headers = {"Content-type": "application/x-www-form-urlencoded","Accept": "text/plain"}
conn = HTTPConnection('www.tse.com.tw')
# `web` is not used anywhere else in this cell.
web = conn.request('POST', url_req, params, headers)
abc = conn.getresponse()
# Undecodable bytes are dropped ('ignore'), which fits the garbled title
# and header text in the output captured below.
d = abc.read().decode('utf-8', 'ignore')
print(d)

106~02 T (A)
"","Ѽ","B","浧","oq[vѻ","^I"
" 106/02/02","5,119,534,614","147,930,853,630","1,131,546","9,428.97","-18.98",
" 106/02/03","5,068,371,711","102,335,722,986","1,056,855","9,455.56","26.59",
" 106/02/06","5,510,906,552","102,352,501,152","1,076,125","9,538.01","82.45",
" 106/02/07","4,932,873,379","95,709,593,373","988,506","9,554.56","16.55",
" 106/02/08","5,300,856,389","110,729,308,207","1,082,850","9,543.25","-11.31",
" 106/02/09","5,264,593,609","106,237,084,984","1,038,404","9,590.18","46.93",
" 106/02/10","6,240,806,015","136,811,826,048","1,209,286","9,665.59","75.41",
" 106/02/13","5,255,562,269","101,177,702,355","1,023,598","9,710.32","44.73",
" 106/02/14","5,531,133,656","113,724,121,218","1,072,534","9,718.78","8.46",
" 106/02/15","5,748,794,719","117,918,064,389","1,165,369","9,799.76","80.98",
" 106/02/16","5,167,818,860","108,303,272,962","1,033,581","9,771.25","-28.51",
" 106/02/17","4,358,105,024","95,178,870,089","916,682","9,759.76","-11.49",
" 106/02/18","

In [3]:
import urllib.request

In [8]:
# Scratch test: build and print the GET-style download URL for each
# (month, year) pair. Nothing is downloaded because the urlretrieve call
# is commented out.
fullmarket_api = 'http://www.tse.com.tw/ch/trading/exchange/FMTQIK/FMTQIK.php'
for month in range(1,3):
    for year in range(2017,2018):
        fullmarket_query = 'download=csv&query_year=%d&query_month=%d' % (year, month)
        fullmarket_url = fullmarket_api + '?' + fullmarket_query
        print(fullmarket_url)
        # Target file name, e.g. '20171.csv'; only used by the disabled line below.
        saveFile = str(year) + str(month) + '.csv'
        #urllib.request.urlretrieve(fullmarket_url, saveFile)

http://www.tse.com.tw/ch/trading/exchange/FMTQIK/FMTQIK.php?download=csv&query_year=2017&query_month=1
http://www.tse.com.tw/ch/trading/exchange/FMTQIK/FMTQIK.php?download=csv&query_year=2017&query_month=2


In [None]:
from http.client import HTTPConnection
import urllib.parse

In [62]:
# Scratch version of the row extraction done by stockDayDataExtract().
# Unlike that function, it adds 1911 to the year of every date field and
# has no 'X0.00' special case.
# NOTE(review): stockDataList is not defined in this cell; it is presumably
# left over from an earlier notebook run — confirm before re-running.
import re
dataRe = r'^"\s*\d+/\d+/\d+"'
floatRe = r'[+-]*\d+\.\d+'
dateRe = r'\d+/\d+/\d+'
dataList = list()
# Keep only the lines that start with a quoted y/m/d field.
for item in stockDataList:
    if re.match(dataRe,item):
        #print(item)
        dataList.append(item.split('","'))
# Clean and convert every field in place.
for i, items in enumerate(dataList):
    for j, data in enumerate(items):
        data = data.replace('"','')
        data = data.replace(',','')
        data = data.replace(' ','')
        if data.isdigit():
            data = int(data)
        elif re.match(floatRe, data):
            data = float(data)
        elif re.match(dateRe, data):
            # Year offset of 1911, e.g. 105 -> 2016.
            y = int(data.split('/')[0])+1911
            m = int(data.split('/')[1])
            d = int(data.split('/')[2])
            data = datetime.date(y, m, d)
        #print(data)
        dataList[i][j] = data
for items in dataList:
    print(items)

[datetime.date(2016, 1, 4), 3828317506, 77036676791, 823702, 8114.26, -223.8]
[datetime.date(2016, 1, 5), 4174711678, 80349248930, 835646, 8075.11, -39.15]
[datetime.date(2016, 1, 6), 4862380750, 99143623931, 944604, 7990.39, -84.72]
[datetime.date(2016, 1, 7), 5920713736, 117577039714, 1151598, 7852.06, -138.33]
[datetime.date(2016, 1, 8), 4534181205, 92853774729, 859799, 7893.97, 41.91]
[datetime.date(2016, 1, 11), 4324346686, 85321943005, 864273, 7788.42, -105.55]
[datetime.date(2016, 1, 12), 4774778088, 86552474918, 878969, 7768.45, -19.97]
[datetime.date(2016, 1, 13), 4191855350, 76588652185, 785161, 7824.61, 56.16]
[datetime.date(2016, 1, 14), 4276900082, 81779081768, 843642, 7742.88, -81.73]
[datetime.date(2016, 1, 15), 4228452132, 90430983043, 834866, 7762.01, 19.13]
[datetime.date(2016, 1, 18), 4556531786, 81693572616, 899532, 7811.18, 49.17]
[datetime.date(2016, 1, 19), 4401843015, 74328444326, 777581, 7854.88, 43.7]
[datetime.date(2016, 1, 20), 4995793415, 89909264962, 93414

In [30]:
# Demo: the result of replace() is discarded here, so `b` itself is
# unchanged and the original string is printed (see the output below).
b = 'aaaaaaaaa'
b.replace('a','b')
print(b)

aaaaaaaaa


In [61]:
# Demo: convert a 'yyy/mm/dd' string whose year needs a 1911 offset into a
# datetime.date.
import datetime
a = '105/01/04'
dateRe = r'\d+/\d+/\d+'
# The match result is not used; `re` must already be imported by an earlier cell.
re.match(dateRe,a)
y = int(a.split('/')[0])+1911
m = int(a.split('/')[1])
d = int(a.split('/')[2])
dateformat = datetime.date(y, m, d)

print(dateformat)


2016-01-04


2011-01-20
