In [1]:
import time
from functools import wraps, partial
import pandas as pd
import numpy as np
import json
from datetime import date,datetime,timedelta
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup as BS
import re
from collections import deque

import sched

import asyncio
import nest_asyncio
import motor.motor_asyncio
nest_asyncio.apply()

from apscheduler.triggers.combining import OrTrigger
from apscheduler.triggers.cron import CronTrigger
from apscheduler.schedulers.background import BackgroundScheduler

### use in jupyter Notebook
#from ipywidgets import interactive,IntSlider,FloatSlider,Dropdown,Button,fixed,HBox,VBox,Layout,Play,jslink
from IPython.display import display,clear_output,HTML
import ipywidgets as widgets
import functools
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Event loop that the synchronous wrappers below use (via run_until_complete)
# to drive the async fetch helpers.
loop = asyncio.get_event_loop()
# NOTE: this rebinds the name `sched`, shadowing the stdlib `sched` module
# imported in the first cell; from here on `sched` is the APScheduler instance.
sched = BackgroundScheduler()
# Output widget that is passed to the scheduled display job.
out = widgets.Output()

In [3]:
# Category (sector) index ids: "tse_t<digits>.tw" or "otc_o<digits>.tw".
# Group 1 is the exchange prefix, group 2 the numeric category id.
# The negative lookahead rejects only all-zero ids ("00" is the exchange-wide
# index handled by reTopMarket). The previous pattern "[0-9]*[1-9]" also
# rejected every id ending in 0, silently dropping categories such as
# t10 / t20 / t30 and o10 / o20 / o30 that are present in the requested lists.
reCategory = r"^(tse_t|otc_o)(?!0+\.tw$)([0-9]+)\.tw$"
# Exchange-wide headline index of each market (tse_t00 / otc_o00).
reTopMarket= r"^(tse_t00|otc_o00)\.tw$"

# Crawl-result buffers, one bounded deque per (exchange, trend) pair.
# Each deque keeps at most two entries -- [0]: fetch update, [1]: display.
g_collect_dq = {
    market: {direction: deque(maxlen=2) for direction in ('up', 'down')}
    for market in ('tse', 'otc')
}

# Job triggers for the scheduler.
# NOTE(review): the original note here read "time event 9:00 ~ 15:00", but
# neither trigger sets an hour or day-of-week field, so nothing below limits
# the jobs to that window.
# Crawler trigger: cron expression with minute='*/1' and second='30'.
crawler_trigger = OrTrigger([
   CronTrigger(minute='*/1', second='30')
])

# Display-refresh trigger.
# NOTE(review): in APScheduler cron syntax a step of '*/59' on the seconds
# field should match seconds 0 and 59, i.e. two firings one second apart in
# every minute -- confirm that this is what is wanted.
display_trigger = OrTrigger([
   CronTrigger(second='*/59')
])

# add to collect queue
def collect_to_deque(func):
    """Decorator that buffers a crawler's DataFrame and dumps it to CSV.

    The wrapped function must return an object with a ``to_csv`` method
    (a DataFrame) and must accept ``exchange`` and ``trend`` arguments.
    After each call the wrapper:

    * appends ``(df, iso_timestamp)`` to ``g_collect_dq[exchange][trend]``
      (a bounded deque, so the oldest entry is evicted automatically);
    * writes the frame to
      ``./rank_market/<exchange>/<today>_<exchange>_<trend>.csv`` (cp950),
      creating the directory when it does not exist yet.

    Returns:
        The ``(df, iso_timestamp)`` tuple that was buffered.

    Raises:
        KeyError: if ``exchange``/``trend`` cannot be resolved or are not
            keys of ``g_collect_dq``.
        Any exception raised by the wrapped function or by the CSV write
        propagates unchanged.
    """
    # Local imports keep this block self-contained.
    import inspect
    import os

    signature = inspect.signature(func)

    @wraps(func)
    def wrap_func(*args, **kwargs):
        # Resolve exchange/trend however they were supplied: by keyword,
        # positionally, or left to the wrapped function's defaults.
        bound = signature.bind(*args, **kwargs)
        bound.apply_defaults()
        call_args = dict(bound.arguments)
        call_args.update(kwargs)  # covers functions that only take **kwargs
        exchange = call_args['exchange']
        trend = call_args['trend']

        df = func(*args, **kwargs)

        # deque(maxlen=...) drops the oldest entry itself when full.
        item = (df, datetime.now().isoformat())
        g_collect_dq[exchange][trend].append(item)

        file = './rank_market/{}/{}_{}_{}.csv'.format(exchange,date.today().isoformat(), exchange, trend)
        os.makedirs(os.path.dirname(file), exist_ok=True)
        df.to_csv(file, encoding='cp950')
        return item
    return wrap_func

# add delay to crawler 
#def add_delay_to_crawler

In [4]:
# Scratch check: a maxlen=2 deque filled with two values.
a = deque(maxlen=2)
a.extend([1, 2])
len(a)
a[1]

2

In [5]:
#595.0000_596.0000_597.0000_598.0000_599.0000_
def split_bidask(bidask):
    """Return the text before the first '_' of a bid/ask string.

    For a value such as '595.0000_596.0000_597.0000_' this yields
    '595.0000'. A string without any '_' is returned unchanged.
    """
    first_level, _sep, _rest = bidask.partition('_')
    return first_level

async def _quote_category_market(URL):
    """Fetch quotes for a batch of symbols and compute their percent change.

    Args:
        URL: Quote endpoint URL whose JSON response has a 'msgArray' list.

    Returns:
        DataFrame with columns n, o, h, l, y, a, b and diff, rounded to two
        decimals. Rows in which any of those fields matches the '-' pattern
        are dropped. 'a' and 'b' are reduced to their first '_'-separated
        entry; 'diff' is the percent distance of the (a + b) / 2 midpoint
        from 'y'.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx HTTP status.
        KeyError / ValueError: if the payload lacks the expected fields or a
            value cannot be converted to float.

    Note:
        ``requests`` is blocking and nothing is awaited here, so the event
        loop is held for the whole request.
    """
    # Bound the wait: without a timeout a stalled connection would block the
    # calling (scheduler) thread indefinitely.
    rsp = requests.get(URL, timeout=10)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    rsp.raise_for_status()
    data = json.loads(rsp.text)
    df = pd.DataFrame(data['msgArray'])
    df = df[['n', 'o', 'h', 'l', 'y', 'a', 'b']].replace(r'-', np.nan, regex=True).dropna()
    # Keep only the first entry of each '_'-separated bid/ask string.
    df['a'] = df['a'].apply(split_bidask)
    df['b'] = df['b'].apply(split_bidask)
    for col in ['o', 'h', 'l', 'y', 'a', 'b']:
        df[col] = df[col].astype(float)
    df['diff'] = ((df['a']+df['b'])/2 - df['y']) / df['y'] * 100
    df = df.round(2)
    return df

async def _quote_category_ids(URL):
    """Fetch the symbol ids that belong to one category.

    Args:
        URL: Category endpoint URL whose JSON response has a 'msgArray'
            list with 'ex' and 'ch' fields.

    Returns:
        List of '<ex>_<ch>' strings, one per entry of 'msgArray'.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx HTTP status.
        KeyError: if 'msgArray', 'ex' or 'ch' is missing from the payload.

    Note:
        ``requests`` is blocking and nothing is awaited here, so the event
        loop is held for the whole request.
    """
    # Bound the wait so a stalled connection cannot hang the caller forever.
    rsp = requests.get(URL, timeout=10)
    rsp.raise_for_status()
    data = json.loads(rsp.text)
    df = pd.DataFrame(data['msgArray'])
    df['cat_id'] = df['ex'] + '_' + df['ch']
    return df['cat_id'].tolist()

In [6]:
async def _quote_top_market(URL):
    """Fetch index quotes and compute each index's percent change.

    Args:
        URL: Quote endpoint URL whose JSON response has a 'msgArray' list
            with 'n', 'y', 'z', 'ex' and 'ch' fields.

    Returns:
        DataFrame with columns n, y, z, quote_id and diff, rounded to two
        decimals. 'quote_id' is '<ex>_<ch>'; rows in which any kept field
        matches the '-' pattern are dropped; 'diff' is (z - y) / y * 100.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx HTTP status.
        KeyError / ValueError: if the payload lacks the expected fields or a
            value cannot be converted to float.

    Note:
        ``requests`` is blocking and nothing is awaited here, so the event
        loop is held for the whole request.
    """
    # Bound the wait so a stalled connection cannot hang the caller forever.
    rsp = requests.get(URL, timeout=10)
    rsp.raise_for_status()
    data = json.loads(rsp.text)
    df = pd.DataFrame(data['msgArray'])
    df['quote_id'] = df['ex'] + '_' + df['ch']
    df = df[['n', 'y', 'z', 'quote_id']].replace(r'-', np.nan, regex=True).dropna()
    for col in ['y', 'z']:
        df[col] = df[col].astype(float)
    df['diff'] = (df['z'] - df['y']) / df['y'] * 100
    df = df.round(2)
    return df

In [7]:
def get_top_market(exchange='tse', trend='up', limit=10):
    """Return the headline index plus the best/worst category indices.

    Args:
        exchange: 'tse' selects the TSE symbol list; any other value selects
            the OTC list.
        trend: 'up' ranks categories by 'diff' descending, anything else
            ascending.
        limit: Number of ranked category rows to keep.

    Returns:
        DataFrame whose first rows are those matching reTopMarket, followed
        by up to ``limit`` rows matching reCategory in ranked order.
    """
    tse_symbols = 'tse_t00.tw|tse_TW50.tw|tse_TW50C.tw|tse_TWMC.tw|tse_TWIT.tw|tse_TWEI.tw|tse_TWDP.tw|tse_EMP99.tw|tse_HC100.tw|tse_CG100.tw|tse_FRMSA.tw|tse_t001.tw|tse_t002.tw|tse_t003.tw|tse_SC300.tw|tse_t011.tw|tse_t031.tw|tse_t051.tw|tse_t01.tw|tse_t02.tw|tse_t03.tw|tse_t04.tw|tse_t05.tw|tse_t06.tw|tse_t07.tw|tse_t21.tw|tse_t22.tw|tse_t08.tw|tse_t09.tw|tse_t10.tw|tse_t11.tw|tse_t12.tw|tse_t13.tw|tse_t24.tw|tse_t25.tw|tse_t26.tw|tse_t27.tw|tse_t28.tw|tse_t29.tw|tse_t30.tw|tse_t31.tw|tse_t14.tw|tse_t15.tw|tse_t16.tw|tse_t17.tw|tse_t18.tw|tse_t23.tw|tse_t20.tw|tse_TTDRL2.tw|tse_TTDRIN.tw|tse_EDRL2.tw|tse_EDRIN.tw|tse_IX0103.tw|tse_IX0108.tw|tse_IX0109.tw|tse_IX0125.tw|tse_IX0133.tw|tse_IX0139.tw|tse_IR0129.tw|tse_IR0131.tw|tse_IR0135.tw|tse_IX0142.tw|tse_IX0143.tw|tse_IX0145.tw'
    otc_symbols = 'otc_o00.tw|otc_IX0118.tw|otc_IX0138.tw|otc_GTSM50.tw|otc_GTHD.tw|otc_EMP88.tw|otc_GTCI.tw|otc_TPCGI.tw|otc_IX0134.tw|otc_GAME.tw|otc_o13.tw|otc_o04.tw|otc_o05.tw|otc_o21.tw|otc_o22.tw|otc_o10.tw|otc_o24.tw|otc_o25.tw|otc_o26.tw|otc_o27.tw|otc_o28.tw|otc_o29.tw|otc_o30.tw|otc_o31.tw|otc_o32.tw|otc_o14.tw|otc_o15.tw|otc_o16.tw|otc_o20.tw|otc_TWTBI.tw|tse_IX0103.tw|tse_IX0108.tw|tse_IX0109.tw|tse_IX0133.tw|tse_IX0139.tw|tse_IR0131.tw|tse_IR0135.tw|tse_IX0142.tw|tse_IX0143.tw|tse_IX0145.tw|otc_IR0118.tw|otc_IR0138.tw|otc_IR0140.tw|otc_IR0141.tw'
    top_list = tse_symbols if exchange == 'tse' else otc_symbols

    URL = 'https://mis.twse.com.tw/stock/api/getStockInfo.jsp?ex_ch={}|&json=1&delay=0'.format(top_list)
    quotes = loop.run_until_complete(_quote_top_market(URL))

    # Split the rows into category indices and the headline index.
    is_category = quotes['quote_id'].str.contains(reCategory, regex=True)
    is_headline = quotes['quote_id'].str.contains(reTopMarket, regex=True)

    # 'up' -> largest gain first; otherwise largest loss first.
    ranked = quotes[is_category].sort_values(by=['diff'], ascending=(trend != 'up'))
    return pd.concat([quotes[is_headline], ranked[:limit]], axis=0)

In [8]:
def get_category_market(exchange='tse', catid='01',trend='up', limit=10):
    """Return the top movers inside one category.

    The category's symbol ids are fetched first, then quoted in batches of
    100 with a one-second pause before each batch request.

    Args:
        exchange: Exchange code placed in the category URL.
        catid: Category id placed in the category URL.
        trend: 'up' ranks by 'diff' descending, anything else ascending.
        limit: Number of rows to return.

    Returns:
        DataFrame of at most ``limit`` rows in ranked order.
    """
    catURL = 'https://mis.twse.com.tw/stock/api/getCategory.jsp?ex={}&i={}'.format(exchange, catid)
    symbols = loop.run_until_complete(_quote_category_ids(catURL))

    frames = []
    start = 0
    while start < len(symbols):
        time.sleep(1)
        batch = symbols[start:start + 100]
        itemURL = 'https://mis.twse.com.tw/stock/api/getStockInfo.jsp?ex_ch={}|&json=1'.format('|'.join(batch))
        frames.append(loop.run_until_complete(_quote_category_market(itemURL)))
        start += 100

    ranked = pd.concat(frames, axis=0).sort_values(by=['diff'], ascending=(trend != 'up'))
    return ranked[:limit]

In [9]:
# Smoke test: TSE headline index plus up to 10 category indices
# (trend defaults to 'up', i.e. sorted by 'diff' descending).
topdf = get_top_market(exchange='tse', limit=10)
topdf

Unnamed: 0,n,y,z,quote_id,diff
0,發行量加權股價指數,17814.33,17847.52,tse_t00.tw,0.19
33,半導體類指數,412.59,420.1,tse_t24.tw,1.82
32,電子類指數,834.84,845.83,tse_t13.tw,1.32
17,機電類指數,4657.05,4715.74,tse_t051.tw,1.26
36,通信網路類指數,131.83,133.15,tse_t27.tw,1.0
40,其他電子類指數,106.22,107.2,tse_t31.tw,0.92
34,電腦及週邊設備類指數,138.93,140.18,tse_t25.tw,0.9
31,汽車類指數,342.16,344.72,tse_t12.tw,0.75
27,玻璃陶瓷類指數,96.7,97.38,tse_t08.tw,0.7
19,食品類指數,1938.48,1951.42,tse_t02.tw,0.67


In [10]:
def split_quote_id(quite_id):
    """Extract the category id (group 2 of reCategory) from a quote id.

    Returns None when the quote id does not match reCategory.
    """
    found = re.search(reCategory, quite_id)
    return found.group(2) if found else None
    
@collect_to_deque
def mixin_all_market(exchange='tse', trend='up', limit=10, mixin=10):
    """Combine the category ranking with the top movers of each category.

    Args:
        exchange: Exchange passed on to the market/category look-ups.
        trend: 'up' or 'down'; passed on to both look-ups.
        limit: Number of category rows requested from get_top_market.
        mixin: Number of per-category movers, i.e. the number of ``Top_<i>``
            columns added.

    Returns:
        The get_top_market frame with columns ``Top_0`` .. ``Top_<mixin-1>``
        added. Each cell is '<name>:<diff>' or None. (The decorator turns
        the caller-visible result into ``(frame, iso_timestamp)``.)

    Notes:
        * Row 0 (the headline index) always gets None.
        * Only the categories in rows 1..9 are crawled, exactly as before;
          further rows are padded with None.
        * A category whose look-up raises is filled with None (best effort;
          some index ids have no matching category).
    """
    topdf = get_top_market(exchange, trend, limit)
    max_crawled = 9  # rows 1..9, same cap as the original iloc[1:10]
    catids = topdf.iloc[1:1 + max_crawled]['quote_id'].apply(split_quote_id).tolist()
    blank = [None] * mixin

    rows = [blank]  # row 0: headline index, no movers
    for catid in catids:
        try:
            catdf = get_category_market(exchange, catid, trend, mixin)
            labels = (catdf['n'] + ':' + catdf['diff'].astype(str)).tolist()
        except Exception:
            # Best effort: keep the row aligned even when the category
            # cannot be fetched.
            rows.append(blank)
            continue
        # Pad short categories so every row has exactly `mixin` entries.
        rows.append(labels + [None] * (mixin - len(labels)))

    # Align with the frame's real length. The old "limit - 10 + 1" padding
    # assumed len(topdf) == limit + 1 and raised a length-mismatch ValueError
    # whenever fewer index rows came back than requested.
    rows += [blank] * (len(topdf) - len(rows))
    del rows[len(topdf):]

    # Transpose: one column per rank position.
    for i, item in enumerate(zip(*rows)):
        topdf['Top_{}'.format(i)] = item
    return topdf

def wrap_exist_catid(topdf, exchange='tse'):
    """Flag which index rows have a non-empty category behind them.

    Adds an integer 'exist' column to ``topdf`` (in place) and returns it.
    Row 0 is never looked up and stays 0. Every later row is set to 1 when
    get_category_market returns a non-empty frame for its category id, and
    stays 0 when the frame is empty or the look-up raises.

    Args:
        topdf: Frame with a 'quote_id' column, as built by get_top_market.
        exchange: Exchange passed to get_category_market.

    Returns:
        The same ``topdf`` object with the 'exist' column added.
    """
    # Collect the flags in a plain list and assign the column once. The old
    # `topdf['exist'].iloc[i] = 1` was chained assignment, which writes to a
    # temporary (and is a silent no-op under pandas copy-on-write).
    flags = [0] * len(topdf)
    catids = topdf['quote_id'].iloc[1:].apply(split_quote_id).tolist()
    for position, catid in enumerate(catids, start=1):
        try:
            catdf = get_category_market(exchange=exchange, catid=catid)
        except Exception:
            # Best effort: a failed look-up counts as "category not found".
            continue
        if not catdf.empty:
            flags[position] = 1
    topdf['exist'] = flags
    return topdf

In [11]:
# TSE: request up to 50 category indices, probe each one for a matching
# category, then show the rows whose 'exist' flag stayed 0.
topdf = get_top_market(exchange='tse', limit=50)
topdf = wrap_exist_catid(topdf, exchange='tse')
topdf[topdf['exist']==0]

Unnamed: 0,n,y,z,quote_id,diff,exist
0,發行量加權股價指數,17814.33,17847.52,tse_t00.tw,0.19,0
32,電子類指數,834.84,845.83,tse_t13.tw,1.32,0
17,機電類指數,4657.05,4715.74,tse_t051.tw,1.26,0
11,未含金融保險股指數,15670.36,15698.33,tse_t001.tw,0.18,0
15,水泥窯製類指數,840.64,838.53,tse_t011.tw,-0.25,0
16,塑膠化工類指數,979.67,975.08,tse_t031.tw,-0.47,0
24,化學生技醫療類指數,117.23,115.75,tse_t07.tw,-1.26,0
12,未含電子股指數,20550.43,20228.08,tse_t002.tw,-1.57,0
13,未含金融電子股指數,18302.41,17897.17,tse_t003.tw,-2.21,0


In [12]:
# OTC: request up to 50 category indices, probe each one for a matching
# category, then show the rows whose 'exist' flag stayed 0.
topdf = get_top_market(exchange='otc', limit=50)
topdf = wrap_exist_catid(topdf, exchange='otc')
topdf[topdf['exist']==0]

Unnamed: 0,n,y,z,quote_id,diff,exist
0,櫃檯指數,219.32,218.09,otc_o00.tw,-0.56,0
10,電子工業類指數,362.04,363.29,otc_o13.tw,0.35,0


In [13]:
# The decorated function returns (DataFrame, ISO timestamp); it also buffers
# the result in g_collect_dq and writes a CSV as a side effect.
df, update = mixin_all_market(exchange='otc', trend='down')

In [14]:
# Manual run: TSE categories ranked by 'diff' ascending, with per-category movers.
mixin_all_market(exchange='tse', trend='down')

(            n         y         z     quote_id  diff      Top_0       Top_1  \
 0   發行量加權股價指數  17814.33  17847.52   tse_t00.tw  0.19       None        None   
 42      航運類指數    345.41    315.28   tse_t15.tw -8.72   榮運:-9.86   宅配通:-7.88   
 28      造紙類指數    489.50    472.27   tse_t09.tw -3.52    士紙:-6.4    榮成:-4.75   
 13  未含金融電子股指數  18302.41  17897.17  tse_t003.tw -2.21       None        None   
 23    電器電纜類指數     84.12     82.40   tse_t06.tw -2.04   華電:-4.57     華榮:-4.5   
 30      橡膠類指數    351.98    345.32   tse_t11.tw -1.89   台橡:-5.27    申豐:-4.93   
 43      觀光類指數    106.63    104.78   tse_t16.tw -1.73   華園:-6.38    夏都:-2.92   
 21    紡織纖維類指數    686.55    675.51   tse_t04.tw -1.61   集盛:-9.44    聚隆:-5.95   
 12    未含電子股指數  20550.43  20228.08  tse_t002.tw -1.57       None        None   
 26    生技醫療類指數     72.07     70.98   tse_t22.tw -1.51  亞諾法:-5.21  寶齡富錦:-5.21   
 24  化學生技醫療類指數    117.23    115.75   tse_t07.tw -1.26       None        None   
 
         Top_2     Top_3       Top_4  

In [15]:
# Manual run: TSE categories ranked by 'diff' descending, with per-category movers.
mixin_all_market(exchange='tse', trend='up')

(             n         y         z     quote_id  diff     Top_0      Top_1  \
 0    發行量加權股價指數  17814.33  17847.52   tse_t00.tw  0.19      None       None   
 33      半導體類指數    412.59    420.10   tse_t24.tw  1.82   誠創:9.73   華邦電:9.27   
 32       電子類指數    834.84    845.83   tse_t13.tw  1.32      None       None   
 17       機電類指數   4657.05   4715.74  tse_t051.tw  1.26      None       None   
 36     通信網路類指數    131.83    133.15   tse_t27.tw  1.00   盛達:9.77    建漢:8.42   
 40     其他電子類指數    106.22    107.20   tse_t31.tw  0.92   和椿:9.54    尖點:3.75   
 34  電腦及週邊設備類指數    138.93    140.18   tse_t25.tw  0.90   研華:5.72   大眾控:5.17   
 31       汽車類指數    342.16    344.72   tse_t12.tw  0.75  怡利電:4.28    為升:4.21   
 27     玻璃陶瓷類指數     96.70     97.38   tse_t08.tw  0.70   台玻:1.17  凱撒衛:-1.03   
 19       食品類指數   1938.48   1951.42   tse_t02.tw  0.67   統一:1.23    味王:0.66   
 45     貿易百貨類指數    360.58    361.88   tse_t18.tw  0.36      None       None   
 
        Top_2     Top_3      Top_4    Top_5      T

In [16]:
# Manual run: OTC categories ranked by 'diff' descending, with per-category movers.
mixin_all_market(exchange='otc', trend='up')

(             n       y       z    quote_id  diff     Top_0      Top_1  \
 0         櫃檯指數  219.32  218.09  otc_o00.tw -0.56      None       None   
 23     其他電子類指數  159.02  160.63  otc_o31.tw  1.01  光洋科:5.99    勤凱:3.53   
 16      半導體業指數  137.75  138.69  otc_o24.tw  0.68   點序:8.12     欣銓:8.1   
 13     化學工業類指數  125.36  126.16  otc_o21.tw  0.64  美琪瑪:6.87  誠泰科技:1.05   
 19     通信網路業指數   85.45   85.81  otc_o27.tw  0.42    天宇:6.0    璟德:5.96   
 10     電子工業類指數  362.04  363.29  otc_o13.tw  0.35      None       None   
 18       光電業指數   38.18   38.19  otc_o26.tw  0.03   東捷:7.33    均豪:5.52   
 17  電腦及週邊設備業指數   92.01   91.85  otc_o25.tw -0.17   欣技:5.15    濱川:3.96   
 25     建材營造類指數  162.70  162.41  otc_o14.tw -0.18    富宇:2.5    三豐:0.57   
 12     電機機械類指數  160.66  160.02  otc_o05.tw -0.40   皇田:4.24    易發:1.68   
 27     觀光事業類指數   68.35   67.88  otc_o16.tw -0.69      None       None   
 
        Top_2      Top_3     Top_4     Top_5      Top_6      Top_7     Top_8  \
 0       None       None      

In [17]:
def crawler_top():
    """Scheduled job: crawl every (exchange, trend) combination once.

    Sleeps 15 seconds, runs mixin_all_market for tse/otc x up/down in that
    order, then prints the UTC finish time and the elapsed seconds.
    """
    time.sleep(15)
    began = datetime.utcnow()
    combos = [(market, direction)
              for market in ('tse', 'otc')
              for direction in ('up', 'down')]
    for market, direction in combos:
        mixin_all_market(exchange=market, trend=direction)
    elapsed = (datetime.utcnow() - began).total_seconds()
    stamp = datetime.utcnow().isoformat()
    print ("crawler {} top all consume {}/s".format(stamp, elapsed))

In [18]:
def display_out_all_market(out):
    """Append the buffered tables of every (exchange, trend) pair to ``out``.

    For each pair the entry at index 1 of its g_collect_dq deque is rendered
    as an HTML table followed by a one-line status message. Pairs whose
    deque has no index-1 entry yet, whose frame is empty, or whose rendering
    raises are skipped silently (best effort).

    Args:
        out: Object providing append_display_data() and append_stdout().

    Returns:
        The same ``out`` object.
    """
    for exchange in ('tse', 'otc'):
        for trend in ('up', 'down'):
            try:
                frame, crawl_time = g_collect_dq[exchange][trend][1]
                if frame.empty:
                    continue
                out.append_display_data(HTML(frame.to_html()))
                message = "display exchange:{}.trend:{} update time {}".format(
                    exchange, trend, crawl_time)
                out.append_stdout(message)
            except Exception:
                # Nothing buffered at index 1 yet, or rendering failed: skip.
                continue
    return out

In [19]:
def display_top(out):
    """Scheduled job: clear ``out`` and re-render the buffered tables into it."""
    # NOTE(review): this runs on a scheduler thread; Output.clear_output() may
    # not behave reliably off the main thread -- confirm against ipywidgets docs.
    out.clear_output()
    display_out_all_market(out)


In [20]:
# Render the Output widget in this cell; the scheduled display job appends to it.
display(out)

Output()

In [21]:
# Scheduling the four crawls as separate jobs (kept below for reference) made
# concurrent requests hit the site too often, and they were sometimes blocked.
#sched.add_job(mixin_all_market, trigger, kwargs={'exchange':'tse', 'trend':'up'}, jitter=35)
#sched.add_job(mixin_all_market, trigger, kwargs={'exchange':'otc', 'trend':'up'}, jitter=35)
#sched.add_job(mixin_all_market, trigger, kwargs={'exchange':'tse', 'trend':'down'}, jitter=35)
#sched.add_job(mixin_all_market, trigger, kwargs={'exchange':'otc', 'trend':'down'}, jitter=35)  

# A single crawler job runs all four crawls sequentially instead.
# NOTE(review): crawler_top sleeps 15 s and every category batch adds at least
# 1 s, so one run may outlast the trigger interval -- confirm how overlapping
# runs should be handled.
sched.add_job(crawler_top, crawler_trigger, args=())  
# Display job: re-renders the buffered tables into the `out` widget.
sched.add_job(display_top, display_trigger, args=(out,))
sched.start()

In [22]:
#for i in range(1):
#    crawler_top()
#   # display_top(out)