In [1]:
import sys
import os
import math
import random
import heapq 
import time
import copy
import gc
import numpy as np
import pandas as pd
from functools import reduce
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import pdist
from sklearn.metrics import confusion_matrix,roc_curve,accuracy_score,auc 
from PIL import Image
import matplotlib.pyplot as plt
import cv2
import faiss 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import baostock as bs#pip install baostock
import mplfinance as mpf #pip install mplfinance
from matplotlib.pylab import date2num
import datetime
import smtplib
from email.mime.text import MIMEText
from email.header import Header
# Make CUDA device index 2 the current device for this process, then echo the
# active device index as a sanity check.
# NOTE(review): the index is hard-coded; presumably this fails on hosts with
# fewer than three GPUs — confirm before running elsewhere.
torch.cuda.set_device(2)
print (torch.cuda.current_device())

Loading faiss with AVX2 support.


2


1. Train the model and build the retrieval database (faiss)

In [None]:
#Generate Dataset
root_dir = '/data/fjsdata/qtsys/img/' #the path of images
data = pd.read_csv('/data/fjsdata/qtsys/label.csv') 
data = data.sample(frac=1).reset_index(drop=True) #shuffle
#Dataset: trN = image file names, trI = resized float32 images, trY = labels (0 = buy, 1 = sell)
trN,trI, trY =[], [],[]
for _, row in data.iterrows():
    image_path = None
    try:
        image_path = os.path.join(root_dir, row['name'])
        raw = cv2.imread(image_path)
        if raw is None:
            # cv2.imread signals a missing/unreadable file by returning None, not by raising
            raise IOError('cannot read image')
        img = cv2.resize(raw.astype(np.float32), (256, 256))#(1600,800,3)->(256,256,3)
        label = 0 if row['label']=='B' else 1 #'B' -> buy (0); anything else (i.e. 'S') -> sell (1)
    except Exception:
        # Report the offending sample and skip it. (The previous handler referenced an
        # undefined name, so any failure here ended in a NameError instead of this message.)
        print(str(row['name'])+":"+str(image_path))
    else:
        # Append to all three lists in one place so they always stay index-aligned.
        trN.append(row['name'])
        trI.append(img)
        trY.append(label)
    sys.stdout.write('\r{} / {} '.format(len(trY),data.shape[0]))
    sys.stdout.flush()
    
#Generate image pairs for model
def onlineGenImgPairs():
    """Randomly pair up the loaded training samples.

    Reads the module-level lists ``trI`` (images) and ``trY`` (labels). A random
    ordering of the sample indices is drawn (one index is left out when the
    sample count is odd); entries at even positions of that ordering become the
    first member of each pair, entries at odd positions the second member.

    Returns:
        tuple: ``(first_images, second_images, pair_labels)`` as numpy arrays,
        where ``pair_labels[k]`` is 1 when the two labels of pair ``k`` differ
        and 0 when they are equal.
    """
    total = len(trY)
    usable = total - (total % 2)  # largest even count not exceeding the sample count
    order = random.sample(range(0, usable), usable)
    first_idx, second_idx = order[0::2], order[1::2]

    first_imgs = [trI[k] for k in first_idx]
    second_imgs = [trI[k] for k in second_idx]
    first_lbls = np.array([trY[k] for k in first_idx])
    second_lbls = np.array([trY[k] for k in second_idx])

    # 1 -> the pair mixes two different labels, 0 -> both labels are the same
    pair_labels = np.where((first_lbls - second_lbls) != 0, 1, 0)
    return np.array(first_imgs), np.array(second_imgs), pair_labels

#define model: ASH
class SpatialAttention(nn.Module):#spatial attention layer
    """Spatial attention mask.

    The input is reduced along the channel axis (dim 1) by mean and by max; the
    two resulting planes are stacked, passed through a bias-free 3x3
    convolution (padding 1, so height/width are kept) and squashed by a sigmoid.
    The result has a single channel with values in (0, 1).
    """

    def __init__(self):
        super().__init__()
        # 2 input planes (channel-mean, channel-max) -> 1 attention plane
        self.conv1 = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=3, padding=1, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True)[0]  # [0] = values, indices are not needed
        pooled = torch.cat((channel_mean, channel_max), dim=1)
        return self.sigmoid(self.conv1(pooled))
    
class ResBlock(nn.Module):
    """Two-convolution residual block.

    Main path: 3x3 conv (with ``stride``) -> BatchNorm -> ReLU -> 3x3 conv -> BatchNorm.
    The shortcut is the input itself, unless the channel count changes or
    ``stride`` is not 1; in that case the shortcut is a 3x3 conv (same stride)
    followed by BatchNorm so that both paths have matching shapes. The sum of
    the two paths goes through a final ReLU.

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: number of channels produced by the block.
        stride: stride of the first convolution (and of the shortcut convolution).
    """

    def __init__(self, in_channels: int, out_channels: int, stride=1):
        super().__init__()
        main_path = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
        ]
        self.net = nn.Sequential(*main_path)

        # A projection shortcut is needed whenever the main path changes the tensor shape.
        self.do_downsample = (in_channels != out_channels) or (stride != 1)
        if self.do_downsample:
            self.downsample_layer = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.downsample_layer = None

        # initialize weights
        self.apply(self.init_weights)

    def forward(self, x):
        residual = self.net(x)
        shortcut = self.downsample_layer(x) if self.do_downsample else x
        return F.relu(residual + shortcut, inplace=True)

    @staticmethod
    def init_weights(m):
        """Xavier-normal initialisation for convolution and linear weights."""
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            nn.init.xavier_normal_(m.weight)
            
class ASHNet(nn.Module):
    """Hashing network: residual feature extractor, spatial attention, linear code layer.

    The feature extractor is three ``ResBlock``s with 16 output channels, the
    last one using stride 2. The attention mask from ``SpatialAttention`` is
    multiplied onto the feature map, which is then flattened and projected to
    ``code_size`` real-valued outputs (no activation is applied here).

    The linear layer is hard-wired to ``16*128*128`` input features, which
    matches the 256x256 images this notebook feeds in (halved once by the
    stride-2 block).

    Args:
        code_size: length of the output code vector.
    """

    def __init__(self, code_size: int):
        super().__init__()
        #Resnet
        self.net = nn.Sequential(
            ResBlock(3, 16),
            ResBlock(16, 16),
            ResBlock(16, 16, stride=2),
        )
        #Attention
        self.sa = SpatialAttention()
        #fully connected: a single projection straight to the code
        self.linear = nn.Sequential(nn.Linear(16 * 128 * 128, code_size))

        # initialize weights
        self.apply(self.init_weights)

    def forward(self, x):
        features = self.net(x)
        attended = self.sa(features) * features
        flat = attended.view(attended.size(0), -1)
        return self.linear(flat)

    @staticmethod
    def init_weights(m):
        """Xavier-normal initialisation for convolution and linear weights."""
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            nn.init.xavier_normal_(m.weight)

#define loss function: pairwise loss
class HashLossFunc(nn.Module):
    """Pairwise contrastive loss with a push-towards-±1 regulariser.

    For codes ``h1``, ``h2`` and pair labels ``y`` (0 = same class, 1 = different class):

    * distance  = per-pair mean of the element-wise squared difference;
    * same-class term      = mean(0.5 * (1 - y) * distance);
    * different-class term = mean(0.5 * y * max(0, margin * code_length - distance));
    * regulariser          = alpha * (mean| |h1| - 1 | + mean| |h2| - 1 |).

    The returned loss is the sum of the three terms.

    Args:
        margin: margin per code element; it is multiplied by the code length in ``forward``.
        alpha: weight of the regulariser.
    """

    def __init__(self, margin=0.5, alpha=0.01):
        super().__init__()
        self.alpha = alpha #regularization
        self.margin = margin #margin threshold
        self.mse_loss = nn.MSELoss(reduction='none')
        self.l1_loss = nn.L1Loss(reduction='mean')

    def forward(self,h1,h2,y):
        code_len = h1.shape[1]
        pair_dist = self.mse_loss(h1, h2).mean(dim=1)

        # T1: pull same-class pairs together
        same_term = (0.5 * (1 - y) * pair_dist).mean()

        # T2: push different-class pairs apart until they are at least `margin * code_len` away
        margin_vec = (self.margin * code_len) * torch.ones_like(pair_dist)
        hinge = torch.max(torch.zeros_like(pair_dist), margin_vec - pair_dist)
        diff_term = (0.5 * y * hinge).mean()

        # T3: keep the magnitude of every code element close to 1
        target1, target2 = torch.ones_like(h1), torch.ones_like(h2)
        reg_term = self.alpha * (
            self.l1_loss(h1.abs(), target1) + self.l1_loss(h2.abs(), target2))

        return same_term + diff_term + reg_term


#train model
hash_size=12
model = ASHNet(code_size=hash_size).cuda()
criterion  = HashLossFunc(margin=0.5).cuda() #define loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) #define optimizer
# best_net keeps a deep copy of the model from the epoch with the lowest mean training loss
best_net, best_loss = None, float('inf')
batchSize = 10
model.train()
for epoch in range(50):#iteration
    # draw a fresh random pairing of the training images for every epoch
    trI1_sf, trI2_sf, trY_sf = onlineGenImgPairs()
    losses = []
    # Ceil division. The previous `len // batchSize + 1` produced an EMPTY last batch whenever
    # the number of pairs was a multiple of batchSize; the mean over an empty batch is NaN and
    # one optimizer step on NaN gradients corrupts every weight.
    num_batches = (len(trY_sf) + batchSize - 1) // batchSize
    if num_batches == 0:
        raise RuntimeError('No training pairs available; check that the dataset was loaded.')
    for i in range(num_batches):
        optimizer.zero_grad()#reset the gradients accumulated by the previous step
        min_idx = i * batchSize
        max_idx = min(len(trY_sf), (i+1)*batchSize)
        I1_batch = torch.from_numpy(trI1_sf[min_idx:max_idx]).float().cuda()
        I2_batch = torch.from_numpy(trI2_sf[min_idx:max_idx]).float().cuda()
        Y_batch = torch.from_numpy(trY_sf[min_idx:max_idx]).float().cuda()
        #forward: images are stored channel-last (N,H,W,C); the network expects (N,C,H,W)
        X1_batch = model(I1_batch.permute(0, 3, 1, 2))
        X2_batch = model(I2_batch.permute(0, 3, 1, 2))
        #binary-like loss
        loss = criterion(X1_batch,X2_batch,Y_batch)
        #backward
        loss.backward()
        #update parameters
        optimizer.step()
        losses.append(loss.item())
    epoch_loss = float(np.mean(losses))
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_net = copy.deepcopy(model)
print("best_loss = %.6f" % (best_loss))

#release gpu memory
model = model.cpu()
loss=loss.cpu()
torch.cuda.empty_cache()
#output the feature with best model
# Switch to inference mode: BatchNorm then uses its running statistics, so the code of an image
# no longer depends on which other images share its batch. The mode is a property of best_net
# itself, so any later use of best_net encodes images the same way.
best_net.eval()
batchSize = 10
# Ceil division: `len // batchSize + 1` yielded an empty last batch (and a crash in permute)
# whenever the number of images was a multiple of batchSize.
num_batches = (len(trI) + batchSize - 1) // batchSize
trF = []
with torch.no_grad(): # no autograd graph is needed for feature extraction
    for i in range(num_batches):
        min_idx = i * batchSize
        max_idx = min(len(trI), (i+1)*batchSize)
        I_batch = torch.from_numpy(np.array(trI[min_idx: max_idx])).float().cuda()
        X_batch = best_net(I_batch.permute(0, 3, 1, 2))#forward, (N,H,W,C) -> (N,C,H,W)
        X_batch = torch.tanh(X_batch) #[-1,1]
        I_batch = I_batch.cpu()
        X_batch = X_batch.cpu()
        torch.cuda.empty_cache()#release gpu memory
        trF.extend(X_batch.numpy().tolist())
        sys.stdout.write('\r {} / {} '.format(i+1, num_batches)) # 1-based so the last line reads N / N
        sys.stdout.flush()
    
# building the index used for retrieval
tstart = time.time()
cpu_index = faiss.IndexFlatL2(hash_size) # L2 index over hash_size-dimensional codes
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index) #make all gpu usable
# faiss needs a contiguous float32 matrix, one row per database image
db_codes = np.ascontiguousarray(trF, dtype=np.float32)
gpu_index.add(db_codes)
elapsed = time.time() - tstart
print('Completed buliding index in %d seconds' % int(elapsed))

407 / 407 

2. Generate the K-line (candlestick) charts and retrieve the nearest match from the database.

In [12]:
#Calculate MACD
def cal_macd_system(data,short_,long_,m):
    '''
    Add the MACD indicator columns to ``data`` (in place) and return it.

    data: DataFrame with at least a 'Close' column
          (the notebook passes ['Open','High','Low','Close','Volume'])
    short_, long_: periods of the fast and slow exponential moving averages of 'Close'
    m: period of the exponential moving average applied to 'diff'
    return: the same DataFrame with three extra columns
            'diff' = EMA(Close, short_) - EMA(Close, long_)
            'dea'  = EMA(diff, m)
            'macd' = 2 * (diff - dea)
    Every EMA uses smoothing factor alpha = 2 / (period + 1) with adjust=False.
    '''
    def _ema(series, period):
        return series.ewm(adjust=False, alpha=2/(period+1), ignore_na=True).mean()

    close = data['Close']
    data['diff'] = _ema(close, short_) - _ema(close, long_)
    data['dea'] = _ema(data['diff'], m)
    data['macd'] = 2*(data['diff'] - data['dea'])
    return data
def macd_zero(macd):
    '''
    Split a MACD series into its positive and non-positive parts for plotting.

    macd: pandas Series (or any iterable of numbers) of MACD values
    return: (pos_signal, neg_signal), two lists with the same length as ``macd``.
            pos_signal[i] is the value when it is > 0, otherwise NaN;
            neg_signal[i] is the value when it is not > 0 (zero, negative or NaN),
            otherwise NaN.
    '''
    pos_signal, neg_signal = [],[]
    # Iterate over the values directly: Series.iteritems() was removed in pandas 2.0,
    # and the index labels were never used anyway.
    for value in macd:
        if value > 0:
            pos_signal.append(value)
            neg_signal.append(np.nan)
        else:
            neg_signal.append(value)
            pos_signal.append(np.nan)
    return pos_signal,neg_signal

#http://baostock.com/baostock/index.php/Python_API
#generate market chart
# Maximum L2 score (as returned by the faiss search) for a nearest neighbour to count as a match.
SELL_MAX_DIST = 0.002 # applies to neighbours labelled sell (1)
BUY_MAX_DIST = 0.001  # stricter limit for neighbours labelled buy (0)

lg = bs.login() #login
try:
    #read stocks information
    rs_hs300 = bs.query_hs300_stocks() 
    rs_zh500 = bs.query_zz500_stocks()
    rs_sh50 = bs.query_sz50_stocks()
    hs_stocks = []
    for rs_index in (rs_hs300, rs_zh500, rs_sh50):
        # `and` short-circuits, so next() is not called on a result set that reported an error
        while rs_index.error_code == '0' and rs_index.next():
            hs_stocks.append(rs_index.get_row_data()[1]) # field 1 of each row is used as the stock code
    hs_stocks = list(set(hs_stocks)) #get rid of repeated stock
    #read k data
    fields= "Date,Code,Open,High,Low,Close,Volume"
    for code in hs_stocks:
        #read transaction data
        rs = bs.query_history_k_data(code=code, fields=fields, \
                                     start_date='2020-02-23', end_date='2020-03-21', \
                                     frequency="30",adjustflag="3") #20 days, one k line per 30 minutes
        data_list = []
        while rs.error_code == '0' and rs.next():
            data_list.append(rs.get_row_data())
        result = pd.DataFrame(data_list, columns=rs.fields)
        result=result.apply(pd.to_numeric, errors='ignore')
        if result.shape[0] != 160:
            continue # only stocks with exactly 160 bars in the window are charted
        #plot K line
        # .copy(): the MACD columns are added to this frame, so it must not be a view of `result`
        result = result[['Open','High','Low','Close','Volume']].copy()
        # NOTE(review): the index here is the default integer row index, so this produces synthetic
        # timestamps rather than the real bar dates; presumably kept so the charts look like the
        # ones the database was built from — confirm.
        result.index=pd.to_datetime(result.index)#turn index to datetime
        result = cal_macd_system(result,12,26,9)
        pos_macd, neg_macd  = macd_zero(result['macd']) 
        apds = [ mpf.make_addplot(result['diff'],panel='lower',color='b'),
                 mpf.make_addplot(result['dea'],panel='lower',color='y'),
                 mpf.make_addplot(pos_macd,panel='lower',color='r',scatter=True),
                 mpf.make_addplot(neg_macd,panel='lower',color='g',scatter=True)
               ]
        kwargs = dict(type='candle',figratio =(16,8),volume=False,figscale=1)#line, mav=(5,10)
        Kline_path ='/data/fjsdata/qtsys/real0320/'+code+'.png'
        save = dict(fname=Kline_path,dpi=100, pad_inches=0.2)
        mpf.plot(result,**kwargs,addplot=apds,style='sas',savefig=save)#charles
        plt.close()
        Kline_img = cv2.resize(cv2.imread(Kline_path).astype(np.float32), (256, 256)) #read image 
        #output feature with model
        # best_net is used in whatever train/eval mode it is already in, so that queries are
        # encoded the same way as the database features; no_grad only avoids building a graph.
        with torch.no_grad():
            teI = torch.from_numpy(Kline_img[np.newaxis]).float().cuda() # batch of one image
            teI = best_net(teI.permute(0, 3, 1, 2))#forward, (N,H,W,C) -> (N,C,H,W)
            teI = torch.tanh(teI) #[-1,1]
        teI = teI.cpu().numpy()
        #retrieve from DB
        scores, neighbors = gpu_index.search(np.ascontiguousarray(teI, dtype=np.float32), k=1) #return top1
        best_score = scores.flatten()[0]
        matched = False
        if best_score < SELL_MAX_DIST:
            best_idx = neighbors.flatten()[0]
            label = trY[best_idx] 
            name = trN[best_idx]
            if label == 1:
                print('%s-S-%s'%(code,name))
                matched = True
            elif label == 0 and best_score < BUY_MAX_DIST:
                print('%s-B-%s'%(code,name))
                matched = True
        if not matched:
            os.remove(Kline_path) #remove the image file if no signal was reported
finally:
    # always release the baostock session, even if a query or a plot fails half-way
    bs.logout()#logout

login success!
sh.600637-B-sh.600028-1.png
sz.000766-B-sh.600038-6.png
sz.000997-B-sh.600196-3.png
sh.603888-B-sh.600025-2.png
sz.002368-B-sz.002373-0319.png
sh.603377-B-sz.002739-1.png
sz.000425-B-sh.603986-3.png
sh.600703-B-sz.002736-0319.png
sz.300033-B-sh.601998-4.png
logout success!


<baostock.data.resultset.ResultData at 0x7f659079e290>

In [41]:
# NOTE(review): this import sits outside the notebook's first import cell; consider moving it
# there so that all dependencies are visible in one place.
import tushare as ts

#ts.get_today_all()
# Fetch the current real-time quote for stock code 600036 and print the returned table.
df = ts.get_realtime_quotes('600036')
print(df)

   name    open pre_close   price    high     low     bid     ask    volume  \
0  招商银行  29.800    29.310  30.280  30.890  29.800  30.270  30.280  81172164   

           amount  ...    a2_p a3_v    a3_p a4_v    a4_p a5_v    a5_p  \
0  2469136983.000  ...  30.290  850  30.300   42  30.310   40  30.320   

         date      time    code  
0  2020-03-20  11:20:09  600036  

[1 rows x 33 columns]
