<a href="https://colab.research.google.com/github/cu7th0n/ssq/blob/master/ssq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import requests
from bs4 import BeautifulSoup
import xlwt
import time

#获取第一页的内容
def get_one_page(url):
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
    }
    response = requests.get(url,headers=headers)
    if response.status_code == 200:
        return response.text
    return 

#解析第一页内容，数据结构化
def parse_one_page(html):

    soup = BeautifulSoup(html,'lxml')
    i = 0
    for item in soup.select('tr')[2:-1]:

        yield{
            'time':item.select('td')[i].text,
            'digit_1':item.select('td em')[0].text,
            'digit_2':item.select('td em')[1].text,
            'digit_3':item.select('td em')[2].text,
            'digit_4':item.select('td em')[3].text,
            'digit_5':item.select('td em')[4].text,
            'digit_6':item.select('td em')[5].text,
            'digit_7':item.select('td em')[6].text,
        }

#将数据写入Excel表格中
def write_to_excel():
    f = xlwt.Workbook()                             
    sheet1 = f.add_sheet('ssq',cell_overwrite_ok=True)
    row0 = ["date","digit_1","digit_2","digit_3","digit_4","digit_5","digit_6","digit_7"]
    #写入第一行
    for j in range(0,len(row0)):
        sheet1.write(0,j,row0[j])

    #依次爬取每一页内容的每一期信息，并将其依次写入Excel
    i=0
    for k in range(1,120):
        url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_%s.html' %(str(k))
        html = get_one_page(url)
        
        #写入每一期的信息
        for item in parse_one_page(html):
            sheet1.write(i+1,0,item['time'])
            sheet1.write(i+1,1,item['digit_1'])
            sheet1.write(i+1,2,item['digit_2'])
            sheet1.write(i+1,3,item['digit_3'])
            sheet1.write(i+1,4,item['digit_4'])
            sheet1.write(i+1,5,item['digit_5'])
            sheet1.write(i+1,6,item['digit_6'])
            sheet1.write(i+1,7,item['digit_7'])
            i+=1
    
    f.save('ssq.xls')
    print('%d页已保存。'%k)
    
def main():
    write_to_excel()

if __name__ == '__main__':
    main()

119页已保存。


In [0]:
import pandas as pd
data = pd.read_excel('ssq.xls')

data['date'] = pd.to_datetime(data['date'])
data = data.sort_values(by = 'date')
data.reset_index(inplace=True)
del data['index']
del data['date']

In [0]:
data.head()

Unnamed: 0,digit_1,digit_2,digit_3,digit_4,digit_5,digit_6,digit_7
0,10,11,12,13,26,28,11
1,4,9,19,20,21,26,12
2,1,7,10,23,28,32,16
3,4,6,7,10,13,25,3
4,4,6,15,17,30,31,16


In [0]:
D_1 = data['digit_1']
D_2 = data['digit_2']
D_3 = data['digit_3']
D_4 = data['digit_4']
D_5 = data['digit_5']
D_6 = data['digit_6']
D_7 = data['digit_7']

In [0]:
models = ['M1_model.h5', 'M2_model.h5', 'M3_model.h5',
          'M4_model.h5', 'M5_model.h5','M6_model.h5','M7_model.h5']
tdatas = [D_1, D_2, D_3, D_4, D_5, D_6, D_7]

In [0]:
import numpy as np

def create_interval_dataset(dataset, look_back):
    """
    :param dataset: input array of time intervals
    :param look_back: each training set feature length
    :return: convert an array of values into a dataset matrix.
    """
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        dataX.append(dataset[i:i+look_back])
        dataY.append(dataset[i+look_back])
    return np.asarray(dataX), np.asarray(dataY)


In [0]:
def train_model(train_set,mname,look_back = 200,data_dim = 34,batch_size = 1):
  from keras.utils import to_categorical
  from keras.models import Sequential
  from keras.layers import LSTM, Dense, Dropout
  import numpy as np

  timesteps = look_back
 
  # Expected input batch shape: (batch_size, timesteps, data_dim)
  # Note that we have to provide the full batch_input_shape since the network is stateful.
  # the sample of index i in batch k is the follow-up for the sample i in batch k-1.
  model = Sequential()
  model.add(LSTM(data_dim, return_sequences=True, stateful=True,
               batch_input_shape=(batch_size, timesteps, data_dim)))
  model.add(LSTM(data_dim*2, return_sequences=True, stateful=True))
  model.add(Dropout(0.3))
  
  model.add(LSTM(data_dim*4, return_sequences=True, stateful=True))
  model.add(Dropout(0.3))
  
  model.add(LSTM(data_dim*8, return_sequences=True, stateful=True))
  model.add(Dropout(0.3))

  model.add(LSTM(data_dim*8, stateful=True))
  model.add(Dense(data_dim, activation='softmax'))

  model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

  df = to_categorical(train_set,data_dim)
  
  dataX, dataY = create_interval_dataset(df, look_back) 
  
  total = len(train_set)
  split = total*8//10

  X_train = dataX[:split]
  y_train = dataY[:split]

  X_val = dataX[split+1:total-1]
  y_val = dataY[split+1:total-1]
  
  model.fit(X_train, y_train,batch_size=batch_size, epochs=1,
            shuffle=False,validation_data=(X_val, y_val))
  model.save(mname)


In [10]:
models = ['M1_model.h5', 'M2_model.h5', 'M3_model.h5',
          'M4_model.h5', 'M5_model.h5','M6_model.h5','M7_model.h5']
tdatas = [D_1, D_2, D_3, D_4, D_5, D_6, D_7]

look_back=150
for (model,tdata) in zip(models,tdatas):
  train_model(tdata,model,look_back)

Train on 1904 samples, validate on 325 samples
Epoch 1/1
Train on 1904 samples, validate on 325 samples
Epoch 1/1
Train on 1904 samples, validate on 325 samples
Epoch 1/1
Train on 1904 samples, validate on 325 samples
Epoch 1/1
Train on 1904 samples, validate on 325 samples
Epoch 1/1
Train on 1904 samples, validate on 325 samples
Epoch 1/1
Train on 1904 samples, validate on 325 samples
Epoch 1/1


In [12]:
from keras.models import load_model
from keras.utils import to_categorical
models = ['M1_model.h5', 'M2_model.h5', 'M3_model.h5', 'M4_model.h5', 'M5_model.h5','M6_model.h5','M7_model.h5']
tdatas = [D_1,D_2,D_3,D_4,D_5,D_6,D_7]

for (model,tdata) in zip(models,tdatas):
    
    print(model)
    M_ssq = load_model(model)

    test_X = to_categorical(np.asarray([tdata[-look_back:]]),num_classes=34)

    pred = M_ssq.predict(test_X)
    ranks = np.argsort(pred[0])
    
    for i in range(1,33):
        print(str(ranks[-i]) + ' : %.2f%%' %(pred[0][ranks[-i]]*100))
    print('\n')

M1_model.h5
1 : 20.77%
2 : 17.15%
6 : 10.39%
3 : 10.21%
4 : 10.02%
9 : 8.66%
5 : 7.73%
7 : 6.79%
8 : 3.58%
12 : 1.00%
13 : 0.93%
10 : 0.92%
11 : 0.84%
14 : 0.33%
16 : 0.31%
15 : 0.22%
20 : 0.02%
17 : 0.01%
22 : 0.01%
19 : 0.01%
24 : 0.01%
26 : 0.01%
31 : 0.01%
0 : 0.01%
32 : 0.01%
25 : 0.01%
18 : 0.01%
33 : 0.01%
29 : 0.01%
27 : 0.01%
28 : 0.01%
30 : 0.00%


M2_model.h5
8 : 8.60%
6 : 7.94%
10 : 7.85%
3 : 7.71%
9 : 7.33%
11 : 7.00%
12 : 6.95%
13 : 6.15%
4 : 5.82%
7 : 5.78%
5 : 5.42%
16 : 4.90%
14 : 4.81%
17 : 3.33%
2 : 3.10%
15 : 2.69%
18 : 1.60%
21 : 0.88%
20 : 0.69%
22 : 0.37%
19 : 0.34%
23 : 0.12%
26 : 0.07%
25 : 0.07%
24 : 0.06%
28 : 0.06%
27 : 0.06%
29 : 0.05%
0 : 0.05%
32 : 0.05%
33 : 0.05%
1 : 0.04%


M3_model.h5
14 : 7.36%
10 : 7.31%
15 : 6.76%
13 : 6.62%
12 : 6.10%
11 : 6.03%
19 : 5.82%
16 : 5.57%
17 : 5.57%
18 : 5.52%
8 : 5.50%
21 : 4.53%
20 : 4.32%
7 : 4.05%
9 : 3.82%
22 : 3.79%
23 : 2.17%
24 : 2.02%
5 : 1.69%
6 : 1.36%
25 : 1.18%
4 : 1.11%
26 : 0.42%
28 : 0.36%
3 : 0.21%
27 