## 初始化配置信息

In [1]:
class Config(object):
    """Central place for the paths and switches shared by every stage of the notebook."""

    def __init__(self):
        # Every setting is stored as a plain instance attribute.
        defaults = {
            'dataPath': '.',                         # directory holding train.csv / test.csv
            'trainRatio': '0.8',                     # kept as a string; consumers convert with float()
            'modelPath': './model/wz.pdparams',      # final trained weights
            'logPath': './logs',                     # VisualDL log directory
            'pointsPath': 'checkpoint',              # per-epoch checkpoint directory
            'inferencePath': './inference/wz',       # path prefix of the exported inference model
            'use_gpu': 0,                            # falsy -> CPU, truthy -> 'gpu:0'
        }
        for attr_name, attr_value in defaults.items():
            setattr(self, attr_name, attr_value)

## 预测模型

In [2]:
import paddle
from paddle.nn import Linear
import paddle.nn.functional as F
from paddle.nn import Conv2D, MaxPool2D, Linear
import numpy as np

class WZPredict(paddle.nn.Layer):
    """Six-layer fully connected classifier mapping 29 features to 2 logits.

    Layer widths are 29 -> 60 -> 40 -> 20 -> 10 -> 6 -> 2. A ReLU follows every
    linear layer except the last one.

    NOTE: a class with the same name and the same body is defined again in a
    later cell of this notebook; when the notebook is run top to bottom the
    later definition is the one left bound to ``WZPredict``.
    """

    def __init__(self):
        super(WZPredict, self).__init__()

        # The attribute names fc1..fc6 become the parameter keys in the state
        # dict, so they must stay stable for saved weights to load.
        self.fc1 = Linear(in_features=29, out_features=60)
        self.fc2 = Linear(in_features=60, out_features=40)
        self.fc3 = Linear(in_features=40, out_features=20)
        self.fc4 = Linear(in_features=20, out_features=10)
        self.fc5 = Linear(in_features=10, out_features=6)
        self.fc6 = Linear(in_features=6, out_features=2)
        self.relu = paddle.nn.ReLU()

    @paddle.jit.to_static  # run this dynamic-graph forward pass in static-graph mode
    def forward(self, inputs):
        """Return the 2-class logits for ``inputs`` (last dimension must be 29)."""
        hidden = self.relu(self.fc1(inputs))
        hidden = self.relu(self.fc2(hidden))
        hidden = self.relu(self.fc3(hidden))
        hidden = self.relu(self.fc4(hidden))
        hidden = self.relu(self.fc5(hidden))
        # No activation on the output layer: raw logits are returned.
        return self.fc6(hidden)

## 数据加载

In [3]:
import paddle
import random
import numpy as np
import pandas as pd

class DataAsync(paddle.io.Dataset):
    """Map-style dataset over ``train.csv`` with a sequential train/validation split.

    The ``win`` column is the label; the ``id``, ``timecc`` and ``win`` columns
    are dropped from the features. Per-column mean and standard deviation are
    computed over ALL rows of the file (before the split) and applied to each
    sample when it is fetched.
    """

    def __init__(self, mode='train'):
        """Load the CSV, compute normalisation statistics and keep one split.

        Args:
            mode: ``'train'`` keeps the first ``trainRatio`` fraction of the
                rows; any other value keeps the remaining rows.
        """
        super(DataAsync, self).__init__()

        cf = Config()
        self.cf = cf
        self.dataPath = cf.dataPath
        self.trainRatio = float(cf.trainRatio)

        train_df = pd.read_csv(self.dataPath+'/train.csv')
        labels = np.array(train_df['win']).astype('int64')
        feature_df = train_df.drop(['id', 'timecc','win'], axis=1)
        # Convert straight to a float array instead of round-tripping through
        # nested Python lists.
        datas = np.asarray(feature_df.values, dtype='float64')

        self.mean = np.mean(datas, axis=0)
        column_std = np.std(datas, axis=0)
        # A constant column has std 0, which would turn (x - mean) / std into
        # NaN; dividing by 1 instead leaves that feature at 0.
        self.std = np.where(column_std == 0, 1.0, column_std)

        # Sequential split: the first `offset` rows are the training split.
        offset = int(datas.shape[0] * self.trainRatio)
        print(datas.shape[0], self.trainRatio, offset)
        if mode == 'train':
            self.labels = labels[:offset]
            self.datas = datas[:offset]
        else:
            self.labels = labels[offset:]
            self.datas = datas[offset:]

        print(self.labels.shape)
        print(self.datas.shape)

    def __getitem__(self, idx):
        """Return the normalised sample at ``idx``.

        Returns:
            A ``(data, label)`` pair: ``data`` is a float32 array of shape
            ``[29]`` and ``label`` is an int64 array of shape ``[1]``.
        """
        data = (self.datas[idx] - self.mean)/self.std
        label = self.labels[idx]

        data = np.reshape(data, [29]).astype('float32')
        label = np.reshape(label, [1]).astype('int64')
        return data, label

    def __len__(self):
        """Number of samples in the selected split."""
        return len(self.datas)

## 网络模型

In [4]:
import paddle
from paddle.nn import Linear
import paddle.nn.functional as F
from paddle.nn import Conv2D, MaxPool2D, Linear
import numpy as np

class WZPredict(paddle.nn.Layer):
    """Six-layer fully connected classifier mapping 29 features to 2 logits.

    Layer widths are 29 -> 60 -> 40 -> 20 -> 10 -> 6 -> 2. A ReLU follows every
    linear layer except the last one.

    NOTE: this redefines the ``WZPredict`` class declared in an earlier cell of
    this notebook with the same architecture.
    """

    def __init__(self):
        super(WZPredict, self).__init__()

        # The attribute names fc1..fc6 become the parameter keys in the state
        # dict, so they must stay stable for saved weights to load.
        self.fc1 = Linear(in_features=29, out_features=60)
        self.fc2 = Linear(in_features=60, out_features=40)
        self.fc3 = Linear(in_features=40, out_features=20)
        self.fc4 = Linear(in_features=20, out_features=10)
        self.fc5 = Linear(in_features=10, out_features=6)
        self.fc6 = Linear(in_features=6, out_features=2)
        self.relu = paddle.nn.ReLU()

    @paddle.jit.to_static  # run this dynamic-graph forward pass in static-graph mode
    def forward(self, inputs):
        """Return the 2-class logits for ``inputs`` (last dimension must be 29)."""
        hidden = self.relu(self.fc1(inputs))
        hidden = self.relu(self.fc2(hidden))
        hidden = self.relu(self.fc3(hidden))
        hidden = self.relu(self.fc4(hidden))
        hidden = self.relu(self.fc5(hidden))
        # No activation on the output layer: raw logits are returned.
        return self.fc6(hidden)

## 训练

In [5]:
from operator import mod
import paddle
from paddle.nn import Linear
import paddle.nn.functional as F
from visualdl import LogWriter
import numpy as np
class Train(object):
    """Trains ``WZPredict`` on the training split, checkpointing after every epoch."""

    batch_size = 1400
    EPOCH_NUM = 50
    data = None

    def __init__(self):
        cf = Config()
        self.log_writer = LogWriter(cf.logPath)
        self.modelPath = cf.modelPath
        self.logPath = cf.logPath
        self.pointsPath = cf.pointsPath
        self.use_gpu = cf.use_gpu

    def run(self, startStp = 0):
        """Run the training loop and save the final weights to ``modelPath``.

        Args:
            startStp: index of the first epoch to run. When greater than 0 the
                weights and optimizer state saved for epoch ``startStp - 1``
                are loaded before training continues.
        """
        # Select the device BEFORE the model is built so its parameters are
        # created on the requested device.
        paddle.set_device('gpu:0' if self.use_gpu else 'cpu')
        model = WZPredict()

        train_dataset = DataAsync(mode='train')
        train_loader = paddle.io.DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=0, drop_last=True, use_shared_memory=False)

        # Adam with L2 weight decay as regularisation against over-fitting;
        # `coeff` sets the weight of the regularisation term.
        opt = paddle.optimizer.Adam(learning_rate=0.0008, weight_decay=paddle.regularizer.L2Decay(coeff=1e-5), parameters=model.parameters())
        if startStp > 0:
            # Resume from the checkpoint written at the end of the previous epoch.
            checkpoint_prefix = './{}/wz_epoch{}'.format(self.pointsPath, startStp-1)
            params_dict = paddle.load(checkpoint_prefix+'.pdparams')
            opt_dict = paddle.load(checkpoint_prefix+'.pdopt')

            model.set_state_dict(params_dict)
            opt.set_state_dict(opt_dict)

        # Used to derive a true global step for logging. The previous counter
        # advanced by 100 per log call regardless of how many batches had run,
        # and restarted at 0 when resuming.
        steps_per_epoch = len(train_loader)
        for epoch_id in range(startStp, self.EPOCH_NUM):
            for batch_id, data in enumerate(train_loader()):
                datas, labels = data
                datas = paddle.to_tensor(datas)
                labels = paddle.to_tensor(labels)

                # Forward pass
                predits = model(datas)
                acc = paddle.metric.accuracy(input=predits, label=labels)
                loss = F.cross_entropy(predits, labels)
                avg_loss = paddle.mean(loss)

                # Report the current loss / accuracy every 100 batches
                if batch_id % 100 == 0:
                    global_step = epoch_id * steps_per_epoch + batch_id
                    print("epoch: {}, batch: {}, loss is: {}, acc is {}".format(epoch_id, batch_id, avg_loss.numpy(), acc.numpy()))
                    self.log_writer.add_scalar(tag = 'acc', step = global_step, value = acc.numpy())
                    self.log_writer.add_scalar(tag = 'loss', step = global_step, value = avg_loss.numpy())

                # Backward pass and parameter update
                avg_loss.backward()
                opt.step()
                opt.clear_grad()

            # Checkpoint optimizer state and weights once per epoch
            paddle.save(opt.state_dict(), './{}/wz_epoch{}'.format(self.pointsPath,epoch_id)+'.pdopt')
            paddle.save(model.state_dict(), './{}/wz_epoch{}'.format(self.pointsPath,epoch_id)+'.pdparams')

        # Save the final model weights
        paddle.save(model.state_dict(), self.modelPath)
        
# Build the trainer and run training from epoch 0 (the default startStp).
train = Train()
train.run()


180000 0.8 144000
(144000,)
(144000, 29)


  return (isinstance(seq, collections.Sequence) and


epoch: 0, batch: 0, loss is: [0.69031024], acc is [0.5164286]
epoch: 0, batch: 100, loss is: [0.49212542], acc is [0.825]
epoch: 1, batch: 0, loss is: [0.48253977], acc is [0.8207143]
epoch: 1, batch: 100, loss is: [0.3623078], acc is [0.82857144]
epoch: 2, batch: 0, loss is: [0.36389598], acc is [0.83]
epoch: 2, batch: 100, loss is: [0.36219525], acc is [0.8192857]
epoch: 3, batch: 0, loss is: [0.32167023], acc is [0.85714287]
epoch: 3, batch: 100, loss is: [0.3407231], acc is [0.8521429]
epoch: 4, batch: 0, loss is: [0.34469593], acc is [0.84]
epoch: 4, batch: 100, loss is: [0.32857063], acc is [0.8442857]
epoch: 5, batch: 0, loss is: [0.35593274], acc is [0.8257143]
epoch: 5, batch: 100, loss is: [0.33116665], acc is [0.85642856]
epoch: 6, batch: 0, loss is: [0.3564794], acc is [0.83928573]
epoch: 6, batch: 100, loss is: [0.34613478], acc is [0.83214283]
epoch: 7, batch: 0, loss is: [0.35950345], acc is [0.82357144]
epoch: 7, batch: 100, loss is: [0.35375994], acc is [0.83357143]
ep

## 验证

In [6]:
import paddle
import paddle.nn.functional as F
from visualdl import LogWriter
class Eval(object):
    """Evaluates the saved ``WZPredict`` weights on the validation split."""

    batch_size = 10
    EPOCH_NUM = 10
    data = None

    def __init__(self):
        cf = Config()
        self.log_writer = LogWriter(cf.logPath)
        self.modelPath = cf.modelPath
        self.logPath = cf.logPath
        self.pointsPath = cf.pointsPath
        self.use_gpu = cf.use_gpu

    def run(self):
        """Print loss/accuracy for every 100th batch, then the overall averages."""
        # Select the device BEFORE the model is built so its parameters are
        # created on the requested device.
        paddle.set_device('gpu:0' if self.use_gpu else 'cpu')
        model = WZPredict()

        # Any mode other than 'train' selects the rows after the training split.
        valid_dataset = DataAsync(mode='valid')
        valid_loader = paddle.io.DataLoader(valid_dataset, batch_size=self.batch_size, shuffle=False, num_workers=0, drop_last=False, use_shared_memory=False)

        # Load the trained weights and switch to evaluation mode
        param_dict = paddle.load(self.modelPath)
        model.load_dict(param_dict)
        model.eval()

        # Sample-weighted running totals, so a smaller final batch does not
        # skew the overall averages.
        total_loss = 0.0
        total_correct = 0.0
        total_samples = 0
        for vaild_id, vaild_data in enumerate(valid_loader()):
            datas_v, labels_v = vaild_data
            datas_v = paddle.to_tensor(datas_v)
            labels_v = paddle.to_tensor(labels_v)
            predits_v = model(datas_v)
            acc_v = paddle.metric.accuracy(input=predits_v, label=labels_v)
            loss_v = F.cross_entropy(predits_v, labels_v)
            avg_loss_v = paddle.mean(loss_v)

            batch_samples = labels_v.shape[0]
            total_loss += float(avg_loss_v) * batch_samples
            total_correct += float(acc_v) * batch_samples
            total_samples += batch_samples

            if vaild_id % 100 == 0:
                print("batch: {}, loss is: {}, acc is {}".format(vaild_id, avg_loss_v.numpy(), acc_v.numpy()))

        # The sampled per-batch lines above are noisy with 10-sample batches;
        # also report the metric over the whole validation split.
        if total_samples > 0:
            print("overall: samples: {}, loss is: {}, acc is {}".format(total_samples, total_loss / total_samples, total_correct / total_samples))
        
# Build the evaluator and run it. The instance is deliberately not named
# `eval`, which would shadow Python's builtin eval() for the rest of the
# notebook session.
evaluator = Eval()
evaluator.run()



180000 0.8 144000
(36000,)
(36000, 29)
batch: 0, loss is: [0.54243577], acc is [0.7]
batch: 100, loss is: [0.55123055], acc is [0.7]
batch: 200, loss is: [0.44265738], acc is [0.8]
batch: 300, loss is: [0.38558355], acc is [0.8]
batch: 400, loss is: [0.39668372], acc is [0.8]
batch: 500, loss is: [0.5111179], acc is [0.8]
batch: 600, loss is: [0.46419412], acc is [0.8]
batch: 700, loss is: [0.8092197], acc is [0.6]
batch: 800, loss is: [0.2705981], acc is [0.9]
batch: 900, loss is: [0.18662393], acc is [0.8]
batch: 1000, loss is: [0.5244156], acc is [0.7]
batch: 1100, loss is: [0.5667521], acc is [0.8]
batch: 1200, loss is: [0.36851034], acc is [0.7]
batch: 1300, loss is: [0.1477359], acc is [0.9]
batch: 1400, loss is: [0.81704557], acc is [0.4]
batch: 1500, loss is: [0.22093725], acc is [0.9]
batch: 1600, loss is: [0.26100057], acc is [0.9]
batch: 1700, loss is: [0.5651214], acc is [0.8]
batch: 1800, loss is: [0.3174451], acc is [0.8]
batch: 1900, loss is: [0.26437032], acc is [0.9]
b

## 导出模型

In [7]:
import paddle
from paddle.static import InputSpec

class Save(object):
    """Exports the trained ``WZPredict`` weights as a Paddle inference model."""

    def __init__(self):
        settings = Config()
        self.dataPath = settings.dataPath
        self.modelPath = settings.modelPath
        self.inferencePath = settings.inferencePath
        self.pointsPath = settings.pointsPath
        self.use_gpu = settings.use_gpu

    def run(self):
        """Load the weights from ``modelPath`` and save an inference model at ``inferencePath``."""
        net = WZPredict()

        # Restore the trained weights and switch to evaluation mode
        net.load_dict(paddle.load(self.modelPath))
        net.eval()

        # input_spec mirrors the shape of forward()'s single argument: a
        # float32 batch of any size with 29 features. (The original author
        # noted not yet knowing how to describe multiple forward arguments.)
        feature_spec = InputSpec(shape=[None, 29], dtype='float32')
        paddle.jit.save(layer=net, path=self.inferencePath, input_spec=[feature_spec])

        print("==>Inference model saved in ", self.inferencePath)
        
# Build the exporter and write the inference model.
save = Save()
save.run()

==>Inference model saved in  ./inference/wz


## 生成预测结果

In [8]:

import paddle
import os
import csv
import numpy as np
import pandas as pd
class Predict(object):
    """Runs the exported inference model on ``test.csv`` and writes ``submission.csv``."""

    data = None
    # Number of test rows sent through the model per forward pass.
    batch_size = 1024

    def __init__(self):
        cf = Config()
        self.dataPath = cf.dataPath
        self.modelPath = cf.modelPath
        self.logPath = cf.logPath
        self.pointsPath = cf.pointsPath
        self.use_gpu = cf.use_gpu
        self.inferencePath = cf.inferencePath

    def run(self):
        """Predict a ``win`` class for every test row and save the results as CSV.

        The output file has a ``win`` header followed by one predicted class
        index per test row, in file order.
        """
        # Select the device BEFORE loading so the model is placed on it.
        paddle.set_device('gpu:0' if self.use_gpu else 'cpu')
        loaded_model = paddle.jit.load(self.inferencePath)
        loaded_model.eval()

        # Recompute the normalisation statistics from all rows of train.csv,
        # the same way the training dataset does.
        train_df = pd.read_csv(self.dataPath+'/train.csv')
        train_features = train_df.drop(['id', 'timecc','win'], axis=1)
        train_datas = np.asarray(train_features.values, dtype='float64')
        mean = np.mean(train_datas, axis=0)
        std = np.std(train_datas, axis=0)
        # A constant column has std 0, which would make the division yield NaN.
        std = np.where(std == 0, 1.0, std)

        test_df = pd.read_csv(self.dataPath+'/test.csv')
        test_features = test_df.drop(['id', 'timecc'], axis=1)
        test_datas = np.asarray(test_features.values, dtype='float64')

        # Normalise the whole test set at once instead of row by row.
        normalized = ((test_datas - mean) / std).reshape([-1, 29]).astype('float32')

        results = [['win']]

        # Batched inference: one forward pass per chunk instead of one per row.
        for start in range(0, normalized.shape[0], self.batch_size):
            batch = paddle.to_tensor(normalized[start:start + self.batch_size])
            logits = loaded_model(batch)
            # argmax along the class axis gives one prediction per row
            predicted = paddle.argmax(logits, axis=1).numpy()
            results.extend([int(label)] for label in predicted)

        # Includes the header row, so this is the number of test rows + 1
        print(len(results))

        with open(self.dataPath + "/submission.csv", "w", newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(results)



# Build the predictor and generate submission.csv.
predict = Predict()
predict.run()

20001


## 压缩

In [9]:
# Package submission.csv into submission.zip using the shell `zip` tool.
!zip submission.zip submission.csv

  adding: submission.csv (deflated 93%)
