# 常规赛：PALM病理性近视预测 第10名方案

本方案使用TNT模型进行训练与预测，在有限训练次数下，取得了较稳定的成绩：0.99515

**采取的训练图像预处理方案：**

* 随机垂直翻转
* 随机角度旋转--（-20°~20°）
* 缩放大小--（520，520）
* 归一化--mean:[0.2, 0.3, 0.5], std:[1., 1., 1.]（代码中未指定std，使用默认值1）

**采取的验证与预测图像预处理方案：**

* 缩放大小--（520，520）
* 归一化--mean:[0.2, 0.3, 0.5], std:[1., 1., 1.]（代码中未指定std，使用默认值1）

**数据集划分比例：** 训练集 0.8，验证集 0.2（数据打乱后按顺序切分）

**TNT模型比较：**

* 微调参数： tnt_s_patch16_224 得分 > tnt_b_patch16_224 得分

**后期优化方向：**

* 更合适的处理方式
* patch大小与数量
* 图片输入大小等

# 一、解压数据集

In [None]:
# Extract the competition archive into the current working directory
# (-o: overwrite existing files without prompting, -q: quiet output)
!unzip -oq /home/aistudio/data/data85133/常规赛：PALM病理性近视预测.zip

# 二、导入相应的包

In [1]:
import pandas as pd
import os
import time
from tqdm import tqdm
import cv2 as cv
import numpy as np
import math

import paddle
from paddle import nn
from paddle import optimizer
from paddle import regularizer
from paddle import metric
from paddle.nn import loss
from paddle.nn import Layer

from paddle.io import Dataset, DataLoader
from paddle.vision import transforms

# 三、解析数据，制作Dataset

In [2]:
# Directory that holds the training fundus images
Image_path = '常规赛：PALM病理性近视预测/Train/fundus_image'
# Label spreadsheet for the training set; later cells read column 0 as the
# image file name and column 1 as the class label
Train_data = pd.read_excel('常规赛：PALM病理性近视预测/Train/Classification.xlsx')
Train_data.head()

In [3]:
# Prefix every image file name (first column) with the training image
# directory so that each row holds a path that can be opened directly.
# The whole column is rewritten in one assignment instead of cell by cell.
path_column = Train_data.columns[0]
Train_data[path_column] = [
    os.path.join(Image_path, file_name) for file_name in Train_data[path_column]
]
# Shuffle the rows so the later positional train/validation split is random.
# NOTE: no random_state is given, so the resulting split differs between runs.
Train_data = Train_data.sample(frac=1.0).reset_index(drop=True)
Train_data.head()

In [4]:
# Build the test table: one row per file found under the test image directory,
# with a placeholder label of 0 in the second column.
Test_path = '常规赛：PALM病理性近视预测/PALM-Testing400-Images'
test_rows = [
    [file_name, 0]
    for _, _, file_names in os.walk(Test_path)
    for file_name in file_names
]
# Going through a NumPy array stores both columns as strings
Test_data = pd.DataFrame(np.asarray(test_rows))
# Sort by file name so the row order is deterministic
Test_data = Test_data.sort_values(by=0, ascending=True).reset_index(drop=True)
# Turn the bare file names into paths relative to the working directory
Test_data[0] = [os.path.join(Test_path, file_name) for file_name in Test_data[0]]
Test_data.head()

In [5]:
class _FundusDataset(Dataset):
    '''Shared loading logic for the fundus image datasets.

    Each row of ``df`` is read positionally: column 0 is the image path and
    column 1 is a value convertible to ``int`` that is used as the label.
    '''

    def __init__(self, df, trans):
        super(_FundusDataset, self).__init__()

        self.df = df
        self.trans = trans
        self.lens = len(df)

    def __getitem__(self, indexs):
        '''Return ``(transformed_image, label_tensor)`` for row ``indexs``.'''
        im_data, im_label = self._load_img_and_label(self.df, indexs)

        im_data = self.trans(im_data)

        return im_data, paddle.to_tensor(im_label)

    def _load_img_and_label(self, df, index):
        '''Load the image and label stored in row ``index`` of ``df``.

        Args:
            df: table with the image path in column 0 and the label in column 1.
            index: positional row index.

        Returns:
            A tuple ``(image, label)`` where ``image`` is a float32 array in
            RGB channel order and ``label`` is an ``int``.

        Raises:
            OSError: if OpenCV cannot read the image file.
        '''
        assert index < self.lens, \
            'please check the index, which has more than the dataset length!'

        im_path = df.iloc[index, 0]

        # The second argument of cv.imread is a read flag, not a colour
        # conversion code, so the image is read as 3-channel BGR first and
        # converted to RGB explicitly afterwards.
        im_data = cv.imread(im_path, cv.IMREAD_COLOR)
        if im_data is None:
            # cv.imread signals failure by returning None instead of raising
            raise OSError('failed to read image: {}'.format(im_path))
        im_data = cv.cvtColor(im_data, cv.COLOR_BGR2RGB)

        im_label = int(df.iloc[index, 1])

        # NOTE(review): pixel values stay in the 0-255 range here; confirm
        # whether ToTensor rescales float32 input before Normalize is applied.
        return im_data.astype('float32'), im_label

    def __len__(self):
        return self.lens


class Train_Dataset(_FundusDataset):
    '''Training dataset with random augmentation.

    Args:
        df: table with the image path in column 0 and the label in column 1.
        trans: optional transform applied to every image. When ``None`` the
            default pipeline is used: random vertical flip, random rotation
            with ``RandomRotation(20)``, resize to 520x520, ``ToTensor`` and
            ``Normalize`` with mean ``[0.2, 0.3, 0.5]``.
    '''

    def __init__(self, df, trans=None):
        if trans is None:
            trans = transforms.Compose([
                transforms.RandomVerticalFlip(),
                transforms.RandomRotation(20),
                transforms.Resize(size=(520, 520)),
                transforms.ToTensor(),
                transforms.Normalize([0.2, 0.3, 0.5])
            ])

        super(Train_Dataset, self).__init__(df, trans)


class Test_Dataset(_FundusDataset):
    '''Validation / test dataset without random augmentation.

    Args:
        df: table with the image path in column 0 and the label in column 1.
        trans: optional transform applied to every image. When ``None`` the
            default pipeline is used: resize to 520x520 (the same input size
            as the training pipeline), ``ToTensor`` and ``Normalize`` with
            mean ``[0.2, 0.3, 0.5]``.
    '''

    def __init__(self, df, trans=None):
        if trans is None:
            trans = transforms.Compose([
                transforms.Resize(size=(520, 520)),
                transforms.ToTensor(),
                transforms.Normalize([0.2, 0.3, 0.5])
            ])

        super(Test_Dataset, self).__init__(df, trans)

# 四、配置训练参数

In [6]:
# Training configuration shared by the following cells
Train_Paramdict = {
    'data_length': len(Train_data),  # number of labelled samples
    'train_frac': 0.8,               # fraction of the samples used for training
    'num_class': 2,                  # number of classes
    'epoches': 100,                  # number of training epochs
    'batchsize': 8,                  # mini-batch size
    'lr': 0.0001,                    # learning rate
    'l2': 0.0005,                    # L2 regularization coefficient
}

In [7]:
# Positional split of the training table: the first train_frac of the rows is
# used for fitting, the remaining rows for evaluation
split_index = int(Train_Paramdict['data_length'] * Train_Paramdict['train_frac'])
Fit_data = Train_data.iloc[:split_index]
Eval_data = Train_data.iloc[split_index:]

In [8]:
# Wrap each table in a dataset and a batched loader. The evaluation split uses
# Test_Dataset, i.e. the pipeline without random augmentation.
batch_size = Train_Paramdict['batchsize']

Fit_dataset = Train_Dataset(Fit_data)
Fit_dataloader = DataLoader(Fit_dataset, batch_size=batch_size, shuffle=True)

Eval_dataset = Test_Dataset(Eval_data)
Eval_dataloader = DataLoader(Eval_dataset, batch_size=batch_size)

# Loader over the complete labelled set (training plus evaluation rows)
All_dataset = Train_Dataset(Train_data)
All_dataloader = DataLoader(All_dataset, batch_size=batch_size, shuffle=True)

# 五、导入模型

TNT模型在TNT.py中

In [9]:
# Build the tnt_s_patch16_224 network with img_size=520 and two output classes,
# then wrap it in the high-level paddle.Model API
from TNT import tnt_s_patch16_224, tnt_b_patch16_224

model = paddle.Model(tnt_s_patch16_224(img_size=520, num_classes=2))

# Linear warm-up from 0 to the configured learning rate over 2000 steps
base_lr = Train_Paramdict['lr']
lr = optimizer.lr.LinearWarmup(
    learning_rate=base_lr,
    warmup_steps=2000,
    start_lr=0,
    end_lr=base_lr,
)

# Adam optimizer with L2 weight decay, cross-entropy loss, accuracy metric
O = optimizer.Adam(
    lr,
    parameters=model.parameters(),
    weight_decay=regularizer.L2Decay(Train_Paramdict['l2']),
)
L = loss.CrossEntropyLoss()
M = metric.Accuracy()

model.prepare(O, L, M)

## 1. 训练

In [10]:
# Train on the fitting split and evaluate on the held-out split, for the
# number of epochs set in Train_Paramdict
model.fit(
    Fit_dataloader,
    Eval_dataloader,
    epochs=Train_Paramdict['epoches']
)

## 2.加载测试数据

In [13]:
# Wrap the test table in a dataset and a batched loader; shuffle is not
# enabled, so batches follow the row order of Test_data
Test_dataset = Test_Dataset(Test_data)
Test_dataloader = DataLoader(Test_dataset, batch_size=Train_Paramdict['batchsize'])

## 3.预测结果并生成提交结果

In [14]:
# Run inference on the test loader; the following cells read results[0] as
# the per-batch outputs of the network
results = model.predict(Test_dataloader)

In [15]:
# Convert the nested prediction lists into a NumPy array.
# NOTE(review): this presumably only yields a regular array when every batch
# has the same size — confirm if the test set size or batch size changes.
results = np.asarray(results)

In [17]:
import paddle.nn.functional as F

# Turn each batch of network outputs into class probabilities with softmax and
# collect the probability at class index 1 for every test image
submit_result = []
for batch_output in results[0]:
    batch_probs = F.softmax(paddle.to_tensor(batch_output))
    submit_result.extend(batch_probs[:, 1].numpy().tolist())
len(submit_result)

In [18]:
# Convert the collected probabilities from a Python list to a NumPy array
submit_result = np.asarray(submit_result)

In [19]:
# Replace the placeholder labels in the second column with the predicted
# class-1 probabilities
Test_data.iloc[:, 1] = submit_result
Test_data.head()

In [20]:
# Work on a copy so that Test_data itself is left unchanged by the
# submission formatting below
Submit_data = Test_data.copy()
Submit_data.head()

In [21]:
# Rename the two columns to the header names written to the submission file
Submit_data.columns = ['FileName', 'PM Risk']
Submit_data.head()

In [22]:
# Reduce every path in the first column to its bare file name.
# os.path.basename is used instead of slicing the last 9 characters, so the
# result is correct for file names of any length.
name_column = Submit_data.columns[0]
Submit_data[name_column] = [
    os.path.basename(image_path) for image_path in Submit_data[name_column]
]
Submit_data.head()

In [23]:
# Write the submission file without the index column; float_format="%.1f"
# formats floating-point values with a single decimal place
Submit_data.to_csv('Classification_Results.csv', index=False, float_format="%.1f")