> 对于训练而言，批量加载训练数据是个基本的工作。如果使用PyTorch，以及python语言去编程，在数据集导入这块儿，比如读取csv文件，我们可以方便地使用pandas库去加载数据；对libtorch而言，虽然没有pandas类库，但是官方还是提供了一些方便的接口函数供大家使用。

In [3]:
#include <iostream>
#include <vector>  
#include <list>
#include <tuple>
#include <fstream>

/*a workaround to solve cling issue*/
#include "../macos_cling_workaround.hpp"
/*set libtorch path, load libs*/
#include "../load_libtorch.hpp"
/*set opencv4 path, load libs*/
#include "../load_opencv.hpp"
/*import custom defined macros*/
#include "../custom_def.hpp"
/*import libtorch header file*/
#include <torch/torch.h>
/*import opencv4*/
#include <opencv2/opencv.hpp>

std::cout << std::boolalpha;

# 1.加载mnist数据集

在libtorch官方例程中，提供了一个使用mnist数据集进行训练的[例子](https://pytorch.org/cppdocs/frontend.html)，其中关于加载数据集的代码如下:

```

  // Create a multi-threaded data loader for the MNIST dataset.
  auto data_loader = torch::data::make_data_loader(
      torch::data::datasets::MNIST("./data").map(
          torch::data::transforms::Stack<>()),
      /*batch_size=*/64);


  ...
  ...
  ...


  for (size_t epoch = 1; epoch <= 10; ++epoch) {
    size_t batch_index = 0;
    // Iterate the data loader to yield batches from the dataset.
    for (auto& batch : *data_loader) {
      ...
      ...
      // Execute the model on the input data.
      torch::Tensor prediction = net->forward(batch.data);
      // Compute a loss value to judge the prediction of our model.
      torch::Tensor loss = torch::nll_loss(prediction, batch.target);
      ...
      ...
      // Output the loss and checkpoint every 100 batches.
      if (++batch_index % 100 == 0) {
        std::cout << "Epoch: " << epoch << " | Batch: " << batch_index
                  << " | Loss: " << loss.item<float>() << std::endl;
      ...
      ...
      ...
      }
    }
  }


```

In [50]:
//定义一个MNIST数据集句柄，默认为训练数据集
auto mnist_train = torch::data::datasets::MNIST("../../dataset/mnist"/*数据集路径*/);
printT(mnist_train.is_train());

//定义一个MNIST数据集句柄，指定为验证数据集
auto mnist_val = torch::data::datasets::MNIST("../../dataset/mnist", torch::data::datasets::MNIST::Mode::kTest);
printT(mnist_val.is_train());

//加载好的图片存放在images()中
printT(mnist_train.images().dim());

auto s = mnist_train.images().sizes();

std::cout << "dataset info(imgs * chan * rows * cols):" << std::endl;
for(int i = 0; i < s.size(); i++) {
    std::cout << s[i];
    if (i < (s.size()-1)) std::cout << " * ";
}
std::cout << std::endl << std::endl;


//加载好的标注存放在targets()中
printT(mnist_train.targets().dim());

s = mnist_train.targets().sizes();

std::cout << "dataset info(targets):" << std::endl;
for(int i = 0; i < s.size(); i++) {
    std::cout << s[i];
    if (i < (s.size()-1)) std::cout << " * ";
}
std::cout << std::endl;


auto example = mnist_train.get(3);
// printT(example.data);
printT(example.target);

mnist_train.is_train() = 
true
<<--->>

mnist_val.is_train() = 
false
<<--->>

mnist_train.images().dim() = 
4
<<--->>

dataset info(imgs * chan * rows * cols):
60000 * 1 * 28 * 28

mnist_train.targets().dim() = 
1
<<--->>

dataset info(targets):
60000
example.target = 
1
[ CPULongType{} ]
<<--->>



# 2.构造自定义Dataset

注：本节参考[1](https://discuss.pytorch.org/t/libtorch-how-to-use-torch-datasets-for-custom-dataset/34221/2)， [2](https://github.com/mhubii/libtorch_custom_dataset/blob/master/custom_dataset.h) 以及libtorch中[MNIST的实现](https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/src/data/datasets/mnist.cpp);


*因为插件的限制，无法把所有代码归集到一个cell中执行，所以函数会写入多个cell，可能会比较分散，看官见谅；*

In [4]:
torch::Tensor parse_label_file(const std::string& label_file)
{
    // Placeholder: the label file is not opened or parsed yet, so a
    // default-constructed tensor is handed back to the caller.
    //
    // TODO: open the label file and read out all labels.
    // Label file format:
    //   obj_cls,x,y,width,height
    //     obj_cls : object class
    //     x,y     : left-top point position
    //     width   : width of anchor
    //     height  : height of anchor
    return torch::Tensor();
}

In [5]:
typedef std::tuple<torch::Tensor,torch::Tensor> data_sample;
typedef std::vector<data_sample> data_sample_list;

In [6]:
// Loads a dataset described by a single text file. The storage layout is
// assumed to resemble MSCOCO: the text file alternates between one line holding
// an image path and one line holding the path of that image's label file.
// To load the dataset only the path of this text file has to be given.
//
// file_path : path of the dataset description file.
// returns   : one (image tensor, label tensor) sample per image/label line
//             pair; empty when the description file cannot be opened.
//
// A sample is only stored once both of its lines have been read, so an image
// is never paired with a missing label or with the label of the previous
// image. A trailing image line without a label line is dropped, and entries
// whose image cannot be read are skipped with a warning on std::cerr.
data_sample_list read_data(const std::string& file_path)
{
    data_sample_list sample_list;

    std::fstream in(file_path, std::ios::in);
    if (!in.is_open()) {
        std::cerr << "read_data: cannot open dataset file: " << file_path << std::endl;
        return sample_list;
    }

    std::string image_path;
    std::string label_path;
    // Consume the file two lines at a time: image path, then label path.
    while (std::getline(in, image_path) && std::getline(in, label_path))
    {
        // Read the image from disk; cv::imread hands back an empty Mat on failure.
        cv::Mat image = cv::imread(image_path);
        if (image.empty()) {
            std::cerr << "read_data: cannot read image, skipping: " << image_path << std::endl;
            continue;
        }

        /*
         * !!!
         * Because of a xeus-cling issue, from_blob(...) fails to compile inside
         * the notebook, so this example cannot be demonstrated in jupyter;
         * put it in a standalone cpp file and build it with gcc instead.
         * !!!
         */
        // from_blob only wraps the cv::Mat buffer, so clone() is needed to own
        // the pixels after `image` goes out of scope. With its default flag
        // cv::imread yields a 3-channel image, hence the trailing 3.
        torch::Tensor t_img =
            torch::from_blob(image.data, {image.rows, image.cols, 3}, torch::kByte).clone();

        // Load the labels that belong to this image.
        torch::Tensor t_lab = parse_label_file(label_path);

        sample_list.emplace_back(t_img, t_lab);
    }

    return sample_list;
}

In [7]:
// Custom dataset that keeps every (image, label) sample in memory, modelled on
// the way the MNIST dataset stores its images and targets.
class NewDataset : public torch::data::Dataset<NewDataset>
{
  private:
    // All samples, loaded once by the constructor.
    data_sample_list dataset_;

  public:
    // file_path is the dataset description file; it stores the paths of the
    // images and of their corresponding label files.
    explicit NewDataset(const std::string& file_path/*csv or txt file*/)
    : dataset_(read_data(file_path)) {}

    // Returns the sample stored at `index` (data = image, target = label).
    // The index is not range-checked.
    // You may for example also read in a .csv file that stores locations
    // to your data and then read in the data at this step. Be creative.
    torch::data::Example<> get(size_t index) override {
      const data_sample& sample = dataset_[index];
      return {std::get<0>(sample), std::get<1>(sample)};
    }

    // Override the size method to report how many samples were loaded.
    torch::optional<size_t> size() const override {
        return dataset_.size();
    }
};

In [8]:
// A dataset defined as above can be consumed through a data loader.

// Path of the dataset description file.
std::string file_path = "../../dataset/ch02_samples/alldata.txt";
int batch_size = 64;

// Generate the data set. At this point transforms can be added to it, e.g.
// Stack<> to stack the samples of a batch into a single tensor.
auto data_set = NewDataset(file_path).map(torch::data::transforms::Stack<>());

// Generate a data loader that uses the SequentialSampler.
auto data_loader =
    torch::data::make_data_loader<torch::data::samplers::SequentialSampler>(
        std::move(data_set),
        batch_size);

// // In a for loop you can now use your data (the loader is held through a
// // pointer, so it is dereferenced for iteration).
// for (auto& batch : *data_loader) {
//     auto data = batch.data;
//     auto labels = batch.target;
//     // do your usual stuff
// }

IncrementalExecutor::executeFunction: symbol '__emutls_v._ZSt11__once_call' unresolved while linking function '_GLOBAL__sub_I_cling_module_19'!
IncrementalExecutor::executeFunction: symbol '__emutls_v._ZSt15__once_callable' unresolved while linking function '_GLOBAL__sub_I_cling_module_19'!
