In [26]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

import os
import pathlib
from pathlib import Path
from typing import Tuple, Dict, List
# 查找指定文件夹中的classes

# Locations of the dataset root and of the calligraphy image library inside it.
DATA_PATH = Path("data/")
IMAGE_PATH = DATA_PATH / "wordlib"

# Make sure the image folder exists before anything reads from it; when it is
# missing, create it together with any missing parent folders.
if IMAGE_PATH.is_dir():
    print(f"{IMAGE_PATH} 文件夹存在，可以使用...")
else:
    # Typo fixed in the user-facing message: "文件平" -> "文件夹" (folder).
    print(f"{IMAGE_PATH}文件夹不存在，创建中...")
    IMAGE_PATH.mkdir(parents=True, exist_ok=True)

# Every *.gif file directly inside the image folder (empty if it was just created).
IMAGE_PATH_LIST = list(IMAGE_PATH.glob("*.gif"))

# ### 准备数据，查找指定文件夹中包含哪些文字，并设置其classes和labels

def find_classes(directory: str,ext:str='gif',scan_images=True) -> Tuple[List[str], Dict[str, int],List[str]]:
    """Collect the character classes encoded in the image file names of a folder.

    Image files follow the naming convention ``word_font_writer_number.gif``
    (for example ``予_行书_鲜于枢_12046.gif``). The text before the first
    underscore is the character the image depicts and is used as the class.

    Args:
        directory (str): Folder to scan for image files (a ``pathlib.Path``
            works as well). Only used when ``scan_images`` is true.
        ext (str): File extension, without the dot, of the images to scan.
        scan_images (bool): When true, scan ``directory`` and (re)write the
            cache files ``data/bd_classes.csv``, ``data/bd_class_to_idx.csv``
            and ``data/bd_images_names.csv``. When false, skip the scan and
            load the previously cached result from those files.

    Returns:
        Tuple[List[str], Dict[str, int], List[str]]: ``(classes, class_to_idx,
        image_names)``: the sorted distinct characters, a mapping from each
        character to its position in that list, and the sorted image file
        names.

    Raises:
        FileNotFoundError: If the scan finds no matching file. Loading from
            the cache also fails with this error when a cache file is missing.

    Example:
        For a folder holding ``予_行书_鲜于枢_12046.gif`` and
        ``大_楷书_颜真卿_7.gif`` the result is
        ``(["予", "大"], {"予": 0, "大": 1}, [<the two file names, sorted>])``.
    """
    cache_dir = pathlib.Path('data')
    classes_csv = cache_dir / 'bd_classes.csv'
    class_to_idx_csv = cache_dir / 'bd_class_to_idx.csv'
    images_csv = cache_dir / 'bd_images_names.csv'

    if scan_images:
        # 1. List the image files. glob() yields entries in a file-system
        #    dependent order, so sort to get the same result on every platform.
        images_name_list = sorted(
            path.name for path in pathlib.Path(directory).glob(f"*.{ext}")
        )

        # Several images exist per character, hence the set for de-duplication.
        classes = sorted({name.split('_')[0] for name in images_name_list})

        # 2. Nothing found: the folder is empty, missing, or holds other file types.
        if not classes:
            raise FileNotFoundError(f"{directory}路径下的文件可能不存在或没有按要求命名（文件命名规则为word_font_writer_number.gif)")

        class_to_idx = {word: i for i, word in enumerate(classes)}

        # 3. Cache the scan result as CSV files; create the cache folder first so
        #    the function also works when it is called before 'data/' exists.
        cache_dir.mkdir(parents=True, exist_ok=True)
        pd.DataFrame(classes, columns=['classes']).to_csv(
            classes_csv, header=True, index=False)
        pd.DataFrame(list(class_to_idx.items()), columns=['classes', 'idx']).to_csv(
            class_to_idx_csv, header=True, index=False)
        pd.DataFrame(images_name_list, columns=['images']).to_csv(
            images_csv, header=True, index=False)
    else:
        # Read every column as text and disable NaN detection so that class
        # names such as "1" or "NA" come back exactly as they were written.
        read_opts = dict(header=0, dtype=str, keep_default_na=False)

        classes = pd.read_csv(classes_csv, **read_opts)['classes'].tolist()

        df_class_to_idx = pd.read_csv(class_to_idx_csv, **read_opts)
        # Plain ints, matching what the scanning branch returns.
        class_to_idx = {
            word: int(idx)
            for word, idx in zip(df_class_to_idx['classes'], df_class_to_idx['idx'])
        }

        images_name_list = pd.read_csv(images_csv, **read_opts)['images'].tolist()

    return classes, class_to_idx, images_name_list


## Base data for model training -- important, do not modify.
(
    images_classes_list,
    word_classes_dict,
    images_name_list,
) = find_classes(directory=IMAGE_PATH, ext='gif')

data\wordlib 文件夹存在，可以使用...


In [27]:
# Same three results, this time loaded from the CSV cache instead of rescanning.
(
    images_classes_list,
    word_classes_dict,
    images_name_list,
) = find_classes(directory=IMAGE_PATH, ext='gif', scan_images=False)

In [38]:
def find_writer_classes(directory: str,ext:str='gif',scan_images=True) -> Tuple[List[str], Dict[str, int],List[str]]:
    """Collect the writer (calligrapher) classes encoded in image file names.

    Image files follow the naming convention ``word_font_writer_number.gif``
    (for example ``予_行书_鲜于枢_12046.gif``). The third underscore-separated
    field is the writer and is used as the class.

    Args:
        directory (str): Folder to scan for image files (a ``pathlib.Path``
            works as well). Only used when ``scan_images`` is true.
        ext (str): File extension, without the dot, of the images to scan.
        scan_images (bool): When true, scan ``directory`` and (re)write the
            cache files ``data/bd_writer_classes.csv``,
            ``data/bd_writer_class_to_idx.csv`` and
            ``data/bd_images_writer_name.csv``. When false, skip the scan and
            load the previously cached result from those files.

    Returns:
        Tuple[List[str], Dict[str, int], List[str]]: ``(writer_classes,
        writer_class_to_idx, image_names)``: the sorted distinct writers, a
        mapping from each writer to its position in that list, and the sorted
        names of the image files that follow the naming convention.

    Raises:
        FileNotFoundError: If the scan finds no file that follows the naming
            convention. Loading from the cache also fails with this error when
            a cache file is missing.

    Example:
        For a folder holding ``予_行书_鲜于枢_12046.gif`` and
        ``一_草书_王羲之_130994.gif`` the result is
        ``(["王羲之", "鲜于枢"], {"王羲之": 0, "鲜于枢": 1}, [<the two file names, sorted>])``.
    """
    cache_dir = pathlib.Path('data')
    classes_csv = cache_dir / 'bd_writer_classes.csv'
    class_to_idx_csv = cache_dir / 'bd_writer_class_to_idx.csv'
    images_csv = cache_dir / 'bd_images_writer_name.csv'

    if scan_images:
        # 1. List the image files. glob() yields entries in a file-system
        #    dependent order, so sort to get the same result on every platform.
        candidate_names = sorted(
            path.name for path in pathlib.Path(directory).glob(f"*.{ext}")
        )

        writer_set = set()  # several images exist per writer, so de-duplicate
        images_writer_name_list = []
        for name in candidate_names:
            # Split the stem (not the full name) so the extension can never end
            # up inside the writer field.
            fields = pathlib.PurePath(name).stem.split('_')
            if len(fields) < 3:
                # The name has no writer field; skip it instead of crashing
                # with an IndexError.
                continue
            writer_set.add(fields[2])
            images_writer_name_list.append(name)

        writer_classes = sorted(writer_set)

        # 2. No usable file: the folder is empty or missing, or nothing in it
        #    follows the naming convention.
        if not writer_classes:
            raise FileNotFoundError(f"{directory}路径下的文件可能不存在或没有按要求命名（文件命名规则为word_font_writer_number.gif)")

        writer_class_to_idx = {writer: i for i, writer in enumerate(writer_classes)}

        # 3. Cache the scan result as CSV files; create the cache folder first so
        #    the function also works when it is called before 'data/' exists.
        cache_dir.mkdir(parents=True, exist_ok=True)
        pd.DataFrame(writer_classes, columns=['writer_classes']).to_csv(
            classes_csv, header=True, index=False)
        pd.DataFrame(list(writer_class_to_idx.items()), columns=['writer_classes', 'idx']).to_csv(
            class_to_idx_csv, header=True, index=False)
        pd.DataFrame(images_writer_name_list, columns=['images']).to_csv(
            images_csv, header=True, index=False)
    else:
        # Read every column as text and disable NaN detection so that writer
        # names such as "NA" come back exactly as they were written.
        read_opts = dict(header=0, dtype=str, keep_default_na=False)

        writer_classes = pd.read_csv(classes_csv, **read_opts)['writer_classes'].tolist()

        df_writer_class_to_idx = pd.read_csv(class_to_idx_csv, **read_opts)
        # Plain ints, matching what the scanning branch returns.
        writer_class_to_idx = {
            writer: int(idx)
            for writer, idx in zip(df_writer_class_to_idx['writer_classes'],
                                   df_writer_class_to_idx['idx'])
        }

        images_writer_name_list = pd.read_csv(images_csv, **read_opts)['images'].tolist()

    return writer_classes, writer_class_to_idx, images_writer_name_list

## Base data for model training -- important, do not modify.
(
    images_writer_classes_list,
    word_writer_classes_dict,
    images_writer_name_list,
) = find_writer_classes(directory=IMAGE_PATH, ext='gif')


In [39]:
# Same three results, this time loaded from the CSV cache instead of rescanning.
(
    images_writer_classes_list,
    word_writer_classes_dict,
    images_writer_name_list,
) = find_writer_classes(directory=IMAGE_PATH, ext='gif', scan_images=False)

In [42]:
# Display the image file names returned by find_writer_classes.
# NOTE(review): this renders the entire list as cell output; consider showing
# only a slice (e.g. the first few entries) to keep the notebook small.
images_writer_name_list

['一_楷书_孙秋生造像_103084.gif',
 '一_楷书_柳公权_103080.gif',
 '一_楷书_柳公权_103081.gif',
 '一_楷书_欧阳询_103082.gif',
 '一_楷书_欧阳询_103083.gif',
 '一_楷书_虞世南_103087.gif',
 '一_楷书_褚遂良_103077.gif',
 '一_楷书_褚遂良_103078.gif',
 '一_楷书_颜真卿_103085.gif',
 '一_楷书_颜真卿_103086.gif',
 '一_楷书_高贞碑_103079.gif',
 '一_篆书_徐三庚_1.gif',
 '一_草书_吴镇_131000.gif',
 '一_草书_孙过庭_130992.gif',
 '一_草书_宋克_130991.gif',
 '一_草书_张旭_131013.gif',
 '一_草书_张瑞图_131012.gif',
 '一_草书_张芝_131014.gif',
 '一_草书_徐伯清_131002.gif',
 '一_草书_徐伯清_131003.gif',
 '一_草书_徐伯清_131004.gif',
 '一_草书_徐伯清_131005.gif',
 '一_草书_徐伯清_131006.gif',
 '一_草书_徐伯清_131007.gif',
 '一_草书_徐伯清_131008.gif',
 '一_草书_徐伯清_131009.gif',
 '一_草书_徐伯清_131010.gif',
 '一_草书_徐伯清_131011.gif',
 '一_草书_怀素_130982.gif',
 '一_草书_怀素_130983.gif',
 '一_草书_敬世江_130987.gif',
 '一_草书_文天祥_130999.gif',
 '一_草书_杜预_130981.gif',
 '一_草书_毛泽东_130989.gif',
 '一_草书_王守仁_130993.gif',
 '一_草书_王羲之_130994.gif',
 '一_草书_王羲之_130995.gif',
 '一_草书_王羲之_130996.gif',
 '一_草书_王羲之_130997.gif',
 '一_草书_王羲之_130998.gif',
 '一_草书_皇象_130984.gif',
 '一_草书_米芾_130990.gif',
 '一_