## 相似图片去重

In [None]:
import os
import cv2
import numpy as np

In [None]:
# Folder whose images are scanned for near-duplicates.
images_dir = r'D:\ProgramData\Temp\BP3730-2\L N'
# Side length, in pixels, of the square (n x n) every image is resized to.
n = 64
# Correlation-coefficient threshold used when comparing two images.
beta = 0.8

In [None]:
# Step 1: collect the paths of the regular files inside images_dir.
# os.listdir() returns names in arbitrary order and also lists
# sub-directories, so the names are sorted (later steps refer to images by
# their position in this list, which should be reproducible between runs)
# and anything that is not a regular file is left out.
images_path_list = [
    entry_path
    for entry_path in (
        os.path.join(images_dir, filename)
        for filename in sorted(os.listdir(images_dir))
    )
    if os.path.isfile(entry_path)
]

In [None]:
# Step 2: decode every file as grayscale, resize it to n x n and flatten it,
# producing one 1-D pixel vector per readable image.
def _load_flattened_images(paths, size):
    """Decode, resize and flatten the images found at *paths*.

    Each file is read with np.fromfile and decoded from memory with
    cv2.imdecode. Empty files and files OpenCV cannot decode (imdecode
    yields None) are reported and skipped instead of aborting the whole run.

    Args:
        paths: iterable of file paths to load.
        size: side length of the square the images are resized to.

    Returns:
        (kept_paths, vectors): the paths that decoded successfully and, at
        the same positions, their flattened size*size grayscale pixels.
    """
    kept_paths = []
    vectors = []
    for image_path in paths:
        raw_bytes = np.fromfile(image_path, np.uint8)
        # An empty buffer is never handed to imdecode.
        gray = cv2.imdecode(raw_bytes, cv2.IMREAD_GRAYSCALE) if raw_bytes.size else None
        if gray is None:
            print(f"Skipped unreadable image: {image_path}")
            continue
        kept_paths.append(image_path)
        vectors.append(cv2.resize(gray, (size, size)).ravel())
    return kept_paths, vectors


# images_path_list is replaced by the readable subset so that position i in
# `images` still refers to the file at position i of the path list.
images_path_list, images = _load_flattened_images(images_path_list, n)

In [None]:
# Step 3: correlation coefficient between every pair of flattened images
# (one row per image). The diagonal holds each image's correlation with
# itself, so it is overwritten with 0.
corr_matrix = np.corrcoef(images)
np.fill_diagonal(corr_matrix, val=0)

In [None]:
# Step 4: (row, column) index pairs whose correlation coefficient is
# greater than beta, one pair per row of the resulting array.
similar_images = np.transpose(np.nonzero(corr_matrix > beta))

In [None]:
# Step 5: build the groups of mutually similar images.
def _merge_similar_pairs(pairs):
    """Return the connected components formed by the index *pairs*.

    A union-find structure is used so that a pair linking two groups that
    were created earlier merges them. Adding the pair to only the first
    matching group would leave the same index in two different groups.

    Args:
        pairs: iterable of (i, j) index pairs that are similar.

    Returns:
        A list of sets of int; every index appears in exactly one set.
    """
    parent = {}

    def find(node):
        # Locate the representative, then point the visited nodes at it.
        root = node
        while parent[root] != root:
            root = parent[root]
        while parent[node] != root:
            parent[node], node = root, parent[node]
        return root

    for first, second in pairs:
        first, second = int(first), int(second)
        parent.setdefault(first, first)
        parent.setdefault(second, second)
        root_a, root_b = find(first), find(second)
        if root_a != root_b:
            # The smaller index becomes the representative of the merged set.
            parent[max(root_a, root_b)] = min(root_a, root_b)

    components = {}
    for node in parent:
        components.setdefault(find(node), set()).add(node)
    return list(components.values())


groups = _merge_similar_pairs(similar_images)

In [None]:
# Step 6: delete the duplicates, keeping the lowest index of every group.
# The indices are gathered in a set first: if an index is listed in more
# than one group, removing it once per group would raise FileNotFoundError
# on the second attempt.
indices_to_delete = set()
for group in groups:
    keep_index = min(group)  # the first image of the group is kept
    indices_to_delete.update(index for index in group if index != keep_index)

for index in sorted(indices_to_delete):
    os.remove(images_path_list[index])


In [3]:
import os

# Create the three numbered sub-folders under `path`.
path = r'D:\ProgramData\Temp\[Marugaogaski]'
for i in range(3):
    # Named target_dir rather than `dir` so the builtin dir() is not
    # shadowed for the rest of the session.
    target_dir = os.path.join(path, '手' + str(i + 1))
    # exist_ok=True makes an already existing folder a no-op without a
    # separate existence check.
    os.makedirs(target_dir, exist_ok=True)


In [2]:
def divide_array_into_groups(arr, num_groups):
    """Split *arr* into *num_groups* consecutive slices of near-equal length.

    When the length is not evenly divisible, the leading slices each receive
    one extra element. Slices may be empty when *num_groups* exceeds the
    length of *arr*.

    Args:
        arr: a sliceable sequence.
        num_groups: number of slices to produce.

    Returns:
        A list with *num_groups* slices of *arr*, or the message string
        "Number of groups should be greater than 0." when *num_groups* is
        zero or negative.
    """
    if num_groups <= 0:
        return "Number of groups should be greater than 0."

    base, extra = divmod(len(arr), num_groups)

    chunks = []
    cursor = 0
    for position in range(num_groups):
        # The first `extra` slices absorb the leftover elements.
        width = base + 1 if position < extra else base
        chunks.append(arr[cursor:cursor + width])
        cursor += width

    return chunks

# Example: split the numbers 1..8 into three groups and print the outcome.
your_array = list(range(1, 9))
number_of_groups = 3
result = divide_array_into_groups(your_array, number_of_groups)
print(result)


[[1, 2, 3], [4, 5, 6], [7, 8]]


## 文件去重

In [2]:
import os
import re
from collections import defaultdict

# Directory whose files are de-duplicated. A raw string keeps the backslash
# literal: '\T' is not a valid escape sequence and triggers a warning when
# written in a normal string literal.
directory = r'D:\Temp'

# Names of the regular files in that directory. Sub-directories are left out
# because os.remove() cannot delete them.
files = [
    name
    for name in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, name))
]

# Split a file name into its group key and its trailing numeric suffix.
def parse_filename(filename):
    """Return ``(prefix, suffix)`` for a name of the form ``<prefix>-<N>``.

    The pattern is first tried on the whole name. If that fails and the name
    has an extension, it is retried on the part before the extension, so
    ``report-2.txt`` is recognised as well; the extension is then appended
    to the returned prefix, which makes the prefix the file name with the
    ``-<N>`` part removed (``report.txt``).

    Args:
        filename: bare file name (no directory part).

    Returns:
        ``(prefix, suffix)`` where *suffix* is an int, or None when the name
        ends with a dash that is not followed by digits. ``(None, None)``
        when the name does not match at all.
    """
    pattern = r'^(.*?)-(\d+)?$'
    ext = ''
    match = re.match(pattern, filename)
    if not match:
        stem, ext = os.path.splitext(filename)
        match = re.match(pattern, stem) if ext else None
    if not match:
        return None, None

    prefix, suffix = match.groups()
    if prefix:
        # An empty prefix is kept empty so that such names stay falsy.
        prefix += ext
    return prefix, (int(suffix) if suffix else None)

# Bucket the file names by the prefix returned by parse_filename; every
# entry is a (suffix, filename) tuple. Names without a usable prefix are
# ignored.
grouped_files = defaultdict(list)
for filename in files:
    prefix, suffix = parse_filename(filename)
    if prefix:
        grouped_files[prefix].append((suffix, filename))

# For every group keep one file and delete the others.
for prefix, files_with_suffixes in grouped_files.items():
    # Descending by numeric suffix, with names that have no number placed
    # ahead of all numbered ones. The two-part key avoids comparing None
    # with an int, which raises TypeError.
    sorted_files = sorted(
        files_with_suffixes,
        key=lambda item: (item[0] is None, item[0] if item[0] is not None else 0),
        reverse=True,
    )

    # The first entry is kept; everything after it is removed.
    latest_suffix, latest_filename = sorted_files[0]
    for suffix, filename in sorted_files[1:]:
        file_path = os.path.join(directory, filename)
        os.remove(file_path)
        print(f"Deleted {file_path}")

    # Strip the numeric suffix from the kept file. The group key is the
    # file name with its "-<N>" part removed, so it is the target name.
    if latest_suffix is not None:
        old_file_path = os.path.join(directory, latest_filename)
        new_file_path = os.path.join(directory, prefix)
        if os.path.exists(new_file_path):
            # Never overwrite (or fail on) a file that already has the
            # un-suffixed name.
            print(f"Skipped renaming {old_file_path}: {new_file_path} already exists")
        else:
            os.rename(old_file_path, new_file_path)
            print(f"Renamed {old_file_path} to {new_file_path}")

KeyboardInterrupt: 