
 * @Author: cunyu277 2465899266@qq.com
 * @Date: 2025-04-11 00:27:45
 * @LastEditors: cunyu277 2465899266@qq.com
 * @LastEditTime: 2025-04-14 15:59:27
 * @FilePath: \crop_yield_prediction\cunuu\clean\one.ipynb
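 * @Description: Integrity checks for the MODIS GeoTIFF inputs: per-file band counts and file-name format, plus per-region/year completeness across the five dataset folders.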
 * 
 * Copyright (c) 2025 by yh, All Rights Reserved. 



### 先对数据进行检查，看看是否存在缺失，以及是否对齐

In [2]:
import os
import rasterio
from collections import defaultdict

def check_modis_data(base_path="E:\\Crop\\NorthChina"):
    """Validate the GeoTIFF files under *base_path* and print a report.

    For each expected sub-directory (``ENTIRE``, ``FPAR``, ``LAI``, ``LULC``,
    ``Temperature``) every ``*.tif`` file is checked for:

    * file-name format: at least three ``_``-separated parts, the last of
      which (before the first ``.``) consists of digits only;
    * band count: ``src.count`` of the opened raster must equal the number
      configured for that sub-directory.

    Args:
        base_path: Directory that contains the per-dataset sub-directories.

    Returns:
        None. The findings are printed to stdout in four sections: missing
        directories, invalid file names, band mismatches and read errors.
    """
    # Sub-directory name -> display name used in the report + expected band count.
    data_dirs = {
        "ENTIRE": {"name": "反射率", "bands": 112},
        "FPAR": {"name": "FPAR", "bands": 16},
        "LAI": {"name": "叶面积指数", "bands": 16},
        "LULC": {"name": "土地利用", "bands": 1},
        "Temperature": {"name": "温度", "bands": 32}
    }

    results = defaultdict(list)

    for dir_key, spec in data_dirs.items():
        dir_path = os.path.join(base_path, dir_key)
        # isdir instead of exists: a regular file that happens to carry the
        # directory's name would pass an existence check and then make
        # os.listdir raise NotADirectoryError.
        if not os.path.isdir(dir_path):
            results['missing_dirs'].append(spec['name'])
            continue

        # os.listdir returns entries in arbitrary order; sort so that the
        # report is reproducible between runs and machines.
        for file in sorted(os.listdir(dir_path)):
            if not file.endswith('.tif'):
                continue

            # Check the file-name format.
            parts = file.split('_')
            if len(parts) < 3 or not parts[-1].split('.')[0].isdigit():
                results['invalid_files'].append(f"{spec['name']}/{file}")
                continue

            # Check the band count. Any failure while opening the raster is
            # deliberately turned into a report entry instead of aborting
            # the whole scan.
            try:
                with rasterio.open(os.path.join(dir_path, file)) as src:
                    band_count = src.count
            except Exception as e:
                results['read_errors'].append(f"{spec['name']}/{file} ({str(e)})")
                continue

            if band_count != spec['bands']:
                results['band_mismatch'].append(
                    f"{spec['name']}/{file} (应有{spec['bands']}波段，实际{band_count}波段)"
                )

    # Print the report: one section per result category, in a fixed order.
    report_sections = (
        ("缺失目录", 'missing_dirs'),
        ("无效文件名", 'invalid_files'),
        ("波段异常", 'band_mismatch'),
        ("读取错误", 'read_errors'),
    )
    print("=== 数据完整性检查报告 ===")
    for title, key in report_sections:
        # .get() does not trigger the defaultdict factory, so a category
        # without findings yields an empty list here.
        entries = results.get(key, [])
        print(f"\n[{title}] ({len(entries)}个):")
        print('\n'.join(entries or ['无']))

# Run the check against the default base path.
check_modis_data()


=== 数据完整性检查报告 ===

[缺失目录] (0个):
无

[无效文件名] (0个):
无

[波段异常] (0个):
无

[读取错误] (0个):
无


In [3]:
def check_data_completeness(base_path="E:\\Crop\\NorthChina"):
    """Report which region/year combinations lack one or more datasets.

    Every ``*.tif`` file in the dataset sub-directories of *base_path* is
    expected to be named ``<province code>_<area code>_<year>...``. The first
    two ``_``-separated parts form the region code and the third part (up to
    its first ``.``) is the year. A region/year pair is complete when a file
    for it exists in every dataset sub-directory.

    Args:
        base_path: Directory that contains the per-dataset sub-directories.

    Returns:
        None. Warnings for missing directories and the completeness result
        are printed to stdout.
    """
    # Sub-directory name -> display name of the dataset.
    data_types = {
        "ENTIRE": "反射率",
        "FPAR": "FPAR",
        "LAI": "叶面积指数",
        "LULC": "土地利用",
        "Temperature": "温度"
    }

    # {region code: {year: set of dataset display names found}}
    records = defaultdict(lambda: defaultdict(set))

    # Walk every dataset directory and register the files found in it.
    for dir_name, data_name in data_types.items():
        dir_path = os.path.join(base_path, dir_name)
        # isdir instead of exists: os.listdir would raise on a regular file.
        if not os.path.isdir(dir_path):
            print(f"警告：缺失目录 {dir_path}")
            continue

        for filename in os.listdir(dir_path):
            if not filename.endswith('.tif'):
                continue

            # Parse <province code>_<area code>_<year>; skip anything shorter.
            parts = filename.split('_')
            if len(parts) < 3:
                continue

            region_code = f"{parts[0]}_{parts[1]}"
            year = parts[2].split('.')[0]  # strip the ".tif" suffix
            records[region_code][year].add(data_name)

    print("=== 数据完整性检查结果 ===")

    # Without any parsed file there is nothing to compare; claiming that the
    # data is complete would be a false positive (e.g. wrong base path).
    if not records:
        print("未找到任何可解析的 .tif 文件，无法判断数据完整性")
        return

    # Collect the gaps. Regions and years are visited in sorted order and the
    # missing names follow the order of data_types, so the report is
    # deterministic (joining a set directly is not).
    missing_data = defaultdict(list)
    for region in sorted(records):
        for year in sorted(records[region]):
            present = records[region][year]
            missing = [name for name in data_types.values() if name not in present]
            if missing:
                missing_data[region].append((year, missing))

    if not missing_data:
        print(f"所有区域年度数据完整（{len(data_types)}种数据类型齐全）")
    else:
        for region, missing in missing_data.items():
            print(f"\n区域 {region} 缺失数据:")
            for year, datasets in missing:
                print(f"  {year} 年缺失: {', '.join(datasets)}")

# Run the completeness check against the default base path.
check_data_completeness()


=== 数据完整性检查结果 ===
所有区域年度数据完整（5种数据类型齐全）
