
 * @Author: cunyu277 2465899266@qq.com
 * @Date: 2025-04-16 23:49:22
 * @LastEditors: cunyu277 2465899266@qq.com
 * @LastEditTime: 2025-04-16 23:50:57
 * @FilePath: \crop_yield_prediction\cunuu\predict\view.ipynb
 * @Description: 
 * 
 * Copyright (c) 2025 by yh, All Rights Reserved. 



In [7]:
import numpy as np
import os

In [8]:
def check_npz_contents(directory: str):
    """
    Print the name, shape and dtype of every array stored in each .npz file
    located directly inside a directory.

    Args:
        directory: Path of the directory that holds the .npz files.

    A file that cannot be opened or read is reported on stdout and skipped.
    """
    npz_names = [name for name in os.listdir(directory) if name.endswith('.npz')]
    for filename in npz_names:
        filepath = os.path.join(directory, filename)
        try:
            # NOTE: allow_pickle=True lets object arrays load, but unpickling
            # can run arbitrary code, so only use this on trusted files.
            with np.load(filepath, allow_pickle=True) as archive:
                print(f"\n文件: {filename}")
                print("包含的数组:")
                for key in archive.files:
                    member = archive[key]
                    if not isinstance(member, np.ndarray):
                        print(f"- {key}: (非数组对象)")
                        continue
                    print(f"- {key}: 形状 {member.shape}, 类型 {member.dtype}")
        except Exception as e:
            print(f"无法加载文件 {filename}: {str(e)}")

In [9]:
# Usage example: list the arrays stored in every .npz file of a directory.
# output_dir = r"D:\Crop\NorthChina\processed_32"  # alternative input: replace with the actual directory
output_dir = r"./enhanced_npz"
check_npz_contents(output_dir)


文件: 130000_130100_2010_all.npz
包含的数组:
- ENTIRE: 形状 (16, 7, 32), 类型 int64
- EVI: 形状 (16, 1, 32), 类型 int64
- FPAR: 形状 (16, 1, 32), 类型 int64
- GNDVI: 形状 (16, 1, 32), 类型 int64
- LAI: 形状 (16, 1, 32), 类型 int64
- NDMI: 形状 (16, 1, 32), 类型 int64
- NDVI: 形状 (16, 1, 32), 类型 int64
- SIPI: 形状 (16, 1, 32), 类型 int64
- Temperature: 形状 (16, 2, 32), 类型 int64
- yield: 形状 (2,), 类型 float32

文件: 130000_130100_2011_all.npz
包含的数组:
- ENTIRE: 形状 (16, 7, 32), 类型 int64
- EVI: 形状 (16, 1, 32), 类型 int64
- FPAR: 形状 (16, 1, 32), 类型 int64
- GNDVI: 形状 (16, 1, 32), 类型 int64
- LAI: 形状 (16, 1, 32), 类型 int64
- NDMI: 形状 (16, 1, 32), 类型 int64
- NDVI: 形状 (16, 1, 32), 类型 int64
- SIPI: 形状 (16, 1, 32), 类型 int64
- Temperature: 形状 (16, 2, 32), 类型 int64
- yield: 形状 (2,), 类型 float32

文件: 130000_130100_2012_all.npz
包含的数组:
- ENTIRE: 形状 (16, 7, 32), 类型 int64
- EVI: 形状 (16, 1, 32), 类型 int64
- FPAR: 形状 (16, 1, 32), 类型 int64
- GNDVI: 形状 (16, 1, 32), 类型 int64
- LAI: 形状 (16, 1, 32), 类型 int64
- NDMI: 形状 (16, 1, 32), 类型 int64
- NDVI: 形状 (16, 

In [1]:
import numpy as np
import os
import re
from collections import defaultdict

def check_region_shapes(directory: str):
    """
    Check that all yearly .npz files of a region store arrays of equal shapes.

    Files are recognised by the name ``{province}_{city}_{year}_all.npz``
    (6-digit province code, 6-digit city code, 4-digit year) and grouped by
    (province, city). Within a region the files are visited in sorted name
    order; the first file that loads successfully is the baseline and every
    later file is compared against it. The first mismatch is printed and
    ends the check for that region.

    Args:
        directory: Path of the directory that holds the .npz files.

    Returns:
        A dict keyed by ``"{province}_{city}"`` with one entry per region
        whose loadable files all agree with the baseline. Each entry holds
        ``'sample_file'`` (the baseline file name), ``'shapes'`` (array name
        -> shape tuple) and ``'year_count'`` (number of files found for the
        region, including any that failed to load). Regions with a mismatch,
        and regions for which no file could be loaded, are omitted.
    """
    # File name pattern: {province code}_{city code}_{year}_all.npz
    pattern = re.compile(r'(\d{6})_(\d{6})_(\d{4})_all\.npz')

    # Group the file names by region.
    region_files = defaultdict(list)
    for filename in os.listdir(directory):
        # fullmatch: a name such as "..._all.npz.bak" must not count as data.
        match = pattern.fullmatch(filename)
        if match:
            province, city, _year = match.groups()
            region_files[(province, city)].append(filename)

    results = {}
    for (province, city), filenames in region_files.items():
        # None means "no baseline yet"; an empty dict is a valid baseline for
        # an archive that contains no arrays.
        shapes = None
        sample_file = None
        consistent = True

        # Fixed-width names, so sorting by name also sorts by year.
        for filename in sorted(filenames):
            filepath = os.path.join(directory, filename)
            try:
                # NOTE: allow_pickle=True can execute arbitrary code while
                # unpickling; only use this on trusted files.
                with np.load(filepath, allow_pickle=True) as data:
                    current_shapes = {k: v.shape for k, v in data.items()
                                      if isinstance(v, np.ndarray)}
            except Exception as e:
                print(f"处理文件 {filename} 出错: {str(e)}")
                continue

            if shapes is None:
                # First loadable file becomes the baseline.
                shapes = current_shapes
                sample_file = filename
                continue

            if current_shapes != shapes:
                print(f"警告: {filename} 形状不一致!")
                print(f"基准文件: {sample_file}")
                print(f"基准形状: {shapes}")
                print(f"当前形状: {current_shapes}")
                consistent = False
                break

        # A region is only reported when at least one file was actually
        # loaded; otherwise nothing was verified and it must not be listed
        # as consistent.
        if consistent and shapes is not None:
            results[f"{province}_{city}"] = {
                'sample_file': sample_file,
                'shapes': shapes,
                'year_count': len(filenames)
            }

    return results


In [3]:
# Usage example: run the per-region shape check on the processed data.
output_dir = r"D:\Crop\NorthChina\processed_32"
results = check_region_shapes(output_dir)

# Print a compact summary for every region that passed the check.
print("\n各地区形状一致性检查结果:")
for region, info in results.items():
    # One call with sep="\n" emits the same four header lines.
    print(
        f"\n地区: {region}",
        f"样例文件: {info['sample_file']}",
        f"年份数量: {info['year_count']}",
        "各指标形状:",
        sep="\n",
    )
    for k, v in info['shapes'].items():
        print(f"- {k}: {v}")


各地区形状一致性检查结果:

地区: 130000_130100
样例文件: 130000_130100_2010_all.npz
年份数量: 13
各指标形状:
- ENTIRE: (16, 7, 32)
- EVI: (16, 1, 32)
- FPAR: (16, 1, 32)
- GNDVI: (16, 1, 32)
- LAI: (16, 1, 32)
- NDMI: (16, 1, 32)
- NDVI: (16, 1, 32)
- SIPI: (16, 1, 32)
- Temperature: (16, 2, 32)

地区: 130000_130200
样例文件: 130000_130200_2010_all.npz
年份数量: 13
各指标形状:
- ENTIRE: (16, 7, 32)
- EVI: (16, 1, 32)
- FPAR: (16, 1, 32)
- GNDVI: (16, 1, 32)
- LAI: (16, 1, 32)
- NDMI: (16, 1, 32)
- NDVI: (16, 1, 32)
- SIPI: (16, 1, 32)
- Temperature: (16, 2, 32)

地区: 130000_130300
样例文件: 130000_130300_2010_all.npz
年份数量: 13
各指标形状:
- ENTIRE: (16, 7, 32)
- EVI: (16, 1, 32)
- FPAR: (16, 1, 32)
- GNDVI: (16, 1, 32)
- LAI: (16, 1, 32)
- NDMI: (16, 1, 32)
- NDVI: (16, 1, 32)
- SIPI: (16, 1, 32)
- Temperature: (16, 2, 32)

地区: 130000_130400
样例文件: 130000_130400_2010_all.npz
年份数量: 13
各指标形状:
- ENTIRE: (16, 7, 32)
- EVI: (16, 1, 32)
- FPAR: (16, 1, 32)
- GNDVI: (16, 1, 32)
- LAI: (16, 1, 32)
- NDMI: (16, 1, 32)
- NDVI: (16, 1, 32)
- SIPI: 

In [4]:
import numpy as np
import os
import re
from collections import defaultdict

def _diff_shapes(base_shapes, current_shapes):
    """Return {array name: (base shape, current shape)} for every array that differs.

    ``None`` stands in for the shape on the side where the array is absent.
    """
    names = list(base_shapes) + [k for k in current_shapes if k not in base_shapes]
    return {
        k: (base_shapes.get(k), current_shapes.get(k))
        for k in names
        if base_shapes.get(k) != current_shapes.get(k)
    }


def check_shape_consistency(directory: str):
    """
    Detect and report regions whose yearly .npz files disagree on array shapes.

    Files are recognised by the name ``{province}_{city}_{year}_all.npz``
    and grouped by (province, city). Within a region the files are visited
    in year order; the first file that loads successfully is the baseline
    and every later loadable file is compared against it. A summary, and the
    details of every inconsistent region, are printed to stdout.

    Args:
        directory: Path of the directory that holds the .npz files.

    Returns:
        A tuple ``(consistent_regions, inconsistent_regions)``, both dicts
        keyed by ``"{province}_{city}"``:

        - consistent_regions: ``'sample_file'``, ``'shapes'`` and
          ``'total_years'`` for regions where every loadable file matches.
        - inconsistent_regions: ``'base_file'``, ``'base_shapes'``,
          ``'inconsistent_files'`` and ``'total_years'``. Each item of
          ``'inconsistent_files'`` holds ``'year'``, ``'file'`` and
          ``'differences'`` (array name -> (baseline shape, current shape),
          with ``None`` where the array is missing on that side).

        ``'total_years'`` counts every file found for the region, including
        files that failed to load. Regions for which no file could be loaded
        appear in neither dict.
    """
    pattern = re.compile(r'(\d{6})_(\d{6})_(\d{4})_all\.npz')
    region_files = defaultdict(list)

    # 1. Group the files by region.
    for f in os.listdir(directory):
        # fullmatch: a name such as "..._all.npz.bak" must not count as data.
        if match := pattern.fullmatch(f):
            province, city, year = match.groups()
            region_files[(province, city)].append((year, f))

    consistent_regions = {}
    inconsistent_regions = {}

    # 2. Compare every file of a region with the region's baseline.
    for (province, city), files in region_files.items():
        region = f"{province}_{city}"
        base_shapes = None
        # Reset per region so a name from the previous region can never leak in.
        base_file = None
        inconsistencies = []

        for year, filename in sorted(files, key=lambda x: x[0]):  # year order
            filepath = os.path.join(directory, filename)
            try:
                # NOTE: allow_pickle=True can execute arbitrary code while
                # unpickling; only use this on trusted files.
                with np.load(filepath, allow_pickle=True) as data:
                    current_shapes = {k: v.shape for k, v in data.items()
                                      if isinstance(v, np.ndarray)}
            except Exception as e:
                print(f"⚠️ 加载失败: {filename} - {str(e)}")
                continue

            if base_shapes is None:
                base_shapes = current_shapes
                base_file = filename
                continue

            if current_shapes != base_shapes:
                inconsistencies.append({
                    'year': year,
                    'file': filename,
                    'differences': _diff_shapes(base_shapes, current_shapes)
                })

        # 3. Classify the region.
        if base_shapes is None:
            # Nothing could be loaded, so nothing was verified.
            print(f"⚠️ 地区 {region} 没有可加载的文件，已跳过")
            continue

        if inconsistencies:
            inconsistent_regions[region] = {
                'base_file': base_file,
                'base_shapes': base_shapes,
                'inconsistent_files': inconsistencies,
                'total_years': len(files)
            }
        else:
            consistent_regions[region] = {
                'sample_file': base_file,
                'shapes': base_shapes,
                'total_years': len(files)
            }

    # 4. Print the report.
    print("\n" + "="*50)
    print(f"✅ 一致性检查完成 (共检查 {len(region_files)} 个地区)")
    print(f"✔️ 一致地区: {len(consistent_regions)}")
    print(f"❌ 不一致地区: {len(inconsistent_regions)}")
    print("="*50 + "\n")

    if inconsistent_regions:
        print("⚠️ 发现不一致地区详情:")
        for region, info in inconsistent_regions.items():
            print(f"\n地区: {region} (共 {info['total_years']} 个年份)")
            print(f"基准文件: {info['base_file']}")
            print("不一致的年份文件:")
            for item in info['inconsistent_files']:
                print(f"  - {item['year']}: {item['file']}")
                for k, (base_shape, bad_shape) in item['differences'].items():
                    print(f"    {k}: 基准 {base_shape} ≠ 当前 {bad_shape}")

    return consistent_regions, inconsistent_regions


In [5]:
# Usage example: run the consistency check and keep both result dicts.
output_dir = r"D:\Crop\NorthChina\processed_32"
consistent, inconsistent = check_shape_consistency(output_dir)


✅ 一致性检查完成 (共检查 45 个地区)
✔️ 一致地区: 45
❌ 不一致地区: 0



## 产量检查

In [10]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Load the yield table.
yield_data = pd.read_excel(r"D:\Crop\NorthChina\target.xlsx")
print(f"原始产量数据记录数: {len(yield_data)}")

# Normalise both region-code columns to zero-padded 6-character strings so
# they can be compared with the codes embedded in the .npz file names.
# NOTE(review): assumes the columns are read as integers or strings; a float
# column would stringify as e.g. "130000.0" — confirm against the workbook.
for code_column in ('区位码', '省级码'):
    yield_data[code_column] = yield_data[code_column].astype(str).str.zfill(6)

原始产量数据记录数: 568


In [11]:
# Collect the .npz files to be matched against the yield table.
npz_files = [f for f in os.listdir(r"D:\Crop\NorthChina\processed_32") if f.endswith('.npz')]  # replace with the actual path

# Files with / without a yield record.
matched = []
unmatched = []

for npz_file in tqdm(npz_files, desc="匹配文件中"):
    # Expected file name: {province code}_{city code}_{year}_all.npz
    parts = npz_file.split('_')
    if len(parts) != 4 or not parts[2].isdecimal():
        # Not named like a data file. Without the isdecimal() guard a name
        # such as "a_b_c_d.npz" would make int(year) raise and abort the loop.
        continue

    province_code, city_code, year = parts[0], parts[1], parts[2]

    # Look up the yield record of this region and year.
    match = yield_data[
        (yield_data['省级码'] == province_code) &
        (yield_data['区位码'] == city_code) &
        (yield_data['年份'] == int(year))
    ]

    if not match.empty:
        # If several rows match, the first one is used.
        matched.append({
            'npz_file': npz_file,
            'yield_data': match.iloc[0].to_dict(),
            'year': year,
            'province': province_code,
            'city': city_code
        })
    else:
        unmatched.append(npz_file)

# Print the matching summary.
print(f"\n匹配成功: {len(matched)} 个文件")
print(f"未匹配: {len(unmatched)} 个文件")

if unmatched:
    print("\n未匹配的文件列表:")
    for f in unmatched[:10]:  # show only the first 10 unmatched files
        print(f)
    if len(unmatched) > 10:
        print(f"...(共 {len(unmatched)} 个未匹配文件)")


匹配文件中: 100%|██████████| 585/585 [00:00<00:00, 2239.04it/s]


匹配成功: 559 个文件
未匹配: 26 个文件

未匹配的文件列表:
130000_130700_2010_all.npz
130000_130700_2011_all.npz
130000_130700_2012_all.npz
130000_130700_2013_all.npz
130000_130700_2014_all.npz
130000_130700_2015_all.npz
130000_130700_2016_all.npz
130000_130700_2017_all.npz
130000_130700_2018_all.npz
130000_130700_2019_all.npz
...(共 26 个未匹配文件)





In [12]:
# Directory that receives the yield-augmented copies of the .npz files.
output_dir = "enhanced_npz"
os.makedirs(output_dir, exist_ok=True)

for item in tqdm(matched, desc="处理匹配文件"):
    npz_path = os.path.join(r"D:\Crop\NorthChina\processed_32", item['npz_file'])  # replace with the actual path
    output_path = os.path.join(output_dir, item['npz_file'])

    try:
        # Read every array of the source archive into memory.
        with np.load(npz_path, allow_pickle=True) as data:
            npz_data = {key: data[key] for key in data.files}

        # Attach the two yield columns of the matched record as one float32
        # array: [yield per unit area, total production].
        record = item['yield_data']
        yield_values = [
            record['单位面积产量（公斤/公顷）'],
            record['总产量（万吨）'],
        ]
        npz_data['yield'] = np.array(yield_values, dtype=np.float32)

        # Write the augmented archive under the same file name.
        np.savez_compressed(output_path, **npz_data)

    except Exception as e:
        print(f"处理文件 {item['npz_file']} 出错: {str(e)}")


处理匹配文件: 100%|██████████| 559/559 [00:02<00:00, 209.89it/s]


In [13]:
# Build the region/year match table: one row per matched .npz file.
yield_rows = [m['yield_data'] for m in matched]
match_matrix = pd.DataFrame({
    'NPZ文件': [m['npz_file'] for m in matched],
    '省份': [row['省份'] for row in yield_rows],
    '城市': [row['市'] for row in yield_rows],
    '年份': [m['year'] for m in matched],
    '单产': [row['单位面积产量（公斤/公顷）'] for row in yield_rows],
    '总产': [row['总产量（万吨）'] for row in yield_rows]
})

# Save the match report.
match_matrix.to_excel("匹配报告.xlsx", index=False)
print("\n已生成匹配报告: 匹配报告.xlsx")

# Summarise which regions and years have no yield record.
if unmatched:
    # Split each file name once; the first three fields are the province
    # code, city code and year.
    unmatched_df = pd.DataFrame([
        {
            '文件': fname,
            '省份码': fields[0],
            '城市码': fields[1],
            '年份': fields[2]
        }
        for fname in unmatched
        for fields in (fname.split('_'),)
    ])

    per_key_counts = unmatched_df.groupby(['省份码','城市码', '年份']).size()
    unmatched_stats = per_key_counts.reset_index(name='缺失数量')
    print("\n缺失数据统计:")
    print(unmatched_stats.to_string(index=False))



已生成匹配报告: 匹配报告.xlsx

缺失数据统计:
   省份码    城市码   年份  缺失数量
130000 130700 2010     1
130000 130700 2011     1
130000 130700 2012     1
130000 130700 2013     1
130000 130700 2014     1
130000 130700 2015     1
130000 130700 2016     1
130000 130700 2017     1
130000 130700 2018     1
130000 130700 2019     1
130000 130700 2020     1
130000 130700 2021     1
130000 130700 2022     1
130000 130800 2010     1
130000 130800 2011     1
130000 130800 2012     1
130000 130800 2013     1
130000 130800 2014     1
130000 130800 2015     1
130000 130800 2016     1
130000 130800 2017     1
130000 130800 2018     1
130000 130800 2019     1
130000 130800 2020     1
130000 130800 2021     1
130000 130800 2022     1
