In [1]:
import pandas as pd
import json
import ast
import glob
import os
from itables import show

In [3]:
folder_path = 'Pneuf_CDL'

# List of CSV files
csv_files = glob.glob(os.path.join(folder_path,'CDL_pixel_counts_HUC12_*.csv'))  # adjust path/pattern

# Load and combine
df_list = [pd.read_csv(f) for f in csv_files]
df = pd.concat(df_list, ignore_index=True)

In [4]:
with open('cdl_code_map.json', 'r') as f:
    cdl_code_map = json.load(f)

In [5]:
def parse_histogram(s):
    if pd.isna(s) or s.strip() == '':
        return {}
    s_fixed = s.replace('=', ':')
    return ast.literal_eval(s_fixed)

df['histogram_dict'] = df['histogram'].apply(parse_histogram)

In [6]:
hist_df = pd.json_normalize(df['histogram_dict'])
hist_df.columns = [col for col in hist_df.columns]

In [7]:
def safe_rename(col):
    code = str(col)
    if code in cdl_code_map:
        return f'{cdl_code_map[code].replace(" ", "_")}'
    else:
        return f'{code}'

hist_df.rename(columns=lambda c: safe_rename(c), inplace=True)

In [9]:
# hist_df.fillna(0, inplace=True)
# compute total pixels per row
total_pixels = hist_df.sum(axis=1).replace(0, 1)  # avoid division by zero

# compute percentage per crop
percent_df = hist_df.div(total_pixels, axis=0).multiply(100)

# merge counts + percentages
final_pix_df = pd.concat([df[['Year', 'huc12']], hist_df], axis=1)
final_pcent_df = pd.concat([df[['Year', 'huc12']], percent_df], axis=1)

In [10]:
final_pcent_df.sort_values('Year')

Unnamed: 0,Year,huc12,Safflower,Open_Water,Other_Hay/Non_Alfalfa,Deciduous_Forest,Oats,Fallow/Idle_Cropland,Sod/Grass_Seed,Winter_Wheat,...,Buckwheat,Dbl_Crop_Barley/Corn,Radishes,Cherries,Camelina,Soybeans,Nonag/Undefined,Water,Developed,Clouds/No_Data
873,2005,170402080404,,,,,0.000851,11.435455,,1.027731,...,,,,,,,0.146414,,1.745545,
847,2005,170402080502,,,,,,9.418411,,4.378855,...,,,,,,,0.078423,,1.584811,
846,2005,170402080501,,,,,0.002152,16.072317,,0.920079,...,,,,,,,0.069261,,0.740752,
845,2005,170402080206,,,,,0.012949,5.411643,,0.468678,...,,,,,,,0.136264,,1.126078,0.023544
844,2005,170402080103,,,,,,8.875553,,0.235431,...,,,,,,,0.016684,,0.181320,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,2023,170402080203,0.031864,0.002655,2.867771,2.105365,0.015932,1.592873,,1.627820,...,,,,,,,,,,
717,2023,170402080201,0.074855,0.003119,0.166374,2.760724,0.030290,4.263626,0.038987,0.547876,...,,,,,,,,,,
716,2023,170402080306,2.037586,0.000401,2.538664,0.243081,0.111769,2.184821,0.009512,3.268955,...,,,,,,,,,,
714,2023,170402080604,0.013496,,0.022280,0.001311,,0.137537,,11.049987,...,,,,,,,,,,


In [11]:
final_pcent_df.sort_values('Year', inplace=True)
final_pix_df.sort_values('Year', inplace=True)

In [None]:
# final_pcent_df.to_csv('CDL_percent.csv', index=False)
# final_pix_df.to_csv('CDL_pixel_count.csv', index=False)