In [30]:
import pandas as pd
import json
import ast
import glob
import os

In [31]:
folder_path = 'Pneuf_NLCD_3P'

# Per-file pixel-count exports. glob.glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make the concatenation order
# reproducible from run to run.
csv_files = sorted(glob.glob(os.path.join(folder_path, 'NLCD_pixel_counts_3P_*.csv')))  # adjust path/pattern
if not csv_files:
    # Without this check pd.concat([]) fails with an opaque
    # "No objects to concatenate" that does not mention the path.
    raise FileNotFoundError(
        f"No files matching 'NLCD_pixel_counts_3P_*.csv' found in {folder_path!r}"
    )

# Load every file and stack them into one table ordered by Year, then UID.
df_list = [pd.read_csv(f) for f in csv_files]
df = (
    pd.concat(df_list, ignore_index=True)
    .sort_values(by=['Year', 'UID'])
    .reset_index(drop=True)
)

In [32]:
# Preview the combined table (rows sorted by Year, then UID).
df

Unnamed: 0,Year,UID,histogram
0,2000,101,"{31=189.0, 95=11092.580392156862, 11=4740.0, 5..."
1,2000,102,"{31=79.0, 11=931.0, 95=21751.4, 52=1052019.690..."
2,2000,103,"{31=50.96078431372549, 11=65028.55294117641, 9..."
3,2001,101,"{31=119.0, 95=11006.580392156862, 11=1429.0, 5..."
4,2001,102,"{31=70.0, 11=855.0, 95=21733.4, 52=1055236.670..."
...,...,...,...
70,2023,102,"{52=1032486.8235294108, 95=23330.4, 11=1151.0,..."
71,2023,103,"{52=639767.4666666675, 95=29710.713725490197, ..."
72,2024,101,"{31=172.0, 11=4182.0, 95=10717.580392156862, 5..."
73,2024,102,"{31=67.0, 95=22920.4, 11=1128.0, 52=1026503.03..."


In [33]:
def parse_histogram(s):
    """Parse a histogram string like ``"{31=189.0, 11=4740.0}"`` into a dict.

    The text uses ``key=value`` pairs inside braces; every ``=`` is swapped
    for ``:`` so the result is a Python dict literal that
    ``ast.literal_eval`` can read without executing arbitrary code.

    Parameters
    ----------
    s : str, dict, or missing value
        Raw histogram text. A dict is returned as a shallow copy, so calling
        the function on already-parsed data is harmless.

    Returns
    -------
    dict
        Mapping of class code to pixel count; ``{}`` for missing values and
        for empty or whitespace-only strings.

    Raises
    ------
    TypeError
        If ``s`` is neither a string, a dict, nor a missing scalar.
    ValueError
        If the text is a valid literal but not a dict. ``ast.literal_eval``
        itself raises ``ValueError``/``SyntaxError`` on malformed text.
    """
    if isinstance(s, dict):
        return dict(s)
    if not isinstance(s, str):
        # is_scalar first: pd.isna on a list/array returns an array, whose
        # truth value is ambiguous.
        if pd.api.types.is_scalar(s) and pd.isna(s):
            return {}
        raise TypeError(f"expected a histogram string, got {type(s).__name__}")

    text = s.strip()
    if text == '':
        return {}

    parsed = ast.literal_eval(text.replace('=', ':'))
    if not isinstance(parsed, dict):
        # e.g. "5" is a valid literal but would break the later expansion
        # of histograms into columns.
        raise ValueError(f"histogram text did not parse to a dict: {s!r}")
    return parsed

# Parse every raw histogram string into a dict, stored alongside the original text.
df['histogram_dict'] = df['histogram'].map(parse_histogram)

In [34]:
# Re-display the table to check the parsed histogram_dict column against the raw strings.
df

Unnamed: 0,Year,UID,histogram,histogram_dict
0,2000,101,"{31=189.0, 95=11092.580392156862, 11=4740.0, 5...","{31: 189.0, 95: 11092.580392156862, 11: 4740.0..."
1,2000,102,"{31=79.0, 11=931.0, 95=21751.4, 52=1052019.690...","{31: 79.0, 11: 931.0, 95: 21751.4, 52: 1052019..."
2,2000,103,"{31=50.96078431372549, 11=65028.55294117641, 9...","{31: 50.96078431372549, 11: 65028.55294117641,..."
3,2001,101,"{31=119.0, 95=11006.580392156862, 11=1429.0, 5...","{31: 119.0, 95: 11006.580392156862, 11: 1429.0..."
4,2001,102,"{31=70.0, 11=855.0, 95=21733.4, 52=1055236.670...","{31: 70.0, 11: 855.0, 95: 21733.4, 52: 1055236..."
...,...,...,...,...
70,2023,102,"{52=1032486.8235294108, 95=23330.4, 11=1151.0,...","{52: 1032486.8235294108, 95: 23330.4, 11: 1151..."
71,2023,103,"{52=639767.4666666675, 95=29710.713725490197, ...","{52: 639767.4666666675, 95: 29710.713725490197..."
72,2024,101,"{31=172.0, 11=4182.0, 95=10717.580392156862, 5...","{31: 172.0, 11: 4182.0, 95: 10717.580392156862..."
73,2024,102,"{31=67.0, 95=22920.4, 11=1128.0, 52=1026503.03...","{31: 67.0, 95: 22920.4, 11: 1128.0, 52: 102650..."


In [35]:
# Expand the parsed histograms into a table with one column per class code;
# a code missing from a row's histogram is left as NaN in that row.
hist_df = pd.json_normalize(df['histogram_dict'])

In [36]:
# Lookup table: integer land-cover class code -> human-readable class name.
land_cover_dict = dict((
    # water / ice
    (11, "Open Water"),
    (12, "Perennial Ice/Snow"),
    # developed
    (21, "Developed, Open Space"),
    (22, "Developed, Low Intensity"),
    (23, "Developed, Medium Intensity"),
    (24, "Developed, High Intensity"),
    # barren
    (31, "Barren Land"),
    # forest
    (41, "Deciduous Forest"),
    (42, "Evergreen Forest"),
    (43, "Mixed Forest"),
    # shrub / grass
    (52, "Shrub/Scrub"),
    (71, "Grassland/Herbaceous"),
    # agriculture
    (81, "Pasture/Hay"),
    (82, "Cultivated Crops"),
    # wetlands
    (90, "Woody Wetlands"),
    (95, "Emergent Herbaceous Wetlands"),
))

In [37]:
def safe_rename(col, names=None):
    """Translate a class-code column label into a readable column name.

    Parameters
    ----------
    col : hashable
        Column label, normally an integer class code. A string holding an
        integer (e.g. ``"11"``) is also recognised, so the lookup still works
        if the codes arrive as text.
    names : dict, optional
        Mapping of integer class code to class name. Defaults to the
        module-level ``land_cover_dict``.

    Returns
    -------
    str
        The class name with spaces replaced by underscores, or ``str(col)``
        when the code is unknown. Labels that were already renamed are
        returned unchanged, so applying the function twice is harmless.
    """
    if names is None:
        names = land_cover_dict

    code = col
    if code not in names and isinstance(code, str):
        # Codes read back from text are strings; retry the lookup as an int.
        try:
            code = int(code)
        except ValueError:
            pass

    if code in names:
        return names[code].replace(" ", "_")
    return f'{col}'

# Swap class-code column labels for readable names; unknown codes become their string form.
hist_df.rename(columns=safe_rename, inplace=True)

In [38]:
# Inspect the expanded histogram table after the columns were renamed.
hist_df

Unnamed: 0,Barren_Land,Emergent_Herbaceous_Wetlands,Open_Water,Shrub/Scrub,Grassland/Herbaceous,"Developed,_High_Intensity",Woody_Wetlands,Deciduous_Forest,Cultivated_Crops,Pasture/Hay,"Developed,_Medium_Intensity",Evergreen_Forest,"Developed,_Low_Intensity","Developed,_Open_Space"
0,189.000000,11092.580392,4740.000000,9.924971e+05,8921.796078,27.000000,9761.203922,50182.027451,243807.356863,32517.360784,539.000000,273596.745098,9680.694118,19758.564706
1,79.000000,21751.400000,931.000000,1.052020e+06,38246.658824,107.000000,4694.654902,35274.807843,249644.843137,38390.015686,3303.733333,196700.992157,20688.584314,28339.572549
2,50.960784,30151.058824,65028.552941,6.154461e+05,133696.196078,7041.109804,65919.768627,16739.525490,342720.521569,8381.482353,21356.956863,93580.188235,50385.658824,28505.215686
3,119.000000,11006.580392,1429.000000,9.944837e+05,10714.505882,28.000000,9739.203922,50149.027451,242342.356863,33850.360784,538.000000,273394.439216,9697.694118,19818.564706
4,70.000000,21733.400000,855.000000,1.055237e+06,47975.996078,107.000000,4695.654902,35353.807843,236265.650980,38301.015686,3303.733333,197106.866667,20725.584314,28441.572549
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,68.000000,23330.400000,1151.000000,1.032487e+06,24384.101961,164.000000,4652.564706,46666.278431,231776.043137,37377.266667,4118.647059,227107.054902,26057.388235,30832.384314
71,559.000000,29710.713725,60496.078431,6.397675e+05,135724.686275,7624.266667,69173.200000,23188.756863,276959.050980,9959.086275,28579.796078,105654.388235,65802.662745,25804.141176
72,172.000000,10717.580392,4182.000000,8.966345e+05,17440.054902,44.000000,9866.203922,60427.360784,269109.690196,33577.274510,796.901961,318962.745098,13761.066667,21619.117647
73,67.000000,22920.400000,1128.000000,1.026503e+06,26772.698039,166.000000,4637.564706,48317.341176,229952.043137,38123.266667,4147.647059,230005.337255,26183.388235,31248.227451


In [10]:
# The merges below pair rows purely by index label, so hist_df must line up
# row-for-row with df. If hist_df is left over from an earlier run on different
# data, concat would silently NaN-pad the mismatched rows instead of failing.
if not hist_df.index.equals(df.index):
    raise ValueError(
        f"hist_df ({len(hist_df)} rows) is not aligned with df ({len(df)} rows); "
        "re-run the cells that build hist_df"
    )

# Total pixels per row (sum skips NaN cells). A total of 0 is replaced by 1
# so the division below cannot divide by zero.
total_pixels = hist_df.sum(axis=1).replace(0, 1)  # avoid division by zero

# Share of each land-cover class within its row, in percent.
percent_df = hist_df.div(total_pixels, axis=0).multiply(100)

# Attach the Year/UID identifiers to the counts and to the percentages.
final_pix_df = pd.concat([df[['Year', 'UID']], hist_df], axis=1)
final_pcent_df = pd.concat([df[['Year', 'UID']], percent_df], axis=1)

In [11]:
# Show the per-row pixel totals used as the percentage denominator.
total_pixels

0     1.657310e+06
1     1.690172e+06
2     1.479003e+06
3     1.657310e+06
4     1.690172e+06
          ...     
70    1.690172e+06
71    1.479003e+06
72    1.657310e+06
73    1.690172e+06
74    1.479003e+06
Length: 75, dtype: float64

In [39]:
# Write the percentage table and the raw pixel-count table to CSV, omitting the index.
for out_name, table in (
    ('NLCD_3P_percent.csv', final_pcent_df),
    ('NLCD_3P_pixel_count.csv', final_pix_df),
):
    table.to_csv(out_name, index=False)