In [24]:
# Tried using Dit, but execution was too slow
# !apt-get install -y libgmp-dev libmpfr-dev libmpc-dev
# !pip install --upgrade pip setuptools wheel
# !pip install "pycddlib==2.1.4"
# !pip install "pypoman==1.0.0"
# !pip install "dit==1.5"
!pip install pyinform



In [25]:
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('PS')  # Ensure PostScript backend
import matplotlib.pyplot as plt
from scipy.stats import entropy

# Bladder

In [26]:
# Load data
X = np.load("data/bladder_features.npy")
y = np.load("data/bladder_target.npy")
index = np.load("data/bladder_index.npy")
feature_names = pd.read_csv("data/bladder_features.csv")["Feature"].tolist()

# Flatten feature array
n, t, p = X.shape
X_flat = X.reshape(n, t * p)
flattened_labels = [f"{feature}_t{ts}" for ts in range(t) for feature in feature_names]

# Create DataFrame
bladder_df = pd.DataFrame(X_flat, columns=flattened_labels)
bladder_df['fips'] = index[:, 0]
bladder_df['year'] = index[:, 1].astype(int)
bladder_df['target'] = y

# Preprocess data
exclude_cols = ['fips', 'year', 'target'] # exclude metadata columns
feature_columns = [col for col in bladder_df.columns if col not in exclude_cols]

# Display basic info
print(bladder_df.shape)
print(bladder_df.head())
print(bladder_df.columns)

(1056, 3699)
   Ozone_x_t0   SO2_t0   NO2_t0  Temperature_t0  Styrene_t0    CO_t0  \
0     15915.0  29614.0  22185.0             0.0         0.0  44210.0   
1     20949.0  26530.0  26096.0          4529.0         0.0  51647.0   
2     24638.0  22923.0  33594.0          8708.0         0.0  50282.0   
3     21637.0  17312.0  30272.0          8737.0         0.0  43298.0   
4     18483.0  11537.0  25883.0          8739.0         0.0  34732.0   

   n-Hexane_t0  PM10 Mass_t0  Methanol_t0  Ammonia_t0  ...  \
0          0.0         214.0          0.0         0.0  ...   
1          0.0         230.0          0.0         0.0  ...   
2          0.0         183.0          0.0         0.0  ...   
3          0.0         172.0          0.0         0.0  ...   
4          0.0         152.0          0.0         0.0  ...   

   Dazomet, sodium salt_t5  2,4-DP_t5  Vinyl fluoride_t5  \
0                      0.0        0.0                0.0   
1                      0.0        0.0                0.0   
2

In [27]:
# Calculate entropy for each column
entropy_dict = {}

for col in feature_columns:
    counts = bladder_df[col].value_counts(normalize=True)
    probabilities = counts / counts.sum()
    col_entropy = entropy(probabilities, base=2)
    entropy_dict[col] = col_entropy

# Convert to pandas Series for easy sorting
entropy_series = pd.Series(entropy_dict)

# Top k columns with highest & lowest entropy
k = 5
top_k_high_entropy = entropy_series.sort_values(ascending=False).head(k)
top_k_low_entropy = entropy_series[entropy_series > 0].sort_values(ascending=True).head(k)

print(f"Top {k} columns with highest entropy:")
print(top_k_high_entropy)

print(f"\nTop {k} columns with lowest entropy:")
print(top_k_low_entropy)

Top 5 columns with highest entropy:
Ozone_x_t3    8.965128
Ozone_x_t1    8.964627
Ozone_x_t5    8.964627
SO2_t5        8.952108
Ozone_x_t4    8.949512
dtype: float64

Top 5 columns with lowest entropy:
1,2-Dichloro-1,1,3,3,3-pentafluoropropane (HCFC-225da)_t4    0.010877
Phenytoin_t1                                                 0.010877
Manganese  And Manganese Compounds_t5                        0.010877
Dichlorophene_t1                                             0.010877
Michler's ketone_t5                                          0.010877
dtype: float64


In [28]:
# Choose most populated non-metadata columns for KL Divergence:
column_counts = bladder_df.notna().sum().sort_values(ascending=False) # get and sort the count of non-Nan values for each column
chemical_columns = [col for col in column_counts.index if col not in exclude_cols] # filter columns to exclude metadata columns
top_5 = chemical_columns[:5]
# print(top_5)
years = sorted(bladder_df['year'].unique()) # sort years for KL Divergence

# Store KL divergences
kl_divergences = {chemical: [] for chemical in top_5}
year_labels = [f"{years[i]}-{years[i+1]}" for i in range(len(years)-1)]

# Compute KL Divergence between distributions of consecutive years for each chemical
for chemical in top_5:
    print(f'\nKL Divergence for: {chemical}')

    # Prepare distributions per year
    year_distributions = {}
    for year in years:
        data = bladder_df[bladder_df['year'] == year][chemical].dropna() + 1e-9 # add small constant to avoid zeros

        # Bin the data to create a probability distribution (histogram)
        counts, bin_edges = np.histogram(data, bins=50, density=True)
        counts = counts + 1e-9  # avoid zeros
        year_distributions[year] = counts / counts.sum()  # normalize

    # Compare each pair of years consecutively
    for i in range(len(years) - 1):
        year1 = years[i]
        year2 = years[i + 1]

        p = year_distributions[year1]
        q = year_distributions[year2]

        kl_div = entropy(p, q)
        kl_divergences[chemical].append(kl_div)
        print(f'KL({year1} || {year2}): {kl_div:.4f}')

# Plot KL Divergence Trends
plt.figure(figsize=(12, 6))

for chemical in top_5:
    plt.plot(year_labels, kl_divergences[chemical], marker='o', label=chemical)

plt.title('KL Divergence Trends')
plt.xlabel('Year Pair')
plt.xticks(rotation=45, ha='right')
plt.ylabel('KL Divergence')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Save figure in eps format for paper
plt.savefig('bladder_KL_Divergence.eps', format='eps', bbox_inches='tight', dpi=300)


KL Divergence for: Ozone_x_t0
KL(1994 || 1995): 5.8941
KL(1995 || 1996): 7.2560
KL(1996 || 1997): 3.7074
KL(1997 || 1998): 5.2175
KL(1998 || 1999): 4.7365
KL(1999 || 2000): 4.3601
KL(2000 || 2001): 4.5102
KL(2001 || 2002): 2.3032
KL(2002 || 2003): 3.1221
KL(2003 || 2004): 3.8989
KL(2004 || 2005): 1.2280
KL(2005 || 2006): 1.5838
KL(2006 || 2007): 1.1836
KL(2007 || 2008): 0.9926
KL(2008 || 2009): 1.7777
KL(2009 || 2010): 0.9683
KL(2010 || 2011): 1.7218
KL(2011 || 2012): 1.3319
KL(2012 || 2013): 1.7099
KL(2013 || 2014): 1.7925
KL(2014 || 2015): 1.6612

KL Divergence for: Dazomet, sodium salt_t3
KL(1994 || 1995): 0.0000
KL(1995 || 1996): 0.0000
KL(1996 || 1997): 0.0000
KL(1997 || 1998): 0.0000
KL(1998 || 1999): 19.1138
KL(1999 || 2000): 0.0000
KL(2000 || 2001): 24.5431
KL(2001 || 2002): 0.0000
KL(2002 || 2003): 0.0000
KL(2003 || 2004): 0.0000
KL(2004 || 2005): 0.0000
KL(2005 || 2006): 0.0000
KL(2006 || 2007): 0.0000
KL(2007 || 2008): 0.0000
KL(2008 || 2009): 0.0000
KL(2009 || 2010): 0.000

  plt.show()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


# Lung

In [29]:
# Load data
X = np.load("data/lung_features.npy")
y = np.load("data/lung_target.npy")      
index = np.load("data/lung_index.npy")
feature_names = pd.read_csv("data/lung_features.csv")["Feature"].tolist()

# Flatten feature array
n, t, p = X.shape
X_flat = X.reshape(n, t * p)
flattened_labels = [f"{feature}_t{ts}" for ts in range(t) for feature in feature_names]

# Create DataFrame
lung_df = pd.DataFrame(X_flat, columns=flattened_labels)
lung_df['fips'] = index[:, 0]
lung_df['year'] = index[:, 1].astype(int)
lung_df['target'] = y

# Preprocess data
exclude_cols = ['fips', 'year', 'target'] # exclude metadata columns
feature_columns = [col for col in lung_df.columns if col not in exclude_cols]

# Display basic info
print(lung_df.shape)
print(lung_df.head())
print(lung_df.columns)

(932, 7283)
   Barometric Pressure_t0    CO_t0  HAPs_t0  Lead_x_t0   NO2_t0  NONOxNOy_t0  \
0                     0.0  15436.0    835.0       48.0  14032.0      13973.0   
1                     0.0  31947.0   1276.0      193.0  15522.0      19992.0   
2                     0.0  34000.0   1137.0      178.0  16874.0      27036.0   
3                     0.0  34916.0    791.0      182.0  17099.0      33412.0   
4                     0.0  44210.0    786.0      180.0  22185.0      44003.0   

   Ozone_x_t0  PM10 Mass_t0  PM10 Speciation_t0  PM2.5 FRM-FEM Mass_t0  ...  \
0      7976.0          56.0                77.0                    0.0  ...   
1     10422.0         147.0                61.0                    0.0  ...   
2     12119.0         167.0                80.0                    0.0  ...   
3     12261.0         178.0                92.0                    0.0  ...   
4     15915.0         214.0               102.0                    0.0  ...   

   Perfluorobutanoic acid_t9  \


In [30]:
# Calculate entropy for each column
entropy_dict = {}

for col in feature_columns:
    counts = lung_df[col].value_counts(normalize=True)
    probabilities = counts / counts.sum()
    col_entropy = entropy(probabilities, base=2)
    entropy_dict[col] = col_entropy

# Convert to pandas Series for easy sorting
entropy_series = pd.Series(entropy_dict)

# Top k columns with highest & lowest entropy
k = 5
top_k_high_entropy = entropy_series.sort_values(ascending=False).head(k)
top_k_low_entropy = entropy_series[entropy_series > 0].sort_values(ascending=True).head(k)

print(f"Top {k} columns with highest entropy:")
print(top_k_high_entropy)

print(f"\nTop {k} columns with lowest entropy:")
print(top_k_low_entropy)

Top 5 columns with highest entropy:
Ozone_x_t3    8.723965
Ozone_x_t7    8.721819
Ozone_x_t5    8.721778
Ozone_x_t9    8.715340
Ozone_x_t1    8.713194
dtype: float64

Top 5 columns with lowest entropy:
Phenytoin_t7                               0.012131
2,2-Bis(bromomethyl)-1,3-propanediol_t7    0.012131
Phenytoin_t6                               0.012131
Chloramben_t0                              0.012131
Nabam_t8                                   0.012131
dtype: float64


In [31]:
# Choose most populated non-metadata columns for KL Divergence:
column_counts = lung_df.notna().sum().sort_values(ascending=False) # get and sort the count of non-Nan values for each column
exclude_cols = ['geo_code', 'year', 'county_count'] # exclude metadata columns
chemical_columns = [col for col in column_counts.index if col not in exclude_cols] # filter columns to exclude metadata columns
top_5 = chemical_columns[:5]
# print(top_5)
years = sorted(lung_df['year'].unique()) # sort years for KL Divergence

# Store KL divergences
kl_divergences = {chemical: [] for chemical in top_5}
year_labels = [f"{years[i]}-{years[i+1]}" for i in range(len(years)-1)]

# For each chemical, compute KL Divergence between distributions of consecutive years
for chemical in top_5:
    print(f'\nKL Divergence for: {chemical}')

    # Prepare distributions per year
    year_distributions = {}
    for year in years:
        data = lung_df[lung_df['year'] == year][chemical].dropna() + 1e-9 # add small constant to avoid zeros

        # Bin the data to create a probability distribution (histogram)
        counts, bin_edges = np.histogram(data, bins=50, density=True)
        counts = counts + 1e-9  # avoid zeros
        year_distributions[year] = counts / counts.sum()  # normalize

    # Compare each pair of years consecutively
    for i in range(len(years) - 1):
        year1 = years[i]
        year2 = years[i + 1]

        p = year_distributions[year1]
        q = year_distributions[year2]

        kl_div = entropy(p, q)
        kl_divergences[chemical].append(kl_div)
        print(f'KL({year1} || {year2}): {kl_div:.4f}')

# Plot KL Divergence Trends
plt.figure(figsize=(12, 6))

for chemical in top_5:
    plt.plot(year_labels, kl_divergences[chemical], marker='o', label=chemical)

plt.title('KL Divergence Trends')
plt.xlabel('Year Pair')
plt.xticks(rotation=45, ha='right')
plt.ylabel('KL Divergence')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Save figure in eps format for paper
plt.savefig('lung_KL_Divergence.eps', format='eps', bbox_inches='tight', dpi=300)


KL Divergence for: Barometric Pressure_t0
KL(1991 || 1992): 13.8488
KL(1992 || 1993): 0.1467
KL(1993 || 1994): 1.4240
KL(1994 || 1995): 1.0424
KL(1995 || 1996): 1.6069
KL(1996 || 1997): 1.6906
KL(1997 || 1998): 2.2895
KL(1998 || 1999): 0.3039
KL(1999 || 2000): 2.4849
KL(2000 || 2001): 0.7194
KL(2001 || 2002): 1.5817
KL(2002 || 2003): 0.5984
KL(2003 || 2004): 0.9324
KL(2004 || 2005): 0.8611
KL(2005 || 2006): 0.9156
KL(2006 || 2007): 0.5948
KL(2007 || 2008): 0.9758
KL(2008 || 2009): 0.9457
KL(2009 || 2010): 1.0386
KL(2010 || 2011): 0.8048

KL Divergence for: N-Methylolacrylamide_t6
KL(1991 || 1992): 0.0000
KL(1992 || 1993): 0.0000
KL(1993 || 1994): 0.0000
KL(1994 || 1995): 0.0000
KL(1995 || 1996): 22.0703
KL(1996 || 1997): 0.0231
KL(1997 || 1998): 0.1182
KL(1998 || 1999): 1.7171
KL(1999 || 2000): 1.0205
KL(2000 || 2001): 1.2041
KL(2001 || 2002): 1.0225
KL(2002 || 2003): 1.7324
KL(2003 || 2004): 1.1620
KL(2004 || 2005): 0.7292
KL(2005 || 2006): 0.9031
KL(2006 || 2007): 1.2277
KL(2007 || 

  plt.show()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
