In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Widen printed output to 200 characters so wide DataFrames stay on one line
pd.set_option('display.width', 200)

# Centre the column headers and show full cell contents without truncation.
# Each option is set in its own call using its full name: abbreviated names
# are resolved by pattern matching and can stop working if pandas later adds
# another option with a similar name.
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.max_colwidth', None)

# 1.9.25: after reviewing some of the general sentiment in ecology, it seems there is a popular index (the
# Shannon index) that is used to measure biodiversity (interestingly, it is derived from an entropy equation).
# Scrapping previous work in favour of pursuing this. Original work saved in a separate Python file.

In [26]:
# Raw inputs for the analysis: the observations table and the species table,
# each read from a CSV file in the working directory.
obs_df, spec_df = (
    pd.read_csv(csv_path)
    for csv_path in ("observations.csv", "species_info.csv")
)

#  double-check/clean
def initial_check(df):
    """Print a quick sanity report for a DataFrame.

    Prints, in order: the per-column ``count()``, the first three rows,
    the ``describe()`` summary, and the column dtypes.

    Args:
        df (pd.DataFrame): The frame to inspect.

    Returns:
        None. The report is written to stdout only.
    """
    # Each entry is evaluated only when its turn comes, so the reports are
    # computed and printed one after another.
    report_builders = (
        df.count,
        lambda: df.head(3),
        df.describe,
        lambda: df.dtypes,
    )
    for build_report in report_builders:
        print(build_report())
# Remove exact duplicate rows from both raw tables before combining them
obs_df = obs_df.drop_duplicates()
spec_df = spec_df.drop_duplicates()

# Attach category and conservation status to every observation (inner join on
# scientific_name) and order the rows by category.
merged = (
    pd.merge(
        obs_df,
        spec_df[['scientific_name', 'category', 'conservation_status']],
        on='scientific_name',
    )
    .sort_values(by='category', ascending=True)
)

# Easier-to-reference park list: strip 'National Park' so names look slightly
# better on charts. (The per-park view was later dropped during exploration.)
# regex=False makes the literal match explicit; the default differs across
# pandas versions.
merged['park_name'] = merged['park_name'].str.replace('National Park', '', regex=False).str.strip()

# Drop rows that are identical across all columns, presumably produced when
# species_info lists the same scientific_name more than once (TODO confirm).
# The index is renumbered only after this step so it stays contiguous (0..n-1).
merged = merged.drop_duplicates().reset_index(drop=True)
park_l = merged.park_name.unique()
print(merged.head())

         scientific_name         park_name    observations  category  conservation_status
0     Gastrophryne carolinensis  Yellowstone       235      Amphibian          NaN       
1  Desmognathus quadramaculatus  Yellowstone       255      Amphibian          NaN       
2       Batrachoseps diabolicus     Yosemite       155      Amphibian          NaN       
3       Batrachoseps diabolicus        Bryce        80      Amphibian          NaN       
4       Batrachoseps diabolicus  Yellowstone       282      Amphibian          NaN       


In [67]:
# One row per (park, category): total observations, number of rows that have a
# non-null conservation status, and number of distinct species.
index_df = (
    merged.groupby(['park_name', 'category'])
    .agg(
        observation_sum=('observations', 'sum'),
        conserv_stat_count=('conservation_status', 'count'),
        unique_spec=('scientific_name', 'nunique'),
    )
    .reset_index()
)

# Total observations per park, derived from index_df itself so this cell does
# not depend on a park list built in another cell.
total_observations_per_park = index_df.groupby('park_name')['observation_sum'].sum().to_dict()
park_map = index_df['park_name'].map(total_observations_per_park)

# Share of each park's observations that falls in the given category
index_df['pop_proportion'] = index_df['observation_sum'] / park_map

# The previous formula, (observation_sum * pop_proportion) / pop_proportion,
# cancels to observation_sum but could introduce float rounding and NaN when
# the proportion is 0, so the column is assigned directly as floats.
# NOTE(review): weights therefore equal observation_sum; confirm this is the
# intended weighting.
index_df['weights'] = index_df['observation_sum'].astype(float)
print(index_df)

         park_name             category       observation_sum  conserv_stat_count  unique_spec  pop_proportion   weights 
0                   Bryce          Amphibian         7380               7               79         0.012808        7380.0
1                   Bryce               Bird        51647              79              488         0.089635       51647.0
2                   Bryce               Fish        12587              12              125         0.021845       12587.0
3                   Bryce             Mammal        20680              41              176         0.035891       20680.0
4                   Bryce  Nonvascular Plant        32992               5              333         0.057258       32992.0
5                   Bryce            Reptile         7950               5               78         0.013797        7950.0
6                   Bryce     Vascular Plant       442959              46             4262         0.768766      442959.0
7   Great Smoky Mountain

In [None]:
# below was done mostly by ChatGPT when given dummy data

def weighted_generalized_mean(series, weights, p):
    """
    Calculate the weighted generalized (power) mean of order p for a series.

    For finite non-zero p this is ``(sum(w * x**p) / sum(w)) ** (1 / p)``.
    Special orders are handled as limits:

    * ``p == 0``    -> weighted geometric mean
    * ``p == +inf`` -> largest value that carries a positive weight
    * ``p == -inf`` -> smallest value that carries a positive weight

    Args:
        series (list or np.ndarray): The data values.
        weights (list or np.ndarray): The weights corresponding to the data
            values (broadcast against ``series`` by NumPy).
        p (float): The order of the generalized mean.

    Returns:
        float: The weighted generalized mean (a NumPy float). The result is
        ``nan`` when the weights sum to zero or, for infinite ``p``, when no
        value has a positive weight.
    """
    # Work in floating point from the start: integer arrays raise on negative
    # integer exponents (e.g. p = -1) and silently wrap around on overflow
    # when large counts are raised to higher powers.
    series = np.asarray(series, dtype=float)
    weights = np.asarray(weights, dtype=float)
    total_weight = np.sum(weights)

    # Handle edge case for p = 0 (geometric mean)
    if p == 0:
        return np.exp(np.sum(weights * np.log(series)) / total_weight)

    # Handle p = +/-inf explicitly; the general formula degenerates there
    # (x**inf followed by a power of 0 does not give the max/min limit).
    if np.isinf(p):
        values_b, weights_b = np.broadcast_arrays(series, weights)
        contributing = values_b[weights_b > 0]
        if contributing.size == 0:
            return np.float64(np.nan)
        return contributing.max() if p > 0 else contributing.min()

    # General case for finite p != 0
    numerator = np.sum(weights * series**p)

    return (numerator / total_weight) ** (1 / p)

# Demonstration on dummy data: order p = 2 gives the weighted quadratic mean
values, weights = [4, 7, 10], [1, 2, 3]
p = 2

result = weighted_generalized_mean(series=values, weights=weights, p=p)
print(f"The weighted generalized mean of order {p} is: {result}")
