In [1]:
import pandas as pd

# Read the raw fork-length table. The file is decoded as windows-1252; its
# bilingual headers (e.g. "river_rivière") contain accented characters.
df = pd.read_csv(
    filepath_or_buffer="AtlanticSalmonMeanForkLength.csv",
    encoding="windows-1252",
)
# Bare trailing expression so the notebook renders the loaded frame.
df

Unnamed: 0,river_rivière,age_âge,year_année,latitude_latitude,longitude_longitude,n_n,mean fork length_longueur moyenne à la fourche
0,Miramichi River,1,1971,46.933169,-65.650788,251,511.513944
1,Sand Hill River,1,1971,53.581429,-56.351120,389,544.455013
2,Western Arm Brook,1,1971,51.190752,-56.753424,77,527.909091
3,Miramichi River,1,1972,46.933169,-65.650788,687,520.181950
4,Nashwaak River,1,1972,46.117667,-66.589062,70,579.842857
...,...,...,...,...,...,...,...
883,LaHave River,2,2021,44.533330,-64.716670,17,729.235294
884,Miramichi River,2,2021,46.933169,-65.650788,329,760.145897
885,Nashwaak River,2,2021,46.117667,-66.589062,13,784.076923
886,Rivière Saint-Jean,2,2021,48.774378,-64.424531,10,771.700000


In [2]:
# First question: is the combination of river + coordinates unique, i.e. does
# every river name map to exactly one (latitude, longitude) pair?
# If so, the number of distinct (river, lat, lon) rows equals the number of
# distinct river names.
# (The variable name `df_unqiue` is kept as-is in case other cells use it.)
df_unqiue = df[["river_rivière", "latitude_latitude", "longitude_longitude"]].drop_duplicates()
river_name_count = len(df["river_rivière"].unique())
assert len(df_unqiue) == river_name_count, (
    f"expected one coordinate pair per river, but found {len(df_unqiue)} "
    f"distinct (river, latitude, longitude) rows for {river_name_count} river names"
)


### Everything looks good: every river name maps to exactly one latitude/longitude pair.

In [3]:

# Per-river summary statistics, one list per output column.
# Every list is produced from the same groupby, so entry i of each list
# describes the same river (groupby sorts the group keys by default).
grouped = df.groupby("river_rivière")
print(df.columns)

fork_length_col = "mean fork length_longueur moyenne à la fourche"

# Take the river names from the index of an aggregate we need anyway instead
# of averaging every column of the frame just to read its index.
latitude_by_river = grouped["latitude_latitude"].mean()
rivers = latitude_by_river.index.to_list()
latitude = latitude_by_river.to_list()
longitude = grouped["longitude_longitude"].mean().to_list()

# NOTE(review): the table appears to hold one row per river/age/year, so these
# are means and standard deviations over those rows, not over yearly totals
# and not weighted by `n_n` -- confirm that is the intended statistic.
# `.std()` is NaN for a river that has only a single row.
mean_annual_count = grouped["n_n"].mean().to_list()
mean_annual_count_std = grouped["n_n"].std().to_list()
mean_fork_length = grouped[fork_length_col].mean().to_list()
mean_fork_length_std = grouped[fork_length_col].std().to_list()

Index(['river_rivière', 'age_âge', 'year_année', 'latitude_latitude',
       'longitude_longitude', 'n_n',
       'mean fork length_longueur moyenne à la fourche'],
      dtype='object')


In [4]:


# Per-river text summaries that need more than a plain aggregate:
#   * years_sampled   -- sampled years collapsed into ranges, e.g. "1971-1975, 1980"
#   * sample_count    -- number of distinct sampled years
#   * age_percentages -- share of the river's rows belonging to each age class


def _format_year_ranges(years):
    """Collapse ascending years into a compact string of inclusive ranges.

    Consecutive years are merged into ``"first-last"``; isolated years are
    written on their own; parts are joined with ``", "``.  An empty input
    yields an empty string.
    """
    runs = []  # each entry is a [first_year, last_year] pair
    for year in years:
        year = int(year)
        if runs and year == runs[-1][1] + 1:
            # Directly follows the current run: extend it.
            runs[-1][1] = year
        else:
            runs.append([year, year])
    return ", ".join(
        str(first) if first == last else f"{first}-{last}"
        for first, last in runs
    )


def _format_age_breakdown(rows):
    """Describe which fraction of ``rows`` falls into each ``age_âge`` value.

    Percentages are computed over rows (not weighted by ``n_n``) and are
    listed in the order returned by ``value_counts`` (most frequent first).
    """
    total = len(rows)
    return ", ".join(
        f"age {int(age)}: {(count / total):.0%}"
        for age, count in rows.value_counts("age_âge").items()
    )


years_sampled = list()
sample_count = list()
age_percentages = list()
for river in rivers:
    # Rows of this river, without any row that has a missing value.
    # Boolean indexing + dropna() selects the same rows as
    # `df.where(mask).dropna()` but keeps the original integer dtypes, and the
    # selection is computed once instead of once per statistic.
    river_rows = df[df["river_rivière"] == river].dropna()

    years = sorted(river_rows["year_année"].unique())
    years_sampled.append(_format_year_ranges(years))
    sample_count.append(len(years))

    age_percentages.append(_format_age_breakdown(river_rows))


In [5]:
# Assemble the English-labelled per-river summary table and export it as CSV.
# Each value below is one of the per-river lists built in the earlier cells;
# the pairs are listed in the column order of the output file.
english_columns = [
    ("river name", rivers),
    ("latitude", latitude),
    ("longitude", longitude),
    ("annual count, mean", mean_annual_count),
    ("annual count, standard deviation", mean_annual_count_std),
    ("fork length, mean", mean_fork_length),
    ("fork length, standard deviation", mean_fork_length_std),
    ("number of years sampled", sample_count),
    ("years sampled, inclusive", years_sampled),
    ("age percentage breakdown", age_percentages),
]
final_df_eng = pd.DataFrame(dict(english_columns))

# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an unnamed first column -- confirm that this is wanted in the export.
final_df_eng.to_csv("AtlanticSalmonMeanForkLength WMS (EN).csv")

In [6]:
# Assemble the French-labelled per-river summary table and export it as CSV.
# Same data and column order as the English table; only the headers differ.
# NOTE(review): the two standard-deviation headers are spelled differently
# ("écart-type" vs "écart type"). The GeoJSON export cell reads these exact
# keys, so any correction has to be made in both places together.
french_columns = [
    ("nom de la rivière", rivers),
    ("latitude", latitude),
    ("longitude", longitude),
    ("comptage annuel, moyenne", mean_annual_count),
    ("comptage annuel, écart-type", mean_annual_count_std),
    ("longueur de la fourche, moyenne", mean_fork_length),
    ("longueur de la fourche, écart type", mean_fork_length_std),
    ("nombre d'années d'échantillonnage", sample_count),
    ("années échantillonnées, incluses", years_sampled),
    ("ventilation des pourcentages d'âge", age_percentages),
]
final_df_fra = pd.DataFrame(dict(french_columns))

# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an unnamed first column -- confirm that this is wanted in the export.
final_df_fra.to_csv("AtlanticSalmonMeanForkLength WMS (FR).csv")

In [8]:
import geojson
from geojson import Feature, Point, FeatureCollection

# Export the English summary table as a GeoJSON FeatureCollection: one point
# feature per river, located at (longitude, latitude), carrying every column
# of the table as a property.
# The properties are derived from the frame's own columns (same keys, same
# order as the table) so they cannot drift out of sync with it.
# NOTE(review): a river with a single row has a NaN standard deviation --
# check how the geojson encoder treats NaN values.
feature_list = [
    Feature(
        geometry=Point((row["longitude"], row["latitude"])),
        properties={column: row[column] for column in final_df_eng.columns},
    )
    for _, row in final_df_eng.iterrows()
]

collection = FeatureCollection(feature_list)
with open("AtlanticSalmonMeanForkLength WMS (EN).geojson", "w", encoding="utf8") as file:
    # geojson.dumps already returns a str; the previous
    # .encode("utf8").decode("utf8") round trip was a no-op and is dropped.
    file.write(geojson.dumps(collection))
    
    

In [9]:
# Export the French summary table as a GeoJSON FeatureCollection: one point
# feature per river, located at (longitude, latitude), carrying every column
# of the table as a property.
# The properties are derived from the frame's own columns (same keys, same
# order as the table) so they cannot drift out of sync with it.
# NOTE(review): a river with a single row has a NaN standard deviation --
# check how the geojson encoder treats NaN values.
feature_list = [
    Feature(
        geometry=Point((row["longitude"], row["latitude"])),
        properties={column: row[column] for column in final_df_fra.columns},
    )
    for _, row in final_df_fra.iterrows()
]

collection = FeatureCollection(feature_list)
with open("AtlanticSalmonMeanForkLength WMS (FR).geojson", "w", encoding="utf8") as file:
    # geojson.dumps already returns a str; the previous
    # .encode("utf8").decode("utf8") round trip was a no-op and is dropped.
    file.write(geojson.dumps(collection))
  
  