In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

sys.path.append(str(Path.resolve(Path.cwd() / ".." / "..")))

import new_util as util

# Notebook-wide configuration; DATA_DIR is passed to the util load/save helpers used below.
# NOTE(review): the path is relative to the kernel's working directory — confirm where the notebook is started from.
DATA_DIR = "../../../temp"
# In the visible cells MONTHS is only used for its length and YEARS only for its positions
# (presumably the ordered month and year labels of the dataset — verify in new_util).
MONTHS, YEARS = util.get_months_years(DATA_DIR)
TIME_DICT = util.get_month_year_dicts(DATA_DIR)

# TODO: decide whether this should delete the files in the data folder and clear the content of plots.json.
def reset_data_and_plots():
    """Placeholder for clearing previously generated data and plot configs.

    Currently a no-op: the body is `pass` until the TODO above is resolved.
    """
    pass

# Called at the top of the notebook so a future implementation runs before any data is produced.
reset_data_and_plots()

In [9]:
# Tags whose index column reserves the dtype's maximum value as a sentinel that must not be ranked
# (presumably "tag not set" — verify against the code that writes the parquet files).
_TAGS_WITH_SENTINEL = ("created_by", "corporation", "streetcomplete")

# Minimum number of changesets a tag value needs before its contributors are counted at all.
# A tag value with N changesets can have at most N contributors, so each threshold only has to stay
# below the contributor count of the lowest-ranked entry that is ever requested (top 500 in previous runs).
_MIN_CHANGESETS_BY_TAG = {
    "created_by": 30,
    "imagery": 500,
    "hashtag": 2500,
    "source": 250,
    "corporation": 0,
    "streetcomplete": 0,
}


def _drop_sentinel_rows(ddf, tag):
    """Return `ddf` without the rows whose `tag` column holds the dtype's maximum value."""
    if tag in _TAGS_WITH_SENTINEL:
        highest_number = np.iinfo(ddf[tag].dtype).max
        ddf = ddf[ddf[tag] < highest_number]
    return ddf


def _top_k_of_series(totals, tag, k):
    """Return (indices, names, values) of the `k` largest entries of the computed series `totals`."""
    # Slice positionally after sorting: index and values then come from the same rows, without a
    # second lookup through `totals[indices]`, whose label-vs-position meaning depends on the index dtype.
    top = totals.sort_values(ascending=False).iloc[:k]
    indices = top.index.to_numpy()
    index_to_tag = util.load_index_to_tag(DATA_DIR, tag)
    names = [index_to_tag[i] for i in indices]
    return indices, names, top.to_numpy()


def get_top_k_contributor_count(ddf, tag, k):
    """Rank the values of `tag` by their number of distinct contributors.

    Args:
        ddf: dask dataframe with at least the columns `tag` and "user_index".
        tag: name of the tag index column to rank (e.g. "created_by").
        k: number of top entries to return.

    Returns:
        Tuple `(indices, names, values)`: the tag indices as a numpy array, their names looked up via
        `util.load_index_to_tag`, and the distinct-contributor counts as a numpy array, all sorted by
        descending count and limited to `k` entries.
    """
    ddf = _drop_sentinel_rows(ddf, tag)

    # Counting distinct users is expensive, so it is only done for tag values that have more than a
    # minimum number of changesets.
    changesets_per_tag = ddf.groupby(tag).size().compute()
    min_amount_of_changeset = _MIN_CHANGESETS_BY_TAG.get(tag, 0)
    frequent_tags = changesets_per_tag[changesets_per_tag > min_amount_of_changeset].index
    filtered_ddf = ddf[ddf[tag].isin(frequent_tags)]

    contributors_per_tag = filtered_ddf.groupby(tag)["user_index"].nunique().compute()
    return _top_k_of_series(contributors_per_tag, tag, k)


def get_top_k_edit_count(ddf, tag, k):
    """Rank the values of `tag` by the sum of their "edits" column.

    Args:
        ddf: dask dataframe with at least the columns `tag` and "edits".
        tag: name of the tag index column to rank.
        k: number of top entries to return.

    Returns:
        Tuple `(indices, names, values)` sorted by descending edit sum and limited to `k` entries.
    """
    ddf = _drop_sentinel_rows(ddf, tag)
    edits_per_tag = ddf.groupby(tag)["edits"].sum().compute()
    return _top_k_of_series(edits_per_tag, tag, k)


def get_top_k_changeset_count(ddf, tag, k):
    """Rank the values of `tag` by their number of rows (changesets).

    Args:
        ddf: dask dataframe with at least the column `tag`.
        tag: name of the tag index column to rank.
        k: number of top entries to return.

    Returns:
        Tuple `(indices, names, values)` sorted by descending row count and limited to `k` entries.
    """
    ddf = _drop_sentinel_rows(ddf, tag)
    changesets_per_tag = ddf.groupby(tag).size().compute()
    return _top_k_of_series(changesets_per_tag, tag, k)


def top_k_unique_contributor_monthly_list(contributor_count_ddf_filename, tag, indices):
    """Collect, per requested tag value and month, the set of users that contributed.

    Args:
        contributor_count_ddf_filename: dataset name handed to `util.load_ddf`.
        tag: name of the tag index column to group by.
        indices: tag index values to collect users for.

    Returns:
        A list with one entry per element of `indices` (same order). Each entry is a list with one
        set per month in MONTHS holding the "user_index" values found for that tag value in that
        month; the set is empty when there were none.
    """
    month_count = len(MONTHS)
    users_by_tag_and_month = [[set() for _ in range(month_count)] for _ in range(len(indices))]

    # The data is loaded one month at a time via the "month_index" filter.
    for month_i in range(month_count):
        month_ddf = util.load_ddf(
            DATA_DIR,
            contributor_count_ddf_filename,
            ("user_index", tag),
            [("month_index", "==", month_i)],
        )
        is_requested = month_ddf[tag].isin(indices)
        users_per_tag = month_ddf[is_requested].groupby([tag], observed=False)["user_index"].unique().compute()

        for position, tag_index in enumerate(indices):
            matches = users_per_tag[users_per_tag.index == tag_index].to_numpy()
            if len(matches) == 0:
                # Tag value not present this month: keep the pre-filled empty set.
                continue
            users_by_tag_and_month[position][month_i] = set(matches[0].tolist())

    return users_by_tag_and_month

# Contributor Count

In [10]:
# Rank the "created_by" values by their all-time number of distinct contributors and keep the first 100.
ddf = util.load_ddf(DATA_DIR, "general", ["month_index", "user_index", "created_by"])
k = 100
top_100_contrib_indices, top_100_contrib_names, top_100_contrib_values = get_top_k_contributor_count(ddf, "created_by", k)
# One set of user indices per (ranked created_by value, month); the monthly and yearly cells below build on it.
unique_monthly_list = top_k_unique_contributor_monthly_list("general", "created_by", top_100_contrib_indices)

In [11]:
# unique_monthly_list holds one list of monthly user sets per ranked created_by value; after the
# transpose, rows are months and columns are the ranked names, each cell being the size of the user set.
top_100_monthly_contributor_count = (
    pd.DataFrame(
        np.array([[len(users) for users in monthly_sets] for monthly_sets in unique_monthly_list]).transpose(),
        columns=top_100_contrib_names,
    )
    .rename_axis("month_index")
    .reset_index()
)
util.save_data(DATA_DIR, "created_by_top_100_contributor_count_monthly", top_100_monthly_contributor_count)
# Column 0 is "month_index" after reset_index, so the first 11 columns hold the 10 highest-ranked names.
util.save_data(
    DATA_DIR,
    "created_by_top_10_contributor_count_monthly",
    top_100_monthly_contributor_count.iloc[:, :11],
)

In [12]:
# Build, persist and display the plot config. The first argument equals the name of the data saved
# in the previous cell (presumably how the plot locates its data — verify in new_util).
plot_config = util.get_plot_config("created_by_top_10_contributor_count_monthly", "monthly contributors per editing software", y_unit="contributors")
util.save_plot_config(plot_config)
util.show_plot(plot_config)

In [13]:
# Indexed by year index below; each entry is iterated to obtain the month indices of that year.
year_index_to_month_indices = util.get_year_index_to_month_indices(DATA_DIR)

# Yearly contributor counts cannot be summed from the monthly counts (a user active in several
# months would be counted repeatedly), so the monthly user sets are unioned per year first.
contributor_count_yearly_list = []
for month_list in unique_monthly_list:  # one list of monthly user sets per ranked created_by value
    contributor_count_yearly = []
    for year_i, _ in enumerate(YEARS):
        contributor_count_yearly.append(
            len(set().union(*[month_list[month_i] for month_i in year_index_to_month_indices[year_i]])),
        )
    contributor_count_yearly_list.append(contributor_count_yearly)

# Transposed so that rows are years and columns are the ranked names.
top_100_yearly_contributor_count = pd.DataFrame(np.array(contributor_count_yearly_list).transpose(), columns=top_100_contrib_names).rename_axis("year_index").reset_index()
# NOTE(review): plain alias of the frame above — no "total" column is added here, and the name is not used in the visible cells.
top_100_yearly_contributor_count_and_total = top_100_yearly_contributor_count
util.save_data(DATA_DIR, "created_by_top_100_contributor_count_yearly", top_100_yearly_contributor_count)

In [15]:
# NOTE(review): despite the variable name, this holds whatever util.get_html_table_str returns; it is
# passed to the same save/show helpers as the plot configs.
# The all-time distinct-contributor counts from the ranking are supplied as the "total" column; summing
# the yearly columns instead would count a user once for every year they were active.
plot_config = util.get_html_table_str("created_by_top_100_contributor_count_yearly", transpose_new_index_column="years", add_rank=True, last_column_name_values=("total", top_100_contrib_values))
util.save_plot_config(plot_config)
util.show_plot(plot_config)

Rank,Editor,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,total
1,iD,0,0,0,0,56176,125149,133826,148447,194537,214057,203438,241197,239537,204579,210136,29969,1413350
2,Potlatch,59540,69047,81896,107158,72954,29604,24286,14901,10184,6598,5043,3531,608,350,234,77,358269
3,MAPS.ME android,0,0,0,0,0,0,0,60058,63947,37666,32622,20331,12043,6626,10651,1279,192520
4,JOSM,13755,18797,20232,23441,23140,21862,22796,22315,23191,22762,22801,22109,21512,20536,19741,5914,143216
5,MAPS.ME ios,0,0,0,0,0,0,0,37424,39443,34149,23541,15385,8052,8966,5658,453,129302
6,StreetComplete,0,0,0,0,0,0,0,7,9572,9764,8967,11695,21935,23525,26809,5404,65258
7,OsmAnd,0,192,645,1129,1653,1927,2354,3534,4863,5982,7064,6978,7959,8623,8544,1424,33616
8,Vespucci,58,237,460,957,1622,1801,2075,2380,2920,3332,3632,4314,4539,5025,5371,1044,21349
9,Go Map!!,0,0,0,0,2205,2046,1335,1508,1834,2685,3074,3152,3748,3720,3830,873,16716
10,Rapid,0,0,0,0,0,0,0,0,0,0,1105,3709,2124,2183,9121,537,15683


In [16]:
# Distinct contributors per month over the whole dataset (not restricted to any created_by value).
ddf = util.load_ddf(DATA_DIR, "general", ["month_index", "user_index"])
contributors_unique_monthly = ddf.groupby(["month_index"], observed=False)["user_index"].unique().rename("contributors").compute()
# Only the per-month counts are saved; the per-month user arrays stay in contributors_unique_monthly
# because a later cell passes them to util.cumsum_new_nunique.
util.save_data(DATA_DIR, "general_contributor_count_monthly", contributors_unique_monthly.apply(len))

# Presumably writes "created_by_top_10_contributor_count_monthly_percent" (the name plotted in the next
# cell) by relating the first dataset to the second — verify in new_util.save_percent.
util.save_percent(
    "created_by_top_10_contributor_count_monthly",
    "general_contributor_count_monthly",
    "months",
    "contributors",
)

In [17]:
# Build, persist and display the plot config for the "_percent" dataset requested via util.save_percent above.
plot_config = util.get_plot_config("created_by_top_10_contributor_count_monthly_percent", "percent of contributors that use each editing software per month", y_unit="%")
util.save_plot_config(plot_config)
util.show_plot(plot_config)

## monthly first editing software contributor count

In [48]:
# Attribute every contributor to the (month, editor) of their first row and count contributors per pair.
# Depends on top_100_contrib_indices / top_100_contrib_names from the "Contributor Count" section.
ddf = util.load_ddf(DATA_DIR, "general", ["month_index", "user_index", "created_by"])
tag = "created_by"
# Several columns must be selected from a groupby with a list: the bare-tuple form
# `["month_index", tag]` was deprecated in pandas 1.0 and raises since pandas 2.0.
# NOTE(review): first() takes the first row per user in stored row order — this assumes the data is
# ordered chronologically; confirm against how the parquet files are written.
first_edit_per_user = ddf.groupby(["user_index"], observed=False)[["month_index", tag]].first().compute()
first_contibutor_count_monthly = (
    first_edit_per_user.reset_index().groupby(["month_index", tag], observed=False)["user_index"].count()
)
first_contibutor_count_monthly = first_contibutor_count_monthly.rename("contributors").reset_index()

# Wide format: one row per month, one column per editor, restricted to the 10 highest-ranked editors.
first_contibutor_count_monthly = pd.pivot_table(
    first_contibutor_count_monthly, values="contributors", index="month_index", columns=tag
)[top_100_contrib_indices[:10]]
first_contibutor_count_monthly.columns = top_100_contrib_names[:10]
first_contibutor_count_monthly = first_contibutor_count_monthly.reset_index()
util.save_data(DATA_DIR, "created_by_top_10_contributor_count_first_changeset_monthly", first_contibutor_count_monthly)

In [51]:
# contributors_unique_monthly (per-month arrays of user indices) comes from the cell that saves
# "general_contributor_count_monthly". Presumably util.cumsum_new_nunique turns it into the number of
# users seen for the first time in each month — verify in new_util.
util.save_data(DATA_DIR, "general_new_contributor_count_monthly", util.cumsum_new_nunique(contributors_unique_monthly))
# Presumably writes the "_percent" dataset plotted in the next cell — verify in new_util.save_percent.
util.save_percent(
    "created_by_top_10_contributor_count_first_changeset_monthly",
    "general_new_contributor_count_monthly",
    "months",
    "contributors",
)

In [53]:
# Build, persist and display the plot config for the "_percent" dataset requested via util.save_percent above.
plot_config = util.get_plot_config("created_by_top_10_contributor_count_first_changeset_monthly_percent", "first editor of new contributors per month", y_unit="%")
util.save_plot_config(plot_config)
util.show_plot(plot_config)

# Edit Count

In [18]:
# Rank the "created_by" values by their all-time sum of "edits" and keep the first 100.
# NOTE: this rebinds the notebook-global `ddf`; the next cell relies on this three-column version.
ddf = util.load_ddf(DATA_DIR, "general", ["month_index", "edits", "created_by"])
k = 100
top_100_edit_indices, top_100_edit_names, top_100_edit_values = get_top_k_edit_count(ddf, "created_by", k)

In [19]:
# Sum the edits per (month, created_by) pair, restricted to the 100 ranked created_by values.
monthly_edit_count_df = (
    ddf[ddf["created_by"].isin(top_100_edit_indices)]
    .groupby(["month_index", "created_by"], observed=False)["edits"]
    .sum()
    .compute()
    .reset_index()
)
# Wide format: one row per month, one column per created_by value, columns ordered by rank and then
# relabelled with the matching names.
monthly_edit_count_df = pd.pivot_table(monthly_edit_count_df, values="edits", index="month_index", columns="created_by")[top_100_edit_indices]
monthly_edit_count_df.columns = top_100_edit_names
monthly_edit_count_df = monthly_edit_count_df.reset_index()
util.save_data(DATA_DIR, "created_by_top_100_edit_count_monthly", monthly_edit_count_df)
# Column 0 is "month_index" after reset_index, so ":10+1" keeps the 10 highest-ranked names.
util.save_data(DATA_DIR, "created_by_top_10_edit_count_monthly", monthly_edit_count_df.iloc[:, :10+1])

In [20]:
# Build, persist and display the plot config. The first argument equals the name of the top-10 data
# saved in the previous cell (presumably how the plot locates its data — verify in new_util).
plot_config = util.get_plot_config("created_by_top_10_edit_count_monthly", "monthly edits per editing software", y_unit="edits")
util.save_plot_config(plot_config)
util.show_plot(plot_config)

In [23]:
# Presumably aggregates the saved monthly data into "created_by_top_100_edit_count_yearly", the name the
# table below reads — verify in new_util.save_monthly_to_yearly.
util.save_monthly_to_yearly("created_by_top_100_edit_count_monthly")
# NOTE(review): despite the variable name, this holds whatever util.get_html_table_str returns.
# The all-time edit sums from the ranking are supplied as the "total" column.
plot_config = util.get_html_table_str("created_by_top_100_edit_count_yearly", transpose_new_index_column="years", add_rank=True, last_column_name_values=("total", top_100_edit_values))
util.save_plot_config(plot_config)
util.show_plot(plot_config)

Rank,Editor,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,total
1,JOSM,125617700,494460089,539095272,542112695,500031282,666695875,642724561,625396093,663901878,793994770,790692615,996035337,917479929,758553217,756755624,85510186,9899057123
2,iD,0,0,0,0,34430862,109275830,142202485,189396418,279162048,321342148,359664267,459865024,480370002,416745824,427963559,39107936,3259526403
3,Potlatch,44030413,64726838,100975427,147722227,133843984,78110810,60205870,43031441,33578906,25192118,20654187,17405786,5083884,2678828,2574667,220722,780036108
4,Rapid,0,0,0,0,0,0,0,0,0,0,11678103,51870526,63919308,63564994,63683830,4745709,259462470
5,osmtools,150412240,8994328,981921,1229689,2734257,1499370,6826400,15035558,2657362,3272976,7558685,6454470,8152531,12399003,29128241,45092,257382123
6,bulk_upload.py,64669041,24649625,1442184,759090,185033,8089,1061001,1113638,2398,1036,26933301,263564,464798,0,0,0,121552798
7,Merkaartor,5378663,10887187,10250061,9702870,4380406,3791724,3696670,2523573,1735352,2138115,2316287,1745000,334114,491775,123828,4594,59500219
8,upload.py,31167015,10166002,2424682,421238,776590,422731,488194,4827,38997,3021,4195,1853644,4378760,822536,968637,2739,53943808
9,StreetComplete,0,0,0,0,0,0,0,85,1256645,1701151,1983767,3934683,12440285,12905084,13630661,837371,48689732
10,osmapi,0,0,0,0,0,797190,139521,682197,288536,240521,162631,2095734,23940913,1860602,1030376,23192,31261413


In [24]:
# Which editor is used for the first edit?
# For every contributor I want their first contribution; from that contribution I need the pair (month, created_by).
# Counting how often each pair occurs then yields the table.


#ddf = util.load_ddf(DATA_DIR, "general", ["month_index", "created_by"])
#k = 100
#top_100_change_indices, top_100_change_names, top_100_change_values = get_top_k_changeset_count(ddf, "created_by", k)

In [None]:


#contibutor_monthly = (ddf[ddf[tag].isin(indices)].groupby(["user_index"], observed=False)["month_index", tag].first().compute())
#contibutor_count_monthly = (contibutor_monthly.reset_index().groupby(["month_index", tag], observed=False)["user_index"].count())

In [None]:

#ddf": load_parquet_func("general", ["month_index", "user_index", "created_by"]),

def get_tag_top_10_contributor_count_first_changeset_monthly(ddf, tag):
    """Count, per month, how many contributors had each top-10 tag value on their first row.

    Args:
        ddf: dask dataframe with the columns "month_index", "user_index" and `tag`.
        tag: name of the tag index column (e.g. "created_by").

    Returns:
        A pandas DataFrame with a "month_index" column followed by one column per top-10 tag name;
        (month, tag value) pairs that never occur are NaN.
    """
    # NOTE(review): get_cached_top_k is not defined in the visible part of this notebook; it is
    # expected to return (indices, names) for the 10 highest-ranked tag values — confirm where it lives.
    indices, names = get_cached_top_k(ddf, tag, "contributor_count", 10)

    # Previously the monthly counts were read before ever being assigned (UnboundLocalError).
    # The computation mirrors the notebook cell that saves
    # "created_by_top_10_contributor_count_first_changeset_monthly".
    # NOTE(review): first() takes the first row per user in stored row order — assumes chronological order.
    first_changeset_per_user = ddf.groupby(["user_index"], observed=False)[["month_index", tag]].first().compute()
    contributor_count_monthly = (
        first_changeset_per_user.reset_index().groupby(["month_index", tag], observed=False)["user_index"].count()
    )
    contributor_count_monthly = contributor_count_monthly.rename("contributors").reset_index()

    # Wide format: one row per month, one column per top-10 tag value, relabelled with the tag names.
    df = pd.pivot_table(contributor_count_monthly, values="contributors", index="month_index", columns=tag)[indices]
    df.columns = names
    return df.reset_index()