# Source Tag Analysis

Analysis of OpenStreetMap changesets that use source tags to indicate data sources.

In [1]:
import json

import duckdb

import util

# One-time setup provided by the `util` module. What it configures is not
# visible in this notebook (NOTE(review): presumably display/plot defaults —
# confirm in util).
util.init()

## Monthly Percentage of Edits and Contributors Using Source Tags

In [2]:
# Monthly share of contributors and edits whose changesets carry a source tag.
#
# Totals and the source-tagged subset are computed in ONE grouped pass using
# conditional aggregates, so the parquet dataset is scanned once instead of
# twice (plus a join). Months without any source-tagged changeset yield 0.
# COALESCE also maps a NULL ratio (e.g. a month whose total is 0/NULL) to 0.
source_share_monthly = duckdb.sql("""
WITH monthly AS (
    SELECT
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        COUNT(DISTINCT user_name) as total_contributors,
        SUM(edit_count) as total_edits,
        COUNT(DISTINCT CASE WHEN source IS NOT NULL THEN user_name END) as contributors_with_source,
        SUM(CASE WHEN source IS NOT NULL THEN edit_count ELSE 0 END) as edits_with_source
    FROM '../changeset_data/year=*/month=*/*.parquet'
    GROUP BY year, month
)
SELECT
    months,
    COALESCE(ROUND((contributors_with_source * 100.0) / total_contributors, 2), 0) as "Percentage Contributors with Source",
    COALESCE(ROUND((edits_with_source * 100.0) / total_edits, 2), 0) as "Percentage Edits with Source"
FROM monthly
ORDER BY year, month
""").df()

# One figure per metric; title and y column differ only by the metric name.
util.show_figure(
    [
        util.FigureConfig(
            title=f"Monthly Percentage of {metric} Using Source Tags",
            label=metric,
            x_col="months",
            y_col=f"Percentage {metric} with Source",
            y_unit_hover_template="%",
            query_or_df=source_share_monthly,
        )
        for metric in ("Contributors", "Edits")
    ]
)

## Monthly Top 10 Sources: Edits, Accumulated Edits, Contributors and Accumulated Contributors

In [3]:
# Monthly and accumulated edits/contributors for the 10 source tags with the
# most edits overall.
#
# "Contributors Accumulated" counts each user once per source, in the month of
# their first changeset with that source. A running sum of the monthly distinct
# counts would count a user again for every month they were active, which
# overstates the number of contributors.
top_sources_monthly = duckdb.sql("""
WITH source_expanded AS (
    SELECT
        year,
        month,
        user_name,
        edit_count,
        unnest(source) as source_tag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE source IS NOT NULL
),
top_sources AS (
    SELECT source_tag
    FROM (
        SELECT
            source_tag,
            SUM(edit_count) as total_edits
        FROM source_expanded
        GROUP BY source_tag
        ORDER BY total_edits DESC
        LIMIT 10
    )
),
user_first_month AS (
    -- First month (encoded as YYYYMM) in which each user used each top source.
    SELECT
        user_name,
        source_tag,
        MIN(CAST(year AS INTEGER) * 100 + CAST(month AS INTEGER)) as first_year_month
    FROM source_expanded
    WHERE source_tag IN (SELECT source_tag FROM top_sources)
    GROUP BY user_name, source_tag
),
monthly_source_data AS (
    SELECT
        se.year,
        se.month,
        CONCAT(se.year, '-', LPAD(CAST(se.month as VARCHAR), 2, '0')) as months,
        se.source_tag,
        COUNT(DISTINCT se.user_name) as "Contributors",
        COUNT(DISTINCT CASE
            WHEN ufm.first_year_month = CAST(se.year AS INTEGER) * 100 + CAST(se.month AS INTEGER)
            THEN se.user_name
        END) as new_contributors,
        SUM(se.edit_count) as "Edits"
    FROM source_expanded se
    -- user_first_month has one row per (user_name, source_tag), so this join
    -- cannot duplicate rows and the edit sums stay unchanged.
    LEFT JOIN user_first_month ufm
        ON se.user_name = ufm.user_name AND se.source_tag = ufm.source_tag
    WHERE se.source_tag IN (SELECT source_tag FROM top_sources)
    GROUP BY se.year, se.month, se.source_tag
)
SELECT
    months,
    source_tag,
    "Contributors",
    "Edits",
    SUM(new_contributors) OVER (PARTITION BY source_tag ORDER BY year, month) as "Contributors Accumulated",
    SUM("Edits") OVER (PARTITION BY source_tag ORDER BY year, month) as "Edits Accumulated"
FROM monthly_source_data
ORDER BY year, month, source_tag
""").df()

# (title, column) pairs in display order: monthly then accumulated, for edits
# and then contributors. The column name doubles as the figure label.
figure_specs = [
    (f"{period} {metric} by Top 10 Source Tags", metric + column_suffix)
    for metric in ("Edits", "Contributors")
    for period, column_suffix in (("Monthly", ""), ("Accumulated", " Accumulated"))
]

util.show_figure(
    [
        util.FigureConfig(
            title=title,
            label=column,
            x_col="months",
            y_col=column,
            group_col="source_tag",
            query_or_df=top_sources_monthly,
        )
        for title, column in figure_specs
    ]
)

## Top 100 Sources Yearly

In [4]:
# Map source display names to clickable HTML anchors for every replacement
# rule that defines a "link". The file is read as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
with open("../config/replace_rules_imagery_and_source.json", encoding="utf-8") as f:
    source_name_to_html_link = {
        name: f'<a href="{item["link"]}">{name}</a>' for name, item in json.load(f).items() if "link" in item
    }

# Per (year, source): edits, contributors and first-time contributors, joined
# with each source's all-time and 2021-onwards totals (repeated on every row of
# that source so the tables below can rank by them).
query = """
WITH source_expanded AS (
    SELECT 
        year,
        user_name,
        edit_count,
        unnest(source) as source_tag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE source IS NOT NULL
),
user_first_year AS (
    SELECT 
        user_name,
        source_tag,
        MIN(year) as first_year
    FROM source_expanded
    GROUP BY user_name, source_tag
),
source_totals AS (
    SELECT
        source_tag as "Source",
        CAST(SUM(edit_count) as BIGINT) as total_edits_all_time,
        CAST(SUM(CASE WHEN year >= 2021 THEN edit_count ELSE 0 END) as BIGINT) as total_edits_2021_now,
        CAST(COUNT(DISTINCT user_name) as BIGINT) as total_contributors_all_time,
        CAST(COUNT(DISTINCT CASE WHEN year >= 2021 THEN user_name END) as BIGINT) as total_contributors_2021_now
    FROM source_expanded
    GROUP BY source_tag
),
yearly_metrics AS (
    SELECT
        se.year,
        se.source_tag as "Source",
        CAST(SUM(se.edit_count) as BIGINT) as "Edits",
        CAST(COUNT(DISTINCT se.user_name) as BIGINT) as "Contributors",
        CAST(COUNT(DISTINCT CASE WHEN ufy.first_year = se.year THEN se.user_name END) as BIGINT) as "New Contributors"
    FROM source_expanded se
    LEFT JOIN user_first_year ufy 
        ON se.user_name = ufy.user_name AND se.source_tag = ufy.source_tag
    GROUP BY se.year, se.source_tag
)
SELECT 
    ym.year,
    ym."Source",
    ym."Edits",
    ym."New Contributors",
    ym."Contributors",
    st.total_edits_all_time as "Total Edits",
    st.total_edits_2021_now as "Total Edits (2021 - Now)",
    st.total_contributors_all_time as "Total Contributors",
    st.total_contributors_2021_now as "Total Contributors (2021 - Now)"
FROM yearly_metrics ym
JOIN source_totals st
    ON ym."Source" = st."Source"
ORDER BY year DESC, "Edits" DESC
"""
df = duckdb.sql(query).df()

# Use the HTML link where one is configured; otherwise keep the plain name.
df["Source"] = df["Source"].map(lambda name: source_name_to_html_link.get(name, name))

# One entry per table:
# (title, per-year value column, total column used for ranking and as the sum
#  column, first year shown or None for all years).
table_specs = [
    ("Top 100 Sources by Contributors", "Contributors", "Total Contributors", None),
    ("Top 100 Sources by Contributors 2021 - Now", "Contributors", "Total Contributors (2021 - Now)", 2021),
    ("Top 100 Sources by Edits", "Edits", "Total Edits", None),
    ("Top 100 Sources by Edits 2021 - Now", "Edits", "Total Edits (2021 - Now)", 2021),
]

table_configs = []
for title, value_col, sum_col, first_year in table_specs:
    # The total is constant within a source, so first() picks that value;
    # keep the 100 sources with the largest total.
    top_100_sources = df.groupby("Source")[sum_col].first().nlargest(100).index
    selected_rows = df["Source"].isin(top_100_sources)
    if first_year is not None:
        selected_rows = selected_rows & (df["year"] >= first_year)
    table_configs.append(
        util.TableConfig(
            title=title,
            query_or_df=df[selected_rows],
            x_axis_col="year",
            y_axis_col="Source",
            value_col=value_col,
            center_columns=["Rank", "Source"],
            sum_col=sum_col,
        )
    )

util.show_tables(table_configs)

Rank,Source,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,Total Contributors
1,Local Knowledge,15,39,96,124,299,1112,1185,1155,6004,13364,14866,16846,16774,16676,24925,73065,53118,178428
2,Survey,50,151,266,332,936,6016,6994,6815,16397,17965,18067,21616,31851,35699,43215,64838,50979,170734
3,Bing Aerial Imagery,0,250,543,628,1521,7987,9704,9369,11150,13935,14177,14847,14617,14549,17520,33157,24364,109629
4,GPS,24,73,142,158,379,2086,1968,1784,2756,4071,4474,5516,5437,4636,5351,11780,8702,40896
5,Streetlevel Imagery,0,0,0,0,0,0,0,0,552,1525,3804,5611,5473,4915,6128,12539,9609,38566
6,Knowledge,5,36,63,95,150,5231,5692,5528,4952,4737,4846,4643,4101,4110,3955,3988,2808,28843
7,Esri World Imagery,0,0,0,0,0,0,8,3,884,2528,3875,5311,5061,4733,5468,5750,4696,19360
8,Maxar Imagery,0,0,0,0,0,0,0,0,0,0,4099,5927,6691,6777,4527,886,458,16667
9,Mapillary,0,0,0,0,0,76,294,452,733,832,2899,4476,4421,3594,3428,3404,2751,15492
10,Mapbox Satellite,0,0,0,0,5,589,1309,1513,1669,1651,1943,2692,2740,2399,2376,2231,1895,12878

Rank,Source,2021,2022,2023,2024,2025,Total Contributors (2021 - Now)
1,Local Knowledge,16774,16676,24925,73065,53118,146919
2,Survey,31851,35699,43215,64838,50979,138345
3,Bing Aerial Imagery,14617,14549,17520,33157,24364,71048
4,Streetlevel Imagery,5473,4915,6128,12539,9609,30945
5,GPS,5437,4636,5351,11780,8702,28464
6,Esri World Imagery,5061,4733,5468,5750,4696,14751
7,Maxar Imagery,6691,6777,4527,886,458,12630
8,Knowledge,4101,4110,3955,3988,2808,11868
9,Mapillary,4421,3594,3428,3404,2751,11330
10,mapwithai,1310,1773,6674,2268,1571,10689

Rank,Source,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,Total Edits
1,Bing Aerial Imagery,0,1104845,5943994,8538817,12991168,173987208,267851133,261768074,271025232,324732716,202089213,207038817,178095386,203860026,251808690,387820797,186040270,2944696386
2,Maxar Imagery,0,0,0,0,0,0,0,0,0,0,145612418,353554693,408562421,303658817,142318444,15868695,3742548,1373318036
3,Esri World Imagery,0,0,0,0,0,0,17993,239758,10061925,58375488,90140592,119054401,93982677,82623613,137202984,196555020,145721381,933975832
4,Survey,43194,364397,862049,1712740,1855142,34186247,40265343,39557199,41176824,56172056,50957047,49945224,55832938,56553374,56619794,66199723,41206469,593509760
5,CanVec,0,18312,205,152732,975848,42633069,18107000,20337259,2129899,11335763,4736884,2891894,724293,10120862,53586798,162185149,108905182,438841149
6,Mapbox Satellite,0,0,0,0,19888,10008126,19191368,23290202,43882174,47136421,35725178,26537119,22505737,22954892,25742919,19467349,23426754,319888127
7,mapwithai,0,0,0,0,0,0,0,0,0,0,147700,7018150,38297311,71128396,72462816,58094289,23130121,270278783
8,DigitalGlobe Imagery (now Maxar),0,3701,1013,146701,222332,1558906,1020733,1648266,43702463,108249504,64681206,4086526,1427252,150542,128030,173436,343060,227543671
9,Kartverket,0,0,0,0,193189,6472510,22616804,12638501,11824485,20021301,16442925,23808966,35847131,18230318,25748106,11069152,1084325,205997713
10,tms[]:https://,0,0,0,0,0,150663,3942516,8811397,64344159,20599410,15163452,42952662,34851999,10207070,2413201,2239419,84926,205760874

Rank,Source,2021,2022,2023,2024,2025,Total Edits (2021 - Now)
1,Bing Aerial Imagery,178095386,203860026,251808690,387820797,186040270,1207625169
2,Maxar Imagery,408562421,303658817,142318444,15868695,3742548,874150925
3,Esri World Imagery,93982677,82623613,137202984,196555020,145721381,656085675
4,CanVec,724293,10120862,53586798,162185149,108905182,335522284
5,Survey,55832938,56553374,56619794,66199723,41206469,276412298
6,mapwithai,38297311,71128396,72462816,58094289,23130121,263112933
7,Local Knowledge,17547534,17780030,21011440,38555737,27422323,122317064
8,Geoportal 2: Orthophotomap,34681231,27461981,19943564,24349122,15159000,121594898
9,Mapbox Satellite,22505737,22954892,25742919,19467349,23426754,114097651
10,Kartverket,35847131,18230318,25748106,11069152,1084325,91979032
