# Tag Analysis

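Analysis of OpenStreetMap changesets by tag prefix (for example `created_by`, `comment`, `source`), which indicates the type of data being edited. The notebook reports edits and contributors per tag prefix: monthly for the top 10 prefixes, and yearly for the top 100.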

In [1]:
import duckdb
import util

# One-time setup from the project-local `util` module, run before any query or figure.
# NOTE(review): what init() configures is not visible in this notebook — confirm in util.
util.init()

## Monthly Top 10 Tag Prefixes: Edits, Contributors, and Percentage Analysis

In [2]:
# Monthly activity of the 10 tag prefixes with the most edits overall.
#
# For every (month, tag prefix) the query returns:
#   - "Contributors": distinct user_name values,
#   - "Edits": summed edit_count,
#   - both of the above as a percentage (rounded to 2 decimals) of that month's
#     totals over all rows whose all_tags is not NULL.
#
# A row's edit_count is repeated once per element of all_tags, so the edit
# percentages of different prefixes overlap and need not add up to 100.
# NOTE(review): if all_tags can contain the same prefix twice in one row, that
# row's edits are counted twice for the prefix — confirm against the data export.
df = duckdb.sql("""
WITH tags_expanded AS (
    -- One output row per element of all_tags.
    SELECT
        year,
        month,
        user_name,
        edit_count,
        unnest(all_tags) AS tag_prefix
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE all_tags IS NOT NULL
),
top_tags AS (
    -- tag_prefix as secondary sort key keeps the top 10 stable when totals tie.
    SELECT tag_prefix
    FROM tags_expanded
    GROUP BY tag_prefix
    ORDER BY SUM(edit_count) DESC, tag_prefix
    LIMIT 10
),
monthly_tag_data AS (
    SELECT
        te.year,
        te.month,
        CONCAT(te.year, '-', LPAD(CAST(te.month AS VARCHAR), 2, '0')) AS months,
        te.tag_prefix,
        COUNT(DISTINCT te.user_name) AS contributors,
        SUM(te.edit_count) AS edits
    FROM tags_expanded te
    WHERE te.tag_prefix IN (SELECT tag_prefix FROM top_tags)
    GROUP BY te.year, te.month, te.tag_prefix
),
monthly_totals AS (
    -- Both monthly denominators come from a single scan of the parquet files.
    SELECT
        year,
        month,
        COUNT(DISTINCT user_name) AS total_contributors,
        SUM(edit_count) AS total_edits
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE all_tags IS NOT NULL
    GROUP BY year, month
)
SELECT
    mtd.months,
    mtd.tag_prefix,
    mtd.contributors AS "Contributors",
    mtd.edits AS "Edits",
    ROUND((mtd.contributors * 100.0) / mt.total_contributors, 2) AS "Percentage of Contributors",
    ROUND((mtd.edits * 100.0) / mt.total_edits, 2) AS "Percentage of Edits"
FROM monthly_tag_data mtd
JOIN monthly_totals mt
    ON mtd.year = mt.year AND mtd.month = mt.month
ORDER BY mtd.year, mtd.month, mtd.tag_prefix
""").df()


def _tag_prefix_figure(frame, title, metric, **options):
    """Build a FigureConfig for one metric column of the monthly frame.

    `metric` is used both as the figure label and as the y column; the x column
    is always "months" and the grouping column is always "tag_prefix".
    Extra keyword arguments are forwarded to util.FigureConfig unchanged.
    """
    return util.FigureConfig(
        title=title,
        label=metric,
        x_col="months",
        y_col=metric,
        group_col="tag_prefix",
        query_or_df=frame,
        **options,
    )


util.show_figure(
    [
        _tag_prefix_figure(df, "Monthly Edits by Top 10 Tag Prefixes", "Edits"),
        _tag_prefix_figure(
            df,
            "Monthly Percentage of Edits by Top 10 Tag Prefixes",
            "Percentage of Edits",
            y_unit_hover_template="%",
        ),
        _tag_prefix_figure(df, "Monthly Contributors by Top 10 Tag Prefixes", "Contributors"),
        _tag_prefix_figure(
            df,
            "Monthly Percentage of Contributors by Top 10 Tag Prefixes",
            "Percentage of Contributors",
            y_unit_hover_template="%",
        ),
    ]
)

## Top 100 Tag Prefixes Yearly

In [3]:
# Yearly metrics per tag prefix, plus per-prefix totals used for ranking.
#
# Result columns:
#   - year, "Tag Prefix"
#   - "Edits": summed edit_count for that year and prefix
#   - "Contributors": distinct user_name values for that year and prefix
#   - "New Contributors": distinct users whose earliest year with this prefix
#     (MIN(year) in user_first_year) equals the row's year
#   - "Total ..." columns: all-time and year >= 2021 totals per prefix, repeated
#     on every yearly row of that prefix by the join with tag_totals
query = """
WITH tags_expanded AS (
    SELECT 
        year,
        user_name,
        edit_count,
        unnest(all_tags) as tag_prefix
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE all_tags IS NOT NULL
),
user_first_year AS (
    SELECT 
        user_name,
        tag_prefix,
        MIN(year) as first_year
    FROM tags_expanded
    GROUP BY user_name, tag_prefix
),
tag_totals AS (
    SELECT
        tag_prefix as "Tag Prefix",
        CAST(SUM(edit_count) as BIGINT) as total_edits_all_time,
        CAST(SUM(CASE WHEN year >= 2021 THEN edit_count ELSE 0 END) as BIGINT) as total_edits_2021_now,
        CAST(COUNT(DISTINCT user_name) as BIGINT) as total_contributors_all_time,
        CAST(COUNT(DISTINCT CASE WHEN year >= 2021 THEN user_name END) as BIGINT) as total_contributors_2021_now
    FROM tags_expanded
    GROUP BY tag_prefix
),
yearly_metrics AS (
    SELECT
        te.year,
        te.tag_prefix as "Tag Prefix",
        CAST(SUM(te.edit_count) as BIGINT) as "Edits",
        CAST(COUNT(DISTINCT te.user_name) as BIGINT) as "Contributors",
        CAST(COUNT(DISTINCT CASE WHEN ufy.first_year = te.year THEN te.user_name END) as BIGINT) as "New Contributors"
    FROM tags_expanded te
    LEFT JOIN user_first_year ufy 
        ON te.user_name = ufy.user_name AND te.tag_prefix = ufy.tag_prefix
    GROUP BY te.year, te.tag_prefix
)
SELECT 
    ym.year,
    ym."Tag Prefix",
    ym."Edits",
    ym."New Contributors",
    ym."Contributors",
    tt.total_edits_all_time as "Total Edits",
    tt.total_edits_2021_now as "Total Edits (2021 - Now)",
    tt.total_contributors_all_time as "Total Contributors",
    tt.total_contributors_2021_now as "Total Contributors (2021 - Now)"
FROM yearly_metrics ym
JOIN tag_totals tt
    ON ym."Tag Prefix" = tt."Tag Prefix"
ORDER BY year DESC, "Edits" DESC
"""
df = duckdb.sql(query).df()


def _top_prefixes(frame, total_col, n=100):
    """Return the `n` largest per-prefix values of `total_col`, indexed by "Tag Prefix".

    `total_col` carries the same value on every yearly row of a prefix (it comes
    from the per-prefix tag_totals join), so taking the first value per group
    yields that prefix's total.
    """
    return frame.groupby("Tag Prefix")[total_col].first().nlargest(n)


def _prefix_table(frame, title, top, value_col, sum_col, since_year=None):
    """Build a year-by-prefix TableConfig restricted to the prefixes in `top`.

    frame: yearly metrics frame produced by the query above.
    top: Series whose index holds the tag prefixes to keep.
    value_col: column passed to TableConfig as `value_col`.
    sum_col: column passed to TableConfig as `sum_col`.
    since_year: when given, only rows with year >= since_year are kept.
    """
    keep = frame["Tag Prefix"].isin(top.index)
    if since_year is not None:
        keep = keep & (frame["year"] >= since_year)
    return util.TableConfig(
        title=title,
        query_or_df=frame[keep],
        x_axis_col="year",
        y_axis_col="Tag Prefix",
        value_col=value_col,
        center_columns=["Rank", "Tag Prefix"],
        sum_col=sum_col,
    )


top_100_contributors = _top_prefixes(df, "Total Contributors")
top_100_contributors_2021_now = _top_prefixes(df, "Total Contributors (2021 - Now)")
top_100_edits = _top_prefixes(df, "Total Edits")
top_100_edits_2021_now = _top_prefixes(df, "Total Edits (2021 - Now)")

table_configs = [
    _prefix_table(
        df,
        "Top 100 Tag Prefixes by Contributors",
        top_100_contributors,
        value_col="Contributors",
        sum_col="Total Contributors",
    ),
    _prefix_table(
        df,
        "Top 100 Tag Prefixes by Contributors 2021 - Now",
        top_100_contributors_2021_now,
        value_col="Contributors",
        sum_col="Total Contributors (2021 - Now)",
        since_year=2021,  # must match the `year >= 2021` cut-off used in tag_totals
    ),
    _prefix_table(
        df,
        "Top 100 Tag Prefixes by Edits",
        top_100_edits,
        value_col="Edits",
        sum_col="Total Edits",
    ),
    _prefix_table(
        df,
        "Top 100 Tag Prefixes by Edits 2021 - Now",
        top_100_edits_2021_now,
        value_col="Edits",
        sum_col="Total Edits (2021 - Now)",
        since_year=2021,  # must match the `year >= 2021` cut-off used in tag_totals
    ),
]

util.show_tables(table_configs)

Rank,Tag Prefix,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,Total Contributors
1,created_by,64271,79548,94631,122416,127132,155904,163504,261946,318851,307995,281778,302771,292540,258556,273523,263199,218189,2340524
2,comment,38981,61602,76523,100702,96185,95195,110950,254853,312573,301664,275756,300447,290998,256925,271704,262001,217376,2167340
3,imagery_used,0,0,0,0,56447,125548,134093,148792,195614,216254,206481,245587,243206,209843,219588,216398,174108,1648031
4,locale,0,0,0,0,0,0,97179,148482,195195,215384,205222,250333,255078,224522,236992,234477,192225,1549158
5,host,0,0,0,0,0,0,97178,148481,194558,213957,203646,242552,240327,207088,216679,213305,171605,1505012
6,changesets_count,0,0,0,0,0,0,0,0,87312,213851,203704,242421,240027,205019,214397,206437,170699,1264173
7,review_requested,0,0,0,0,0,0,0,0,27653,64721,70171,89607,88709,68719,72617,70705,49066,517866
8,ideditor,0,0,0,0,0,0,0,0,20577,60617,58015,73159,64522,53378,53806,51293,40857,435143
9,hashtags,0,0,0,0,0,0,0,0,26592,53296,54135,87224,89990,59699,60736,54550,32919,433048
10,source,144,592,864,992,3385,17755,19981,20293,35191,52333,51976,52119,61802,62124,78860,129950,121133,416940

Rank,Tag Prefix,2021,2022,2023,2024,2025,Total Contributors (2021 - Now)
1,created_by,292540,258556,273523,263199,218189,939039
2,comment,290998,256925,271704,262001,217376,934933
3,locale,255078,224522,236992,234477,192225,834571
4,imagery_used,243206,209843,219588,216398,174108,793735
5,host,240327,207088,216679,213305,171605,788467
6,changesets_count,240027,205019,214397,206437,170699,777170
7,review_requested,88709,68719,72617,70705,49066,310496
8,source,61802,62124,78860,129950,121133,305410
9,hashtags,89990,59699,60736,54550,32919,255445
10,ideditor,64522,53378,53806,51293,40857,244136

Rank,Tag Prefix,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,Total Edits
1,created_by,437095538,629391888,667201899,731585439,705402715,868160737,865972850,883389644,989676920,1162437327,1233643407,1558439065,1530831493,1286891342,1329628032,1295792577,970616134,17146157007
2,comment,426991137,608593067,620251830,652061174,610062156,758528444,755680455,847855752,958691712,1139898224,1212186095,1536708310,1520258834,1275490244,1316777200,1286012411,964348466,16490395511
3,source,10622526,13408300,20040916,36142972,68633439,552644361,557710343,570273582,627895963,793509711,797788654,1014487296,963365101,828856857,833908255,904555509,685089369,9278933154
4,imagery_used,0,0,0,0,34482684,109306934,142220824,188327528,275993159,324724361,377522059,520604175,553484877,489057748,501088769,495356727,382489813,4394659658
5,locale,0,0,0,0,0,0,97896806,189399539,279322636,323329205,373636273,519398897,562247145,502879845,517180702,512170426,395159329,4272620803
6,host,0,0,0,0,0,0,97895925,188111073,274531981,307528125,339369031,486004271,545038143,481623481,492866210,482848960,370827134,4066644334
7,changesets_count,0,0,0,0,0,0,0,0,111051825,299407156,356087893,508453627,541971457,477756421,503172121,480945048,369036267,3647881815
8,hashtags,0,0,0,0,0,0,0,0,41831265,149094343,149824289,354181094,394893580,370419825,282910074,267899400,172060941,2183114811
9,version,0,7595955,92221053,146270098,133437723,78276371,60105498,42966897,33434566,25146865,20600382,17387455,5083208,2678888,2573846,2765120,2045096,672589021
10,build,0,7530377,92220246,146266904,133436693,77949781,60105777,42966280,33429209,25145618,20600208,17386499,5083243,2672918,2569504,2765121,2044935,672173313

Rank,Tag Prefix,2021,2022,2023,2024,2025,Total Edits (2021 - Now)
1,created_by,1530831493,1286891342,1329628032,1295792577,970616134,6413759578
2,comment,1520258834,1275490244,1316777200,1286012411,964348466,6362887155
3,source,963365101,828856857,833908255,904555509,685089369,4215775091
4,locale,562247145,502879845,517180702,512170426,395159329,2489637447
5,imagery_used,553484877,489057748,501088769,495356727,382489813,2421477934
6,host,545038143,481623481,492866210,482848960,370827134,2373203928
7,changesets_count,541971457,477756421,503172121,480945048,369036267,2372881314
8,hashtags,394893580,370419825,282910074,267899400,172060941,1488183820
9,resolved,112367672,102012064,100720604,99969833,76980149,492050322
10,warnings,108070388,103553349,97201454,91662003,64251717,464738911
