# Hashtags Analysis

Analysis of OpenStreetMap changesets that are tagged with hashtags, based on the `hashtags` column of the changeset data.

In [1]:
import duckdb
import util

# Run the shared setup from the project-local `util` module before any cell
# queries or plots. NOTE(review): what init() configures is defined in util,
# which is not visible from this notebook - confirm there.
util.init()

## Monthly Percentage of Edits and Contributors Using Hashtags

In [2]:
# Monthly share of contributors and edits that come from changesets carrying
# hashtags (hashtags IS NOT NULL).
#
# The hashtag-only and the overall aggregates are computed in ONE grouped scan
# of the parquet files by using FILTER clauses, instead of reading the whole
# dataset twice and joining the two results on (year, month).
df = duckdb.sql("""
WITH monthly AS (
    SELECT
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        COUNT(DISTINCT user_name) FILTER (WHERE hashtags IS NOT NULL) as contributors_with_hashtags,
        SUM(edit_count) FILTER (WHERE hashtags IS NOT NULL) as edits_with_hashtags,
        COUNT(DISTINCT user_name) as total_contributors,
        SUM(edit_count) as total_edits
    FROM '../changeset_data/year=*/month=*/*.parquet'
    GROUP BY year, month
)
SELECT
    months,
    -- COALESCE turns a NULL ratio (e.g. a month without any hashtag edits) into 0
    COALESCE(ROUND((contributors_with_hashtags * 100.0) / total_contributors, 2), 0) as "Percentage Contributors with Hashtags",
    COALESCE(ROUND((edits_with_hashtags * 100.0) / total_edits, 2), 0) as "Percentage Edits with Hashtags"
FROM monthly
ORDER BY year, month
""").df()

# One figure per metric; both read their series from the same result frame.
util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Percentage of Contributors Using Hashtags",
            label="Contributors",
            x_col="months",
            y_col="Percentage Contributors with Hashtags",
            y_unit_hover_template="%",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Percentage of Edits Using Hashtags",
            label="Edits",
            x_col="months",
            y_col="Percentage Edits with Hashtags",
            y_unit_hover_template="%",
            query_or_df=df,
        ),
    ]
)

## Monthly Top 10 Hashtags: Edits, Accumulated Edits, Contributors and Accumulated Contributors

In [3]:
# Monthly and accumulated edits / contributors for the 10 hashtags with the
# most edits overall.
#
# "Contributors Accumulated" is the running number of DISTINCT users that have
# used the hashtag so far: every user is counted once, in the first month they
# used that hashtag. Summing the monthly distinct counts instead would count a
# user again in every month they were active.
df = duckdb.sql("""
WITH hashtag_expanded AS (
    SELECT
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        user_name,
        edit_count,
        unnest(hashtags) as hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL
),
top_hashtags AS (
    SELECT hashtag
    FROM (
        SELECT
            hashtag,
            SUM(edit_count) as total_edits
        FROM hashtag_expanded
        GROUP BY hashtag
        ORDER BY total_edits DESC
        LIMIT 10
    )
),
top_hashtag_rows AS (
    SELECT *
    FROM hashtag_expanded
    WHERE hashtag IN (SELECT hashtag FROM top_hashtags)
),
user_first_month AS (
    -- zero-padded 'YYYY-MM' strings sort chronologically, so MIN() is the first month
    SELECT
        user_name,
        hashtag,
        MIN(months) as first_months
    FROM top_hashtag_rows
    GROUP BY user_name, hashtag
),
monthly_hashtag_data AS (
    SELECT
        thr.year,
        thr.month,
        thr.months,
        thr.hashtag,
        COUNT(DISTINCT thr.user_name) as "Contributors",
        COUNT(DISTINCT CASE WHEN ufm.first_months = thr.months THEN thr.user_name END) as "New Contributors",
        SUM(thr.edit_count) as "Edits"
    FROM top_hashtag_rows thr
    LEFT JOIN user_first_month ufm
        ON thr.user_name = ufm.user_name AND thr.hashtag = ufm.hashtag
    GROUP BY thr.year, thr.month, thr.months, thr.hashtag
)
SELECT
    months,
    hashtag,
    "Contributors",
    "Edits",
    SUM("New Contributors") OVER (PARTITION BY hashtag ORDER BY year, month) as "Contributors Accumulated",
    SUM("Edits") OVER (PARTITION BY hashtag ORDER BY year, month) as "Edits Accumulated"
FROM monthly_hashtag_data
ORDER BY year, month, hashtag
""").df()

# Four views of the same frame, one line per hashtag (group_col).
util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Edits by Top 10 Hashtags",
            label="Edits",
            x_col="months",
            y_col="Edits",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Edits by Top 10 Hashtags",
            label="Edits Accumulated",
            x_col="months",
            y_col="Edits Accumulated",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Contributors by Top 10 Hashtags",
            label="Contributors",
            x_col="months",
            y_col="Contributors",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Contributors by Top 10 Hashtags",
            label="Contributors Accumulated",
            x_col="months",
            y_col="Contributors Accumulated",
            group_col="hashtag",
            query_or_df=df,
        ),
    ]
)

## Top 100 Hashtags Yearly

In [4]:
# Yearly edits / contributors per hashtag, joined with each hashtag's all-time
# and 2021-onwards totals. The totals are repeated on every yearly row of a
# hashtag and are used below to pick the top 100 and as each table's sum column.
query = """
WITH hashtag_expanded AS (
    SELECT 
        year,
        user_name,
        edit_count,
        unnest(hashtags) as hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL
),
user_first_year AS (
    SELECT 
        user_name,
        hashtag,
        MIN(year) as first_year
    FROM hashtag_expanded
    GROUP BY user_name, hashtag
),
hashtag_totals AS (
    SELECT
        hashtag as "Hashtag",
        CAST(SUM(edit_count) as BIGINT) as total_edits_all_time,
        CAST(SUM(CASE WHEN year >= 2021 THEN edit_count ELSE 0 END) as BIGINT) as total_edits_2021_now,
        CAST(COUNT(DISTINCT user_name) as BIGINT) as total_contributors_all_time,
        CAST(COUNT(DISTINCT CASE WHEN year >= 2021 THEN user_name END) as BIGINT) as total_contributors_2021_now
    FROM hashtag_expanded
    GROUP BY hashtag
),
yearly_metrics AS (
    SELECT
        he.year,
        he.hashtag as "Hashtag",
        CAST(SUM(he.edit_count) as BIGINT) as "Edits",
        CAST(COUNT(DISTINCT he.user_name) as BIGINT) as "Contributors",
        CAST(COUNT(DISTINCT CASE WHEN ufy.first_year = he.year THEN he.user_name END) as BIGINT) as "New Contributors"
    FROM hashtag_expanded he
    LEFT JOIN user_first_year ufy 
        ON he.user_name = ufy.user_name AND he.hashtag = ufy.hashtag
    GROUP BY he.year, he.hashtag
)
SELECT 
    ym.year,
    ym."Hashtag",
    ym."Edits",
    ym."New Contributors",
    ym."Contributors",
    ht.total_edits_all_time as "Total Edits",
    ht.total_edits_2021_now as "Total Edits (2021 - Now)",
    ht.total_contributors_all_time as "Total Contributors",
    ht.total_contributors_2021_now as "Total Contributors (2021 - Now)"
FROM yearly_metrics ym
JOIN hashtag_totals ht
    ON ym."Hashtag" = ht."Hashtag"
ORDER BY year DESC, "Edits" DESC
"""
df = duckdb.sql(query).df()


def _top_100(total_col):
    """Return the 100 largest per-hashtag values of `total_col`, indexed by hashtag."""
    return df.groupby("Hashtag")[total_col].first().nlargest(100)


def _yearly_table(title, rows, value_col, sum_col):
    """Build a year-by-hashtag table config; only title, rows and metric columns vary."""
    return util.TableConfig(
        title=title,
        query_or_df=rows,
        x_axis_col="year",
        y_axis_col="Hashtag",
        value_col=value_col,
        center_columns=["Rank", "Hashtag"],
        sum_col=sum_col,
    )


top_100_contributors = _top_100("Total Contributors")
top_100_contributors_2021_now = _top_100("Total Contributors (2021 - Now)")
top_100_edits = _top_100("Total Edits")
top_100_edits_2021_now = _top_100("Total Edits (2021 - Now)")

# Row mask shared by the two "2021 - Now" tables.
since_2021 = df["year"] >= 2021

table_configs = [
    _yearly_table(
        "Top 100 Hashtags by Contributors",
        df[df["Hashtag"].isin(top_100_contributors.index)],
        value_col="Contributors",
        sum_col="Total Contributors",
    ),
    _yearly_table(
        "Top 100 Hashtags by Contributors 2021 - Now",
        df[(df["Hashtag"].isin(top_100_contributors_2021_now.index)) & since_2021],
        value_col="Contributors",
        sum_col="Total Contributors (2021 - Now)",
    ),
    _yearly_table(
        "Top 100 Hashtags by Edits",
        df[df["Hashtag"].isin(top_100_edits.index)],
        value_col="Edits",
        sum_col="Total Edits",
    ),
    _yearly_table(
        "Top 100 Hashtags by Edits 2021 - Now",
        df[(df["Hashtag"].isin(top_100_edits_2021_now.index)) & since_2021],
        value_col="Edits",
        sum_col="Total Edits (2021 - Now)",
    ),
]

util.show_tables(table_configs)

Rank,Hashtag,2017,2018,2019,2020,2021,2022,2023,2024,2025,Total Contributors
1,#missingmaps,12561,28330,26727,28770,35232,10913,12791,9616,4092,149661
2,#redcross,7849,8209,7332,19818,25801,3043,688,1071,217,67821
3,#msf,724,5128,8456,4899,5714,4988,6094,6319,2928,40368
4,#opencitieslac,0,0,0,0,9244,16658,9565,125,36,32844
5,#youthmappers,1601,2916,4288,5469,9179,4429,2870,2408,1411,30756
6,#indonesia,1,1622,2031,14365,14434,371,51,3,1,30597
7,#covid19,0,0,0,21990,6691,283,154,72,0,28071
8,#climatechange,0,0,4,8012,19957,462,0,160,7,27035
9,#accenture,548,1870,2085,4814,5614,1730,2065,1792,1328,20079
10,#bangladesh,3191,3011,161,675,7919,3644,186,521,41,18620

Rank,Hashtag,2021,2022,2023,2024,2025,Total Contributors (2021 - Now)
1,#missingmaps,35232,10913,12791,9616,4092,66666
2,#opencitieslac,9244,16658,9565,125,36,32844
3,#redcross,25801,3043,688,1071,217,29955
4,#msf,5714,4988,6094,6319,2928,23369
5,#climatechange,19957,462,0,160,7,20417
6,#youthmappers,9179,4429,2870,2408,1411,18358
7,#indonesia,14434,371,51,3,1,14701
8,#jpmc,4305,4828,2222,2532,18,12806
9,#bangladesh,7919,3644,186,521,41,11955
10,#accenture,5614,1730,2065,1792,1328,11539

Rank,Hashtag,2017,2018,2019,2020,2021,2022,2023,2024,2025,Total Edits
1,#missingmaps,17287000,57972037,48677395,46035057,55677193,41012814,34551063,33669385,12381643,347263587
2,#adt,0,0,0,94151846,65719545,19962618,13249704,9962239,5024456,208070408
3,#mapwithai,0,0,5474757,60335776,73648969,37870684,22784491,1841552,26547,201982776
4,#msf,1313114,7072065,10802075,6639937,11133143,20111327,17809753,21931008,9513608,106326030
5,#kaart,553738,2806898,19867429,25313042,14644067,16100486,11480043,7660211,4243172,102669086
6,#redcross,8954746,14580209,8882537,26819099,25701858,4284352,2308505,2289851,567602,94388759
7,#youthmappers,2527184,5353394,5937531,10523710,18290595,16253248,9107544,5291073,10035207,83319486
8,#india,12,865010,712643,14810650,34730951,23166080,3701630,848019,23509,78858504
9,#tanzania,3207761,4996902,1525349,12042077,19406138,9327963,15162106,483792,44448,66196536
10,#maproulette,108535,868900,4220553,5451770,7089845,10215278,11705825,14269086,8994171,62923963

Rank,Hashtag,2021,2022,2023,2024,2025,Total Edits (2021 - Now)
1,#missingmaps,55677193,41012814,34551063,33669385,12381643,177292098
2,#mapwithai,73648969,37870684,22784491,1841552,26547,136172243
3,#adt,65719545,19962618,13249704,9962239,5024456,113918562
4,#msf,11133143,20111327,17809753,21931008,9513608,80498839
5,#india,34730951,23166080,3701630,848019,23509,62470189
6,#youthmappers,18290595,16253248,9107544,5291073,10035207,58977667
7,#kaart,14644067,16100486,11480043,7660211,4243172,54127979
8,#maproulette,7089845,10215278,11705825,14269086,8994171,52274205
9,#tanzania,19406138,9327963,15162106,483792,44448,44424447
10,#indonesia,24547584,11769334,4332600,58697,1,40708216


## Geographical Distribution of Top 10 Hashtags

In [5]:
# Summed edit counts per (hashtag, mid_pos_x, mid_pos_y) for the ten hashtags
# with the most edits among changesets that have both position columns set.
geo_query = """
WITH hashtag_expanded AS (
    SELECT 
        mid_pos_x,
        mid_pos_y,
        edit_count,
        unnest(hashtags) as hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL 
      AND mid_pos_x IS NOT NULL 
      AND mid_pos_y IS NOT NULL
),
top_hashtags AS (
    SELECT hashtag
    FROM (
        SELECT
            hashtag,
            SUM(edit_count) as total_edits
        FROM hashtag_expanded
        GROUP BY hashtag
        ORDER BY total_edits DESC
        LIMIT 10
    )
),
hashtag_geo_data AS (
    SELECT 
        he.hashtag,
        he.mid_pos_x as x,
        he.mid_pos_y as y,
        SUM(he.edit_count) as z
    FROM hashtag_expanded he
    WHERE he.hashtag IN (SELECT hashtag FROM top_hashtags)
    GROUP BY he.hashtag, he.mid_pos_x, he.mid_pos_y
)
SELECT * FROM hashtag_geo_data
ORDER BY hashtag, x, y
"""
df = duckdb.sql(geo_query).df()

# One map per hashtag, in alphabetical order; the "#" is stripped from the title.
configs = [
    util.FigureConfig(
        title=f"Edit Distribution - {hashtag}".replace("#", ""),
        x_col="x",
        y_col="y",
        z_col="z",
        query_or_df=df[df["hashtag"] == hashtag],
        plot_type="map",
    )
    for hashtag in sorted(df["hashtag"].unique())
]

util.show_figure(configs, type="dropdown")