In [11]:
import pandas as pd

# URL template for the monthly geoeditors dumps; "{month}" is replaced by a "YYYY-MM" label.
url = "https://dumps.wikimedia.org/other/geoeditors/geoeditors-monthly-{month}.tsv"

# Every "YYYY-MM" label from 2018-01 through 2024-12.
months = [
    f"{year}-{month_number:02}"
    for year in range(2018, 2025)
    for month_number in range(1, 13)
]

# Column names are passed explicitly, so the first line of each file is read as data.
cols = ["wiki", "country", "activity_level", "lower_bound", "upper_bound"]

# Read one file per month and tag its rows with the month label.
# A month that cannot be read is reported and skipped rather than aborting the loop.
dfs = []
for month in months:
    try:
        df = pd.read_csv(url.format(month=month), sep="\t", names=cols).assign(month=month)
        dfs.append(df)
    except Exception as e:
        print(f"Error reading {month}: {e}")


In [12]:
# Stack all monthly frames into one table with a fresh 0..n-1 index.
# The loading loop skips months that fail to download, so `dfs` may be empty;
# fail with an actionable message instead of pandas' "No objects to concatenate".
if not dfs:
    raise ValueError(
        "No monthly geoeditors files were loaded; "
        "check the errors printed by the loading cell."
    )
df_full = pd.concat(dfs, ignore_index=True)

In [54]:
# Keep only eswiki rows, sum `upper_bound` per (month, country), then express each
# country's sum as a fraction of that month's total across all countries.
df_res = (
    df_full.loc[df_full["wiki"] == "eswiki"]
    .groupby(["month", "country"], as_index=False)[["upper_bound"]]
    .sum()
    .assign(
        distribution=lambda totals: totals["upper_bound"]
        / totals.groupby("month")["upper_bound"].transform("sum")
    )
)

In [55]:
# Reshape to one row per month and one column per country;
# (month, country) pairs that are absent from df_res become 0.
df_distrib = (
    df_res.set_index(["month", "country"])["distribution"]
    .unstack("country")
    .fillna(0)
)
df_distrib

country,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Aruba,Australia,Austria,...,Uganda,Ukraine,United Kingdom,United States,Uruguay,Vanuatu,Vatican City,Zambia,Zimbabwe,unknown
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01,0.000907,0.000000,0.000907,0.000000,0.0,0.127949,0.000000,0.000000,0.001815,0.001815,...,0.000000,0.001815,0.005445,0.025408,0.019964,0.0,0.0,0.0,0.0,0.000907
2018-02,0.001015,0.000000,0.001015,0.000000,0.0,0.119797,0.000000,0.000000,0.003046,0.002030,...,0.000000,0.002030,0.005076,0.026396,0.019289,0.0,0.0,0.0,0.0,0.001015
2018-03,0.000000,0.000933,0.000933,0.000000,0.0,0.119403,0.000933,0.000000,0.002799,0.000933,...,0.000000,0.001866,0.004664,0.027052,0.023321,0.0,0.0,0.0,0.0,0.000933
2018-04,0.000000,0.000000,0.001754,0.000000,0.0,0.116667,0.000877,0.000000,0.001754,0.001754,...,0.000000,0.001754,0.005263,0.024561,0.028947,0.0,0.0,0.0,0.0,0.001754
2018-05,0.000000,0.000000,0.000831,0.000000,0.0,0.124585,0.000000,0.000000,0.001661,0.001661,...,0.000000,0.000831,0.004153,0.023256,0.020764,0.0,0.0,0.0,0.0,0.001661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08,0.000000,0.001159,0.001159,0.000000,0.0,0.132097,0.001159,0.001159,0.002317,0.001159,...,0.001159,0.001159,0.005794,0.033604,0.026651,0.0,0.0,0.0,0.0,0.000000
2024-09,0.001167,0.001167,0.001167,0.001167,0.0,0.128355,0.000000,0.001167,0.002334,0.001167,...,0.000000,0.001167,0.003501,0.028005,0.023337,0.0,0.0,0.0,0.0,0.000000
2024-10,0.002203,0.001101,0.001101,0.000000,0.0,0.123348,0.001101,0.000000,0.001101,0.001101,...,0.000000,0.001101,0.004405,0.026432,0.019824,0.0,0.0,0.0,0.0,0.000000
2024-11,0.001168,0.001168,0.001168,0.000000,0.0,0.120327,0.001168,0.001168,0.001168,0.002336,...,0.000000,0.001168,0.004673,0.026869,0.022196,0.0,0.0,0.0,0.0,0.000000


In [60]:
# Summed `upper_bound` per country with months as rows; missing (month, country) pairs count as 0.
df_counts = (
    df_res
    .pivot(index="month", columns="country", values="upper_bound")
    .fillna(0)
)
# Mean over all months for each country, in a single-column frame.
df_final = df_counts.mean().to_frame(name="mean")
# Show the 20 countries with the highest mean, rounded to one decimal.
df_final.sort_values(by="mean", ascending=False).iloc[:20].round(1)

Unnamed: 0_level_0,mean
country,Unnamed: 1_level_1
Spain,3062.6
Argentina,1416.9
Mexico,1353.1
Chile,861.1
Colombia,803.8
Peru,684.5
United States,255.0
Ecuador,229.4
Uruguay,228.0
Costa Rica,127.5


In [61]:
# Average each country's monthly share over all months, as a percentage rounded to one decimal.
df_final = (df_distrib.mean() * 100).round(1).to_frame(name="avg_distribution")
# Show the 20 countries with the largest average share.
df_final.sort_values(by="avg_distribution", ascending=False).iloc[:20]

Unnamed: 0_level_0,avg_distribution
country,Unnamed: 1_level_1
Spain,28.1
Argentina,13.0
Mexico,12.4
Chile,8.0
Colombia,7.4
Peru,6.3
United States,2.4
Uruguay,2.1
Ecuador,2.1
Costa Rica,1.2


In [None]:
# Disabled (commented-out) cell: show shares as %, grouping small values (< 0.01) under "Others":
# df_final = df_res.copy()
# df_final["upper_bound"] = df_final["upper_bound"] / df_final["upper_bound"].sum()
# df_final.loc[df_final["upper_bound"] < 0.01, "country"] = "Others"
# df_final = df_final.groupby("country")["upper_bound"].sum().sort_values(ascending=False).reset_index()
# assert df_final["upper_bound"].sum() == 1.0
# df_final["upper_bound"] = df_final["upper_bound"].map("{:.2%}".format)
# df_final.head(20)

Unnamed: 0,country,upper_bound
0,Spain,28.18%
1,Others,15.93%
2,Argentina,13.04%
3,Mexico,12.45%
4,Chile,7.92%
5,Colombia,7.40%
6,Peru,6.30%
7,United States,2.35%
8,Ecuador,2.11%
9,Uruguay,2.10%
