# Region coverage on Wikipedia over time
This notebook takes the current groundtruth data about the regions associated with each Wikidata item, maps those items to Wikipedia articles and the months in which they were created, and generates data on the representation of different regions, by wiki and by month, from the start of Wikipedia to today.

Caveats:
* I ignore articles that were deleted. For most wikis, this should have minimal impact. For others, like Swedish Wikipedia, which has deleted a large number of geographic articles, this might affect the results in a noticeable way. Tracking deleted articles as well would be a lot more work, would still be imperfect, and would not clearly give a better result.

In [5]:
import gzip

import wmfdata

In [6]:
# Start the Spark session that every later `spark.sql(...)` call in this notebook uses.
spark = wmfdata.spark.create_session(
    type='yarn-regular',  # local, yarn-regular, yarn-large
    app_name='pyspark regular; regions; isaacj',
)

SPARK_HOME: /usr/lib/spark3
Using Hadoop client lib jars at 3.2.0, provided by Spark.
PYSPARK_PYTHON=/opt/conda-analytics/bin/python3


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/26 17:18:42 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/01/26 17:18:43 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001.
23/01/26 17:18:43 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002.
23/01/26 17:18:43 WARN Utils: Service 'sparkDriver' could not bind on port 12002. Attempting port 12003.
23/01/26 17:18:43 WARN Utils: Service 'sparkDriver' could not bind on port 12003. Attempting port 12004.
23/01/26 17:18:43 WARN Utils: Service 'sparkDriver' could not bind on port 12004. Attempting port 12005.
23/01/26 17:18:43 WARN Utils: Service 'sparkDriver' could not bind on port 12005. Attempting port 12006.
23/01/26 17:18:43 WARN Utils: Service 'spar

## Parameters

In [7]:
# Groundtruth QID -> country table and the snapshot partition of it to read.
initial_gt_snapshot = '2023-01-02'
initial_gt_table = 'isaacj.qid_to_country'

# Snapshot partitions used for wmf.mediawiki_page_history (page creation dates)
# and wmf.wikidata_item_page_link (page ID -> Wikidata item) respectively.
mediawiki_snapshot = '2022-12'
wikidata_snapshot = '2023-01-02'

In [8]:
# Peek at a handful of groundtruth rows for the chosen snapshot (untruncated columns).
spark.sql(
    f'SELECT * FROM {initial_gt_table} where snapshot = "{initial_gt_snapshot}" LIMIT 10'
).show(n=50, truncate=False)

23/01/26 17:18:55 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
                                                                                

+----------+--------+-------+----------+
|qid       |property|country|snapshot  |
+----------+--------+-------+----------+
|Q133615   |P625    |Angola |2023-01-02|
|Q1200523  |P625    |Angola |2023-01-02|
|Q15632867 |P625    |Angola |2023-01-02|
|Q98079293 |P625    |Angola |2023-01-02|
|Q796583   |P625    |Angola |2023-01-02|
|Q3073442  |P625    |Angola |2023-01-02|
|Q110529257|P625    |Angola |2023-01-02|
|Q3696650  |P625    |Angola |2023-01-02|
|Q17461405 |P625    |Angola |2023-01-02|
|Q386051   |P625    |Angola |2023-01-02|
+----------+--------+-------+----------+



In [12]:
# Toggles:
#  * print_for_hive: print the query flattened onto one line (for pasting into Hive)
#    instead of printing it as written
#  * do_execute: actually run the query on Spark and write the result to HDFS
print_for_hive = False
do_execute = True

# CTEs:
#  * wikis: limit to just Wikipedia wikis
#  * regions: get mapping of QID to country (a QID can be associated with 0+ countries)
#  * page_creation_dates: get creation months for existing articles in each wiki
#  * pageid_to_qid: mapping of page ID to Wikidata ID for joining in country data
#  * article_country_creation: join all data for counts of articles created per wiki/region/month
#  * SELECT...: add in cumulative sum of articles that existed for each geo by month

query = f"""
WITH wikis AS (
    SELECT DISTINCT
      database_code
    FROM canonical_data.wikis
    WHERE
      database_group = 'wikipedia'
      AND status = 'open'
      AND visibility = 'public'
      AND editability = 'public'
),
regions AS (
    SELECT DISTINCT
      qid,
      country
    FROM {initial_gt_table}
    WHERE
      snapshot = '{initial_gt_snapshot}'
),
page_creation_dates AS (
    SELECT DISTINCT
      wiki_db,
      page_id,
      CONCAT(YEAR(page_creation_timestamp), "-", LPAD(MONTH(page_creation_timestamp), 2, "0")) AS creation_month
    FROM wmf.mediawiki_page_history mph
    INNER JOIN wikis w
      ON (mph.wiki_db = w.database_code)
    WHERE
      snapshot = '{mediawiki_snapshot}'
      AND page_namespace = 0
      AND NOT page_is_deleted
      AND NOT page_is_redirect
      AND end_timestamp IS NULL
      AND page_creation_timestamp IS NOT NULL
),
pageid_to_qid AS (
    SELECT
      wiki_db,
      page_id,
      item_id
    FROM wmf.wikidata_item_page_link wipl
    INNER JOIN wikis w
      ON (wipl.wiki_db = w.database_code)
    WHERE
      snapshot = '{wikidata_snapshot}'
      AND page_namespace = 0
),
article_country_creation AS (
    SELECT
      pcd.wiki_db,
      creation_month,
      COALESCE(country, 'non-geo') AS geo,
      COUNT(1) AS num_articles
    FROM page_creation_dates pcd
    INNER JOIN pageid_to_qid ptq
      ON (pcd.wiki_db = ptq.wiki_db
          AND pcd.page_id = ptq.page_id)
    LEFT JOIN regions r
      ON (ptq.item_id = r.qid)
    GROUP BY
      pcd.wiki_db,
      creation_month,
      geo
)
SELECT
  wiki_db,
  creation_month,
  geo,
  num_articles AS num_created,
  SUM(num_articles) OVER w as total_articles
FROM article_country_creation
WINDOW w AS (PARTITION BY wiki_db, geo ORDER BY creation_month ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
ORDER BY
  wiki_db ASC,
  geo ASC,
  creation_month ASC
"""

if print_for_hive:
    # Collapse every run of whitespace (newlines and indentation) into a single
    # space so the query fits on one line. Uses str.split/join rather than the
    # `re` module, which this notebook never imports.
    print(' '.join(query.split()))
else:
    print(query)

if do_execute:
    result = spark.sql(query)
    # coalesce(1) so the result lands in a single gzipped, tab-separated part file.
    # NOTE(review): no save mode is passed, so this presumably fails if the output
    # path already exists - remove the old directory before re-running; confirm.
    result.coalesce(1).write.csv(path="/user/isaacj/wiki-month-geo-counts", compression="gzip", header=True, sep="\t")
    


WITH wikis AS (
    SELECT DISTINCT
      database_code
    FROM canonical_data.wikis
    WHERE
      database_group = 'wikipedia'
      AND status = 'open'
      AND visibility = 'public'
      AND editability = 'public'
),
regions AS (
    SELECT DISTINCT
      qid,
      country
    FROM isaacj.qid_to_country
    WHERE
      snapshot = '2023-01-02'
),
page_creation_dates AS (
    SELECT DISTINCT
      wiki_db,
      page_id,
      CONCAT(YEAR(page_creation_timestamp), "-", LPAD(MONTH(page_creation_timestamp), 2, "0")) AS creation_month
    FROM wmf.mediawiki_page_history mph
    INNER JOIN wikis w
      ON (mph.wiki_db = w.database_code)
    WHERE
      snapshot = '2022-12'
      AND page_namespace = 0
      AND NOT page_is_deleted
      AND NOT page_is_redirect
      AND end_timestamp IS NULL
      AND page_creation_timestamp IS NOT NULL
),
pageid_to_qid AS (
    SELECT
      wiki_db,
      page_id,
      item_id
    FROM wmf.wikidata_item_page_link wipl
    INNER JOIN wikis w
   

                                                                                

In [13]:
# List the contents of the HDFS output directory.
# NOTE(review): the path here is relative, while the write above used
# /user/isaacj/wiki-month-geo-counts - presumably the same directory; confirm.
!hdfs dfs -ls wiki-month-geo-counts

Found 2 items
-rw-r-----   3 isaacj isaacj          0 2023-01-26 17:21 wiki-month-geo-counts/_SUCCESS
-rw-r-----   3 isaacj isaacj    9735099 2023-01-26 17:21 wiki-month-geo-counts/part-00000-1bfd23f2-86c6-4c8e-bfea-8bb67efb3ebb-c000.csv.gz


In [16]:
!hdfs dfs -copyToLocal wiki-month-geo-counts/part-00000-1bfd23f2-86c6-4c8e-bfea-8bb67efb3ebb-c000.csv.gz wiki-month-geo-counts.tsv.gz

In [17]:
# Show the header and first few rows of the local copy of the raw counts.
!zless wiki-month-geo-counts.tsv.gz | head

wiki_db	creation_month	geo	num_created	total_articles
abwiki	2012-01	Afghanistan	1	1
abwiki	2017-10	Afghanistan	5	6
abwiki	2019-07	Afghanistan	2	8
abwiki	2021-10	Afghanistan	1	9
abwiki	2010-06	Albania	2	2
abwiki	2010-09	Albania	1	3
abwiki	2013-11	Albania	1	4
abwiki	2016-02	Albania	1	5
abwiki	2016-08	Albania	1	6


In [22]:
# Impute missing months
# e.g., if 1 article created for a country/wiki in 2001-05
# and then the next created in 2001-07, then 2001-06 should be added
# with 0 created articles but 1 existing article.
# After a wiki/geo's last observed month, rows are also added through the final
# month in `all_months`, carrying the last cumulative total forward.

# Every month from 2001-01 through 2022-12 in chronological order, plus a
# lookup from month string to its position in that list.
all_months = [f'{year}-{month:02d}' for year in range(2001, 2023) for month in range(1, 13)]
all_months_map = {month: idx for idx, month in enumerate(all_months)}


def _write_gap_rows(fout, wiki_db, geo, total_articles, start_idx, stop_idx):
    """Write one zero-creation row per month in all_months[start_idx:stop_idx].

    Each row has num_created = 0 and repeats `total_articles`, the cumulative
    count as of the last month in which the wiki/geo had any articles created.
    """
    for midx in range(start_idx, stop_idx):
        fout.write('\t'.join((wiki_db, all_months[midx], geo, '0', str(total_articles))) + '\n')


# State for the (wiki, geo) group currently being processed. The loop relies on
# the input rows for each wiki/geo being contiguous and in ascending month order.
prev_wiki = None
prev_geo = None
prev_month_idx = None
prev_total_articles = 0
with gzip.open('wiki-month-geo-counts.tsv.gz', 'rt', encoding='utf-8') as fin, \
        gzip.open('wiki-month-geo-counts-imputed.tsv.gz', 'wt', encoding='utf-8') as fout:
    fout.write(next(fin))  # header
    for line in fin:
        wiki_db, creation_month, geo, num_created, total_articles = line.strip().split('\t')
        # 2023-01 falls after the last month in all_months (2022-12), so it is dropped.
        # NOTE(review): presumably a partial month in the snapshot - confirm.
        if creation_month == '2023-01':
            continue
        curr_month_idx = all_months_map[creation_month]
        if wiki_db == prev_wiki and prev_geo == geo:
            # Same group: fill the gap between the previous observed month and this one.
            _write_gap_rows(fout, wiki_db, geo, prev_total_articles, prev_month_idx + 1, curr_month_idx)
        elif prev_wiki is not None:
            # New group: first pad the previous group out to the final month.
            _write_gap_rows(fout, prev_wiki, prev_geo, prev_total_articles, prev_month_idx + 1, len(all_months))
        fout.write(line)
        prev_wiki = wiki_db
        prev_geo = geo
        prev_month_idx = curr_month_idx
        prev_total_articles = total_articles
    # The last group in the file has no following group to trigger its padding,
    # so pad it out to the final month here.
    if prev_wiki is not None:
        _write_gap_rows(fout, prev_wiki, prev_geo, prev_total_articles, prev_month_idx + 1, len(all_months))

In [23]:
# Show the first rows of the raw (un-imputed) counts again, for comparison with the imputed file.
!zless wiki-month-geo-counts.tsv.gz | head 

wiki_db	creation_month	geo	num_created	total_articles
abwiki	2012-01	Afghanistan	1	1
abwiki	2017-10	Afghanistan	5	6
abwiki	2019-07	Afghanistan	2	8
abwiki	2021-10	Afghanistan	1	9
abwiki	2010-06	Albania	2	2
abwiki	2010-09	Albania	1	3
abwiki	2013-11	Albania	1	4
abwiki	2016-02	Albania	1	5
abwiki	2016-08	Albania	1	6


In [24]:
# Show the first 150 lines of the imputed file to check that the gaps between
# observed months were filled with zero-creation rows.
!zless wiki-month-geo-counts-imputed.tsv.gz | head -n150

wiki_db	creation_month	geo	num_created	total_articles
abwiki	2012-01	Afghanistan	1	1
abwiki	2012-02	Afghanistan	0	1
abwiki	2012-03	Afghanistan	0	1
abwiki	2012-04	Afghanistan	0	1
abwiki	2012-05	Afghanistan	0	1
abwiki	2012-06	Afghanistan	0	1
abwiki	2012-07	Afghanistan	0	1
abwiki	2012-08	Afghanistan	0	1
abwiki	2012-09	Afghanistan	0	1
abwiki	2012-10	Afghanistan	0	1
abwiki	2012-11	Afghanistan	0	1
abwiki	2012-12	Afghanistan	0	1
abwiki	2013-01	Afghanistan	0	1
abwiki	2013-02	Afghanistan	0	1
abwiki	2013-03	Afghanistan	0	1
abwiki	2013-04	Afghanistan	0	1
abwiki	2013-05	Afghanistan	0	1
abwiki	2013-06	Afghanistan	0	1
abwiki	2013-07	Afghanistan	0	1
abwiki	2013-08	Afghanistan	0	1
abwiki	2013-09	Afghanistan	0	1
abwiki	2013-10	Afghanistan	0	1
abwiki	2013-11	Afghanistan	0	1
abwiki	2013-12	Afghanistan	0	1
abwiki	2014-01	Afghanistan	0	1
abwiki	2014-02	Afghanistan	0	1
abwiki	2014-03	Afghanistan	0	1
abwiki	2014-04	Afghanistan	0	1
abwiki	2014-05	Afghanistan	0	1
abwiki	2014-06	Afghanistan	0	1
abwiki	2014-07	A