In [None]:
'''
File name: wiki_data_loader.ipynb
Authors: Charlotte Sertic, Arthur Nussbaumer, Carl Penning, Robin Debalme
Date created: 8/12/2022
Date last modified: 22/12/2022
Python version: 3.x.x
'''

### Data loader of pageviews of Covid-19 related articles for 175 languages.

Here we load the daily number of pageviews of Covid-19 related articles for 175 languages. The pageviews are retrieved with the function `wiki_to_df_extract`, and the resulting table is then frozen on disk as a gzip-compressed csv: `page_views_covid_related.csv.gz`.

In [1]:
import pandas as pd
import os
import gzip
import json
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import requests
import datetime
from scipy import stats
from helper import *
import urllib.parse
import time   

In [4]:
# Read the table of COVID-19 related articles
# (columns shown in the output below: page, project, url, wikilink).
COVID_RELATED_ARTICLES_PATH = "COVID_related_pages_project.csv"
df_covid_articles = pd.read_csv(filepath_or_buffer=COVID_RELATED_ARTICLES_PATH)
df_covid_articles

Unnamed: 0,page,project,url,wikilink
0,أثر جائحة فيروس كورونا على الدين 2019-20,ar.wikipedia,https://ar.wikipedia.org/wiki/%D8%A3%D8%AB%D8%...,[[ar:أثر جائحة فيروس كورونا على الدين 2019-20
1,Impact of the COVID-19 pandemic on religion,en.wikipedia,https://en.wikipedia.org/wiki/Impact_of_the_CO...,[[en:Impact of the COVID-19 pandemic on religion
2,Impacto en la religión de la pandemia de enfer...,es.wikipedia,https://es.wikipedia.org/wiki/Impacto_en_la_re...,[[es:Impacto en la religión de la pandemia de ...
3,Dampak pandemi koronavirus terhadap kegiatan k...,id.wikipedia,https://id.wikipedia.org/wiki/Dampak_pandemi_k...,[[id:Dampak pandemi koronavirus terhadap kegia...
4,코로나바이러스감염증-19 범유행이 종교에 준 영향,ko.wikipedia,https://ko.wikipedia.org/wiki/%EC%BD%94%EB%A1%...,[[ko:코로나바이러스감염증-19 범유행이 종교에 준 영향
...,...,...,...,...
5203,오만의 코로나바이러스감염증-19 범유행,ko.wikipedia,https://ko.wikipedia.org/wiki/%EC%98%A4%EB%A7%...,[[ko:오만의 코로나바이러스감염증-19 범유행
5204,Pandemia de COVID-19 em Omã,pt.wikipedia,https://pt.wikipedia.org/wiki/Pandemia_de_COVI...,[[pt:Pandemia de COVID-19 em Omã
5205,2020 ஓமானில் கொரோனாவைரசுத் தொற்று,ta.wikipedia,https://ta.wikipedia.org/wiki/2020_%E0%AE%93%E...,[[ta:2020 ஓமானில் கொரோனாவைரசுத் தொற்று
5206,Umman'da COVID-19 pandemisi,tr.wikipedia,https://tr.wikipedia.org/wiki/Umman%27da_COVID...,[[tr:Umman'da COVID-19 pandemisi


In [3]:
# Language codes are the part of the project name before the first '.',
# e.g. 'en' for 'en.wikipedia'; duplicates are dropped so each language is fetched once.
df_lang = df_covid_articles['project'].apply(lambda project: project.split('.')[0]).drop_duplicates()

# Pageviews of the covid related articles from 01-01-2020 to 31-07-2022, one frame per language.
# The 'views' column of each frame is renamed to the language code so that every
# language ends up as its own column in the combined table.
per_language_views = [
    wiki_to_df_extract(lang, '20200101', '20220731', df_covid_articles).rename({'views': lang}, axis='columns')
    for lang in df_lang
]

# Concatenate once instead of growing the frame inside the loop: repeated
# pd.concat copies the whole accumulated table at every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
pageview_df = pd.concat(per_language_views, axis=1) if per_language_views else pd.DataFrame()

In [4]:
# Convert the index to dates: only the first 8 characters (YYYYMMDD) of each
# index label are parsed. The whole index is converted in one vectorized call
# rather than inserting a temporary 'date' column into the wide frame and
# parsing it element by element.
date_index = pd.to_datetime(pageview_df.index.str[:8], format='%Y%m%d').rename('date')
pageview_df = pageview_df.set_index(date_index)
#full data with all languages
pageview_df.head()

  pageview_df['date'] = pageview_df.index


Unnamed: 0_level_0,ar,en,es,id,ko,pt,zh,de,he,ru,...,io,li,mi,mzn,nds-nl,nrm,szy,tet,tt,wa
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,108,4509,341,31,81,79,4821,2747,58,502,...,,,,,,,,,,
2020-01-02,160,6780,477,46,177,86,11784,3699,40,511,...,,,,,,,,,,
2020-01-03,90,6524,553,49,298,109,7161,1473,132,852,...,,,,,,,,,,
2020-01-04,130,7220,506,145,215,101,7615,1131,102,1821,...,,,,,,,,,,
2020-01-05,134,8016,464,112,192,93,8165,1077,89,1925,...,,,,0.0,,,,,,


In [6]:
# Freeze the data on disk as a gzip-compressed csv file (the 'date' index is written too).
# 'archive_name' is only meaningful for zip archives, so the gzip options carry the
# method alone, and the dict is actually handed to to_csv instead of being left unused.
compression_opts = dict(method='gzip')
pageview_df.to_csv('page_views_covid_related.csv.gz', index=True, compression=compression_opts)


In [2]:
# Mapping from a country to the Wikipedia language code considered "its own" language.
# Selection rules:
#   - the language is spoken by at least 75% of the population of the country
#   - the country's population speaking the language represents at least 75%
#     of the total population using the language
# NOTE(review): several keys are misspelled or are not country names
# ("Bostwana", "Cambogia", "Malaysian", "Czeck Republic", "German", "Netherland").
# They are kept verbatim because other code may look them up -- confirm before renaming.
country_own_lang = {
    "Italy": "it",
    "Russia": "ru",
    "China": "zh",
    "Albania": "sq",
    "Bangladesh": "bn",
    "Bostwana": "tn",
    "Cambogia": "km",
    "Croatia": "hr",
    "Greece": "el",
    "Sweden": "sv",
    "Finland": "fi",
    "Norway": "no",
    "Malaysian": "ms",
    "Israel": "he",
    "Lithuania": "lt",
    "Serbia": "sr",
    "Slovakia": "sk",
    "Slovenia": "sl",
    "Turkey": "tr",
    "Vietnam": "vi",
    "Bulgaria": "bg",
    "Czeck Republic": "cs",
    "Denmark": "da",
    "Georgia": "ka",
    "German": "de",
    "Hungary": "hu",
    "Iceland": "is",
    "Japan": "ja",
    "Kazakhstan": "kk",
    "South Korea": "ko",
    "Kyrgyzstan": "ky",
    "Netherland": "nl",
    "Poland": "pl",
    "Romania": "ro",
    "Tajikistan": "tg",
    "Thailand": "th",
    "Azerbaijan": "az",
    "Mongolia": "mn",
}
 

In [3]:
# Reload the frozen pageviews from disk; 'date' comes back as a regular column here.
pageview_df = pd.read_csv(filepath_or_buffer="page_views_covid_related.csv.gz")

In [5]:
# Restrict the pageviews to the languages listed in country_own_lang,
# using the 'date' column as index (columns follow the order of the mapping).
pageview_df_imp_country = pageview_df.set_index('date').loc[:, list(country_own_lang.values())]
pageview_df_imp_country.head()

Unnamed: 0_level_0,it,ru,zh,sq,bn,tn,km,hr,el,sv,...,kk,ko,ky,nl,pl,ro,tg,th,az,mn
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,252,502,4821,0.0,,,,10,14,56,...,0.0,81,,131,357,17,,4,0,
2020-01-02,419,511,11784,,,,,25,11,56,...,0.0,177,,137,225,24,,21,2,
2020-01-03,403,852,7161,0.0,,,,19,14,67,...,2.0,298,,139,258,32,,19,0,
2020-01-04,767,1821,7615,0.0,,,,31,8,77,...,0.0,215,,172,215,25,,30,0,
2020-01-05,1163,1925,8165,0.0,,,,16,16,86,...,0.0,192,,184,280,53,,31,0,
