# Wikipedia Bias

In [263]:
import pandas as pd
import requests
import json
import csv

# Load the page data CSV (t1) and the WPDS 2018 population CSV (t2).
# Raw string literals keep the Windows backslashes literal; in a normal string
# sequences such as '\O' or '\D' are invalid escapes and trigger a
# SyntaxWarning on recent Python versions. The resulting paths are unchanged.
t1 = pd.read_csv(r'D:\Own Stuff\Acads\DATA 512\country\country\data\page_data.csv')
t2 = pd.read_csv(r'D:\Own Stuff\Acads\DATA 512\WPDS_2018_data.csv')

# Removing articles starting with 'Template:.....'
t1 = t1[~t1['page'].str.startswith("Template:")]
t1.columns

Index(['page', 'country', 'rev_id'], dtype='object')

In [264]:
# Preview the first rows of the page data after the 'Template:' rows were removed.
t1.head()

Unnamed: 0,page,country,rev_id
1,Bir I of Kanem,Chad,355319463
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
12,Yos Por,Cambodia,393822005
23,Julius Gregr,Czech Republic,395521877
24,Edvard Gregr,Czech Republic,395526568


In [265]:
# Preview the first rows of the population table.
t2.head()

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


### Making ORES requests

In [266]:
# Caller-identification headers (User-Agent and From) handed to get_ores_data.
headers = {'User-Agent' : 'https://github.com/ebm94', 'From' : 'edwin100@uw.edu'}

def get_ores_data(revision_ids, headers):
    """Request ORES ``wp10`` scores for a batch of revision IDs.

    Parameters
    ----------
    revision_ids : iterable
        Revision IDs to score. Each one is converted with ``str`` and the
        whole batch is joined with ``|`` into a single request URL.
    headers : dict
        HTTP headers sent with the request (e.g. ``User-Agent`` / ``From``).

    Returns
    -------
    str
        The decoded JSON response re-serialised with ``json.dumps``
        (4-space indent, sorted keys).
    """
    # Define the endpoint
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'

    # Specify the parameters - all the revision IDs joined together, separated by | marks.
    params = {'project' : 'enwiki',
              'model'   : 'wp10',
              'revids'  : '|'.join(str(x) for x in revision_ids)
              }
    # Pass `headers` through to the request; previously the argument was
    # accepted but never sent, so the caller was not identified to the API.
    api_call = requests.get(endpoint.format(**params), headers=headers)
    response = api_call.json()
    return json.dumps(response, indent=4, sort_keys=True)

Extracting predictions in batches of 100 rev_ids per request.

In [119]:
# Score every revision, sending the IDs to ORES in batches of 100.
revs = list(t1.rev_id)
preds = []
for start in range(0, len(revs), 100):
    batch = revs[start:start + 100]
    # get_ores_data returns a JSON string. Parse it with json.loads instead of
    # eval: eval executes arbitrary text from the network and fails on JSON
    # literals such as true / false / null.
    scores = json.loads(get_ores_data(batch, headers))["enwiki"]["scores"]
    for rev_id in batch:
        try:
            preds.append(scores[str(rev_id)]["wp10"]["score"]["prediction"])
        except (KeyError, TypeError):
            # No prediction could be read for this revision (presumably ORES
            # returned an error entry instead of a score - verify). Keep a
            # placeholder so preds stays positionally aligned with revs.
            preds.append('NA')

In [267]:
# Revision IDs whose prediction is the 'NA' placeholder.
# revs and preds are positionally aligned, so pair them directly; this also
# avoids np.arange, which needs a numpy import the notebook does not have.
revs_na = [rev_id for rev_id, pred in zip(revs, preds) if pred == 'NA']

Getting dataframe with quality predictions for the rev_id's.

In [268]:
# One row per revision: the revision ID alongside its predicted quality class.
preds_df = pd.DataFrame(dict(rev_id=revs, prediction=preds))

In [269]:
# Attach the predicted quality to each page row by joining on rev_id.
# NOTE(review): this reassigns t1, so re-running the cell merges the
# predictions a second time - run it once per fresh kernel.
t1 = t1.merge(preds_df, on='rev_id')

Bringing in population

In [270]:
# Join article rows with the population table on the country name; rows whose
# country has no exact match in t2.Geography are dropped by the default inner join.
merged_df = pd.merge(t1, t2, left_on='country', right_on='Geography')
# Use the `columns=` keyword: the positional axis form drop('Geography', 1)
# raises TypeError on pandas >= 2.0, where `axis` is keyword-only.
merged_df = merged_df.drop(columns='Geography')
merged_df = merged_df.rename(columns={"page": "article_name", "rev_id": "revision_id", "prediction":"article_quality", "Population mid-2018 (millions)":"population"})
# Raw string keeps the backslashes literal (same path, no invalid-escape warning).
merged_df.to_csv(r'D:\Own Stuff\Acads\DATA 512\country\country\data\wp_wpds_politicians_by_country.csv')

Extracting the countries that appear in only one of the two datasets (the Wikipedia page data or the population data).

In [271]:
# Names present in exactly one of the two sources (symmetric difference).
# Plain sets replace np.unique, which needs a numpy import the notebook lacks.
wiki_countries = set(t1.country)
wpds_geographies = set(t2.Geography)
# Sort so the saved file has a reproducible row order (set iteration order is
# arbitrary); key=str keeps the sort safe if a non-string value is present.
nomatch = pd.DataFrame({'country': sorted(wiki_countries ^ wpds_geographies, key=str)})
# Raw string keeps the backslashes literal (same path, no invalid-escape warning).
nomatch.to_csv(r'D:\Own Stuff\Acads\DATA 512\country\country\data\wp_wpds_countries-no_match.csv')

### Analysis

Converting the population column from 'object' (strings with thousands separators) to float.

In [272]:
# Strip thousands separators and convert population to float.
# Casting to str first keeps the cell re-runnable: the .str accessor raises
# AttributeError if the column is already numeric (e.g. on a second run).
merged_df["population"] = (
    merged_df["population"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype(float)
)

In [273]:
# Per-country summary: article count, population, coverage ratio and the
# share of articles predicted as high quality (FA or GA).
by_country = merged_df.groupby('country')
agg_df = by_country['revision_id'].count().to_frame('articles')
agg_df['population'] = by_country['population'].max()
agg_df['articles-per-population'] = agg_df['articles'] / agg_df['population']

# Countries with no FA/GA article are absent from this count, so they get NaN
# here and in the derived ratio below.
is_high_quality = merged_df['article_quality'].isin(['FA', 'GA'])
agg_df['high-quality-articles-count'] = (
    merged_df[is_high_quality].groupby('country')['revision_id'].count()
)
agg_df['high-quality-articles-per'] = agg_df['high-quality-articles-count'] / agg_df['articles']

### Top 10 countries by coverage

In [274]:
# Ten countries with the highest articles-per-population ratio.
agg_df.sort_values('articles-per-population', ascending=False).head(10)

Unnamed: 0_level_0,articles,population,articles-per-population,high-quality-articles-count,high-quality-articles-per
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Tuvalu,54,0.01,5400.0,5.0,0.092593
Nauru,52,0.01,5200.0,,
San Marino,81,0.03,2700.0,,
Monaco,40,0.04,1000.0,,
Liechtenstein,28,0.04,700.0,,
Tonga,63,0.1,630.0,,
Marshall Islands,37,0.06,616.666667,,
Iceland,202,0.4,505.0,2.0,0.009901
Andorra,34,0.08,425.0,,
Grenada,36,0.1,360.0,1.0,0.027778


### Bottom 10 countries by coverage

In [275]:
# Sorted descending, so the last ten rows are the lowest articles-per-population ratios.
# NOTE(review): sort_values places NaN last by default, so any NaN ratio
# would show up in this tail - confirm none exist.
agg_df.sort_values('articles-per-population', ascending=False).tail(10)

Unnamed: 0_level_0,articles,population,articles-per-population,high-quality-articles-count,high-quality-articles-per
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bangladesh,321,166.4,1.929087,3.0,0.009346
Mozambique,58,30.5,1.901639,,
Thailand,112,66.2,1.691843,3.0,0.026786
Zambia,25,17.7,1.412429,,
"Korea, North",36,25.6,1.40625,7.0,0.194444
Ethiopia,101,107.5,0.939535,2.0,0.019802
Uzbekistan,28,32.9,0.851064,2.0,0.071429
China,1133,1393.8,0.812886,41.0,0.036187
Indonesia,211,265.2,0.795626,10.0,0.047393
India,985,1371.3,0.718297,17.0,0.017259


### Top 10 countries by relative quality

In [276]:
# Ten countries with the highest share of FA/GA-predicted articles.
agg_df.sort_values('high-quality-articles-per', ascending=False).head(10)

Unnamed: 0_level_0,articles,population,articles-per-population,high-quality-articles-count,high-quality-articles-per
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Korea, North",36,25.6,1.40625,7.0,0.194444
Saudi Arabia,118,33.4,3.532934,15.0,0.127119
Mauritania,48,4.5,10.666667,6.0,0.125
Central African Republic,66,4.7,14.042553,8.0,0.121212
Romania,343,19.5,17.589744,39.0,0.113703
Tuvalu,54,0.01,5400.0,5.0,0.092593
Bhutan,33,0.8,41.25,3.0,0.090909
Dominica,12,0.07,171.428571,1.0,0.083333
Syria,129,18.3,7.04918,10.0,0.077519
Benin,91,11.5,7.913043,7.0,0.076923


### Bottom 10 countries by relative quality

In [277]:
# Rank countries by their share of high-quality articles, highest first.
t = agg_df.sort_values('high-quality-articles-per', ascending=False)
# Drop countries whose ratio is NaN (no FA/GA-predicted article at all), so the
# tail shows the lowest ratios among countries with at least one such article.
t = t[t['high-quality-articles-per'].notnull()]
t.tail(10)

Unnamed: 0_level_0,articles,population,articles-per-population,high-quality-articles-count,high-quality-articles-per
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Azerbaijan,179,9.9,18.080808,1.0,0.005587
Fiji,198,0.9,220.0,1.0,0.005051
Lithuania,244,2.8,87.142857,1.0,0.004098
Colombia,285,49.8,5.722892,1.0,0.003509
Nigeria,679,195.9,3.466054,2.0,0.002946
Peru,350,32.2,10.869565,1.0,0.002857
Nepal,361,29.7,12.154882,1.0,0.00277
Switzerland,403,8.5,47.411765,1.0,0.002481
Tanzania,405,59.1,6.852792,1.0,0.002469
Belgium,520,11.4,45.614035,1.0,0.001923


### Geographic regions by coverage

In [278]:
# Reading in mapping file
# Reading in the country-to-region mapping file.
# Raw string keeps the backslashes literal (same path, no invalid-escape warning).
t3 = pd.read_csv(r'D:\Own Stuff\Acads\DATA 512\country-region map.csv')

# NOTE(review): agg_df is reassigned here (index reset, Region columns added),
# so re-running this cell without rebuilding agg_df stacks extra columns.
agg_df = agg_df.reset_index()
# Left join keeps every country; any country missing from the mapping gets a
# NaN Region and is therefore left out of the regional groupby below.
agg_df = pd.merge(agg_df, t3, left_on='country', right_on='Geography', how='left')

# Regional totals of articles and population, then the same two ratios as the
# per-country table.
by_region = agg_df.groupby('Region')
reg_agg_df = by_region[['articles', 'population']].sum()
reg_agg_df['articles-per-population'] = reg_agg_df['articles'] / reg_agg_df['population']
# Share of the region's articles that are high quality (count / articles).
reg_agg_df['high-quality-articles'] = (
    by_region['high-quality-articles-count'].sum() / reg_agg_df['articles']
)

In [279]:
# Regions ordered from highest to lowest articles-per-population.
reg_agg_df.sort_values('articles-per-population', ascending=False)

Unnamed: 0_level_0,articles,population,articles-per-population,high-quality-articles
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OCEANIA,3132,39.78,78.733032,0.021073
EUROPE,15923,734.59,21.676037,0.020222
LATIN AMERICA AND THE CARIBBEAN,5174,628.27,8.235313,0.013336
AFRICA,6861,1172.4,5.852098,0.018219
NORTHERN AMERICA,1940,365.2,5.312158,0.051031
ASIA,11588,4513.1,2.567636,0.026752


### Geographic regions by relative quality

In [280]:
# Regions ordered from highest to lowest share of high-quality articles.
reg_agg_df.sort_values('high-quality-articles', ascending=False)

Unnamed: 0_level_0,articles,population,articles-per-population,high-quality-articles
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NORTHERN AMERICA,1940,365.2,5.312158,0.051031
ASIA,11588,4513.1,2.567636,0.026752
OCEANIA,3132,39.78,78.733032,0.021073
EUROPE,15923,734.59,21.676037,0.020222
AFRICA,6861,1172.4,5.852098,0.018219
LATIN AMERICA AND THE CARIBBEAN,5174,628.27,8.235313,0.013336


- Looking region-wise, Oceania has the highest number of articles per population. Northern America has the second-lowest articles per population but the highest ratio of high-quality articles.
- In terms of article quality, North Korea and Saudi Arabia have the highest proportion of high-quality articles, and Belgium has the lowest among countries with at least one high-quality article.
- In terms of articles-per-population, Tuvalu has the highest and India has the lowest.
- The analysis is very much biased towards those countries that appear in both Wikipedia and population data.

1. One bias that I expected to find here was that this data, being from English Wikipedia, would be highly biased against non-English-speaking countries.
6. It makes sense to use the available datasets to perform hypothesis-driven research when the research problem pertains only to English-speaking countries.
7. A researcher would be able to correct the bias present to an extent by using Wikipedia articles from other languages as well.