Import Libs

In [198]:
# Makes the plots appear within the notebook
%matplotlib inline

# Two fundamental packages for doing data manipulation
import numpy as np                   # http://www.numpy.org/
import pandas as pd                  # http://pandas.pydata.org/

# Two related packages for plotting data
import matplotlib.pyplot as plt      # http://matplotlib.org/
import seaborn as sb                 # https://stanford.edu/~mwaskom/software/seaborn/

# Package for requesting data via the web and parsing resulting JSON
import requests
import json
from bs4 import BeautifulSoup

# Two packages for accessing the MySQL server
import pymysql                       # http://pymysql.readthedocs.io/en/latest/
import os                            # https://docs.python.org/3.4/library/os.html

# Packages for analyzing complex networks
import networkx as nx                # https://networkx.github.io/
import igraph as ig                  #If required manually in terminal use pip install python-igraph

# Setup the code environment to use plots with a white background and DataFrames show more columns and rows
sb.set_style('whitegrid')
pd.options.display.max_columns = 100
pd.options.display.max_rows = 110

In [199]:

page_title = "2013 Egyptian coup d'état"

In [200]:

def link_getter(page_title):
    
    _S="https://en.wikipedia.org/w/api.php?action=query&format=json&prop=langlinks&titles={0}&llprop=autonym|langname&lllimit=500".format(page_title)
    
    req = requests.get(_S)

    json_string = json.loads(req.text)
    
    _pageID=list(json_string['query']['pages'].keys())[0]

    _langlink_list=json_string['query']['pages'][_pageID]['langlinks']
    
    #_langAbrev_dict=dict()

    #for t in _langlink_list:
        #_lang=t['lang']
        #_langname=t['langname']
        #_langAbrev_dict[_lang]=_langname
    
    
    _langlink_dict=dict()

    for d in _langlink_list:
        _lang=d['lang']
        _title=d['*']
        _langlink_dict[_lang]=_title
        
    _langlink_dict['en'] = page_title
    
    return _langlink_dict

In [None]:
link_getter(page_title)

In [203]:

def name_getter(page_title):
    
    _S="https://en.wikipedia.org/w/api.php?action=query&format=json&prop=langlinks&titles={0}&llprop=autonym|langname&lllimit=500".format(page_title)
    
    req = requests.get(_S)

    json_string = json.loads(req.text)
    
    _pageID=list(json_string['query']['pages'].keys())[0]

    _langlink_list=json_string['query']['pages'][_pageID]['langlinks']
    
    _langname_dict=dict()

    for t in _langlink_list:
        _lang=t['lang']
        _langname=t['langname']
        _title=t['*']
        _langname_dict[_lang]=_langname
        
    _langname_dict['en'] = _title
    
    return _langname_dict

In [206]:
name_getter(page_title)

{'af': 'Afrikaans',
 'ar': 'Arabic',
 'arz': 'Egyptian Arabic',
 'az': 'Azerbaijani',
 'bg': 'Bulgarian',
 'ca': 'Catalan',
 'ckb': 'Central Kurdish',
 'de': 'German',
 'el': 'Greek',
 'en': '2013年埃及政变',
 'es': 'Spanish',
 'fa': 'Persian',
 'fi': 'Finnish',
 'fr': 'French',
 'he': 'Hebrew',
 'hi': 'Hindi',
 'id': 'Indonesian',
 'it': 'Italian',
 'ja': 'Japanese',
 'ko': 'Korean',
 'nl': 'Dutch',
 'pl': 'Polish',
 'pt': 'Portuguese',
 'ro': 'Romanian',
 'ru': 'Russian',
 'sr': 'Serbian',
 'tg': 'Tajik',
 'tr': 'Turkish',
 'uk': 'Ukrainian',
 'ur': 'Urdu',
 'vi': 'Vietnamese',
 'zh': 'Chinese'}

In [208]:
_langAbrev_dict = {}

names_and_langs = name_getter(page_title)

for lang,langname in names_and_langs.items():
    _langAbrev_dict[lang] = langname

In [209]:

titles_and_lang_dict = {}

pages_and_langs = link_getter(page_title)

for lang,title in pages_and_langs.items():
    titles_and_lang_dict[lang] = title

In [210]:
def get_page_outlinks(page_title,lang='en',redirects=1):
    # Replace spaces with underscores
    page_title = page_title.replace(' ','_')
    
    bad_titles = ['Special:','Wikipedia:','Help:','Template:','Category:','International Standard','Portal:','s:','File:']
    
    # Get the response from the API for a query
    # After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
    req = requests.get('https://{2}.wikipedia.org/w/api.php?action=parse&format=json&page={0}&redirects={1}&prop=text&disableeditsection=1&disabletoc=1'.format(page_title,redirects,lang))
    
    # Read the response into JSON to parse and extract the HTML
    json_string = json.loads(req.text)
    
    # Initialize an empty list to store the links
    outlinks_list = [] 
    
    if 'parse' in json_string.keys():
        page_html = json_string['parse']['text']['*']

        # Parse the HTML into Beautiful Soup
        soup = BeautifulSoup(page_html,'lxml')

        # Delete tags associated with templates
        for tag in soup.find_all('tr'):
            tag.replace_with('')

        # For each paragraph tag, extract the titles within the links
        for para in soup.find_all('p'):
            for link in para.find_all('a'):
                if link.has_attr('title'):
                    title = link['title']
                    # Ignore links that aren't interesting
                    if all(bad not in title for bad in bad_titles):
                        outlinks_list.append(title)

        # For each unordered list, extract the titles within the child links
        for unordered_list in soup.find_all('ul'):
            for item in unordered_list.find_all('li'):
                for link in item.find_all('a'):
                    if link.has_attr('title'):
                        title = link['title']
                        # Ignore links that aren't interesting
                        if all(bad not in title for bad in bad_titles):
                            outlinks_list.append(title)

    return outlinks_list

In [211]:
outlinks_per_lang = {}

language_titles = link_getter(page_title)

for lang,title in language_titles.items():
    #print("The language is",lang,"and the article title is", title)
    outlinks_per_lang[lang] = get_page_outlinks(title,lang)

In [212]:
def get_outlink_translations(outlinks_per_lang):
    translation_dict = dict()
    for lang,links in outlinks_per_lang.items():
        _page_titles = list(set(links))

        translation_dict[lang] = {}

        for _page_title in _page_titles:
            _S="https://{1}.wikipedia.org/w/api.php?action=query&format=json&prop=langlinks&titles={0}&redirects=1&lllimit=500&formatversion=2".format(_page_title,lang)
            response = requests.get(_S).json()
            if 'pages' in response['query'].keys():
                langlink_dict = response['query']['pages'][0]
                translation_dict[lang][_page_title] = {}
                if 'langlinks' in langlink_dict.keys():
                    for _ll in langlink_dict['langlinks']:
                        _ll_title = _ll['title']
                        _ll_lang = _ll['lang']
                        translation_dict[lang][_page_title][_ll_lang] = _ll_title
            else:
                translation_dict[lang][_page_title] = {}
                
    return translation_dict

In [213]:
translation_dict = get_outlink_translations(outlinks_per_lang)

In [214]:
list(translation_dict.items())[0]

('he',
 {'1 ביולי': {'ab': 'Ҧхынгәы 1',
   'af': '1 Julie',
   'als': '1. Juli',
   'am': '1 July',
   'an': '1 de chulio',
   'ang': '1. Mǣdmōnðes',
   'ar': '1 يوليو',
   'arz': '1 يوليه',
   'as': '১ জুলাই',
   'ast': '1 de xunetu',
   'av': '1 Июл',
   'az': '1 iyul',
   'ba': '1 июль',
   'bat-smg': 'Lėipas 1',
   'bcl': 'Hulyo 1',
   'be': '1 ліпеня',
   'be-x-old': '1 ліпеня',
   'bg': '1 юли',
   'bh': '१ जुलाई',
   'bn': '১ জুলাই',
   'bpy': 'জুলাই ১',
   'br': '1añ Gouere',
   'bs': '1. juli',
   'ca': '1 de juliol',
   'cdo': '7 nguŏk 1 hô̤',
   'ce': '1 июль',
   'ceb': 'Hulyo 1',
   'ckb': '١ی تەممووز',
   'co': '1 di lugliu',
   'cs': '1. červenec',
   'csb': '1 lëpińca',
   'cv': 'Утă, 1',
   'cy': '1 Gorffennaf',
   'da': '1. juli',
   'de': '1. Juli',
   'diq': '1 Temuz',
   'dv': 'ޖުލައި 1',
   'ee': 'Siamlɔm 1',
   'el': '1 Ιουλίου',
   'eml': '1 ed lói',
   'en': 'July 1',
   'eo': '1-a de julio',
   'es': '1 de julio',
   'et': '1. juuli',
   'eu': 'Uztailaren 1',


In [215]:
len_transoutlink_dict_counterEn = {}

for lang,titles_dict in translation_dict.items():
    len_transoutlink_dict_counterEn[lang]=0         #starting at fa at a count of 0 
    for art_title,lang_dict in titles_dict.items(): #go into links on fa 
        if 'en' in lang_dict.keys():               #check for en version in links 
            len_transoutlink_dict_counterEn[lang] +=1   #if one is found add to counter 
            

In [216]:
len_transoutlink_dict_enList = {}

for lang,titles_dict in translation_dict.items():
    len_transoutlink_dict_enList[lang]=[]         #starting at fa at a count of 0 
    for art_title,lang_dict in titles_dict.items(): #go into links on fa 
        if 'en' in lang_dict.keys():  
            _title = lang_dict['en']        #check for en version in links 
            len_transoutlink_dict_enList[lang].append(_title)   

In [217]:
len_transoutlink_dict = {}

for langlinks_lang,titles in translation_dict.items():
    len_transoutlink_dict[langlinks_lang] = len(titles)

In [218]:
s_d2 = pd.Series(titles_and_lang_dict)

s_d1 = pd.Series(_langAbrev_dict)

s_d3 = pd.Series(len_transoutlink_dict)

s_d4 = pd.Series(len_transoutlink_dict_counterEn)

In [219]:
df_mix = pd.DataFrame({'lang' : s_d1,
                       'title' : s_d2, 
                       'total links' : s_d3,
                       'english links': s_d4})

df_mix = df_mix[['lang', 'title', 'total links', 'english links']]

In [220]:
df_mix['percent en linkbacks'] = df_mix['english links']/df_mix['total links']

df_mix

Unnamed: 0,lang,title,total links,english links,percent en linkbacks
af,Afrikaans,Egiptiese staatsgreep van 2013,4,2,0.5
ar,Arabic,انقلاب 2013 في مصر,110,94,0.854545
arz,Egyptian Arabic,خريطة المستقبل (مصر),28,26,0.928571
az,Azerbaijani,Misirdə hərbi çeviriliş (2013),0,0,
bg,Bulgarian,Държавен преврат в Египет (2013 г.),24,17,0.708333
ca,Catalan,Cop d'Estat a Egipte l'any 2013,21,21,1.0
ckb,Central Kurdish,کودەتای ٢٠١٣ی میسر,11,9,0.818182
de,German,Militärputsch in Ägypten 2013,267,233,0.872659
el,Greek,Αιγυπτιακό πραξικόπημα 2013,9,6,0.666667
en,2013年埃及政变,2013 Egyptian coup d'état,262,0,0.0
