# Download Data

This notebook downloads the necessary data to replicate the results of our paper on Gender Inequalities on Wikipedia.

Note that we use a file named `dbpedia_config.py` where we set which language editions we will study, as well as where to save and load data files.

By [Eduardo Graells-Garrido](http://carnby.github.io).

In [1]:
!cat dbpedia_config.py

# The DBpedia editions we will consider
MAIN_LANGUAGE = 'en'
LANGUAGES = 'en|bg|ca|cs|de|es|eu|fr|hu|id|it|ja|ko|nl|pl|pt|ru|tr|ar|el'.split('|')

# Where are we going to download the data files
DATA_FOLDER = '/home/egraells/resources/dbpedia'

# Folder to store analysis results
TARGET_FOLDER = '/home/egraells/phd/notebooks/pajaritos/person_results'

# This is used when crawling WikiData.
QUERY_WIKIDATA_GENDER = False
YOUR_EMAIL = 'mail@example.com'

In [None]:
import subprocess
import os
import dbpedia_config

In [None]:
# Downloads go to DATA_FOLDER, the location dbpedia_config.py designates for
# downloaded data files; TARGET_FOLDER is reserved there for analysis results.
# The variable keeps the name `target` because the download cells refer to it.
target = dbpedia_config.DATA_FOLDER
# Language editions whose per-language dumps will be fetched.
languages = dbpedia_config.LANGUAGES

In [None]:
# DBpedia 2014 ontology: fetched only when no local copy exists yet.
ontology_url = 'http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/dbpedia_2014.owl.bz2'
ontology_file = '{0}/dbpedia_2014.owl.bz2'.format(target)

if not os.path.exists(ontology_file):
    # wget saves the download to ontology_file (-O); its output is not redirected.
    wget_command = ['/usr/bin/wget', ontology_url, '-O', ontology_file]
    subprocess.call(wget_command, stdout=None, stderr=None)

In [None]:
# Per-language DBpedia 2014 dumps. Every dataset follows the same naming
# scheme on the server (<lang>/<dataset>_<lang>.nt.bz2) and is stored locally
# as <target>/<dataset>_<lang>.nt.bz2.
dataset_url = 'http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/{0}/{1}_{0}.nt.bz2'
datasets = [
    'instance_types',
    'interlanguage_links',
    'labels',
    'mappingbased_properties',
]

for lang in languages:
    for dataset in datasets:
        # The existence check and the wget destination share one path, so a
        # file that was already downloaded is never fetched again. (The
        # previous interlanguage_links check formatted the folder in place of
        # the language code, so it never matched the downloaded file.)
        local_file = '{0}/{1}_{2}.nt.bz2'.format(target, dataset, lang)
        if os.path.exists(local_file):
            continue

        subprocess.call(['/usr/bin/wget',
            dataset_url.format(lang, dataset),
            '-O', local_file],
            stdout=None, stderr=None)

In [None]:
# Gender dataset referenced at http://oldwiki.dbpedia.org/Datasets/NLP#h172-7
dbpedia_gender = 'http://wifo5-04.informatik.uni-mannheim.de/downloads/datasets/genders_en.nt.bz2'
dbpedia_gender_file = '{0}/genders_en.nt.bz2'.format(target)

# Skip the download when the file is already present in the target folder.
if not os.path.exists(dbpedia_gender_file):
    wget_command = ['/usr/bin/wget', dbpedia_gender, '-O', dbpedia_gender_file]
    subprocess.call(wget_command, stdout=None, stderr=None)

In [None]:
# Gender dataset referenced at http://www.davidbamman.com/?p=12
wikipedia_gender = 'http://www.ark.cs.cmu.edu/bio/data/wiki.genders.txt'
wikipedia_gender_file = '{0}/wiki.genders.txt'.format(target)

# NOTE: the check below only tests that the file exists, not what it contains.
# Delete a stale or wrong wiki.genders.txt by hand to force a new download.
if not os.path.exists(wikipedia_gender_file):
    # Fetch wikipedia_gender (this cell's own URL); passing the DBpedia
    # genders URL here would save the wrong dataset as wiki.genders.txt.
    subprocess.call(['/usr/bin/wget',
        wikipedia_gender,
        '-O', wikipedia_gender_file],
        stdout=None, stderr=None)

In [None]:
# Long abstracts are downloaded for the English edition only.
lang = 'en'
abstracts_url = 'http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/{0}/long_abstracts_{0}.nt.bz2'.format(lang)
abstracts_file = '{0}/long_abstracts_{1}.nt.bz2'.format(target, lang)

# Fetch the dump only when it is not already in the target folder.
if not os.path.exists(abstracts_file):
    wget_command = ['/usr/bin/wget', abstracts_url, '-O', abstracts_file]
    subprocess.call(wget_command, stdout=None, stderr=None)

In [None]:
# Page links (the network data) are downloaded for the English edition only.
lang = 'en'
page_links_url = 'http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/{0}/page_links_{0}.nt.bz2'.format(lang)
page_links_file = '{0}/page_links_{1}.nt.bz2'.format(target, lang)

# Fetch the dump only when it is not already in the target folder.
if not os.path.exists(page_links_file):
    wget_command = ['/usr/bin/wget', page_links_url, '-O', page_links_file]
    subprocess.call(wget_command, stdout=None, stderr=None)