In [1]:
import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *
import json

# Spark configuration: run in local mode on every available core ("local[*]").
conf = pyspark.SparkConf()
conf.setMaster("local[*]")
# Driver heap size and the cap on results collected back to the driver.
conf.set('spark.driver.memory', '150g')
conf.set('spark.driver.maxResultSize', '32G')
# Scratch location for shuffle/spill files.
conf.set('spark.local.dir', '/scratch/tmp/')
# NOTE(review): a YARN setting; presumably has no effect with a local master — confirm.
conf.set('spark.yarn.stagingDir', '/scratch/tmp/')

# Start the session from this configuration, or reuse one that already exists.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Low-level context handle, used below for RDD-based reading.
sc = spark.sparkContext

In [2]:
# Bare expression: the notebook displays the session object as this cell's output.
spark

In [3]:
# Read the bz2-compressed dump as an RDD of text lines (one string per line).
wikidata_all = sc.textFile(name="latest-20200205.json.bz2")

In [4]:
# QIDs whose mere presence anywhere in a dump line causes that line to be skipped.
DISAMBIGUATION = 'Q4167410'
LIST = 'Q13406463'
INTERNAL_ITEM = 'Q17442446'
CATEGORY = 'Q4167836'


def get_entity_info(line):
    """Turn one line of the Wikidata JSON dump into sitelink rows.

    Args:
        line: One raw text line of the dump. Entity lines hold a single JSON
            object, usually followed by a trailing ``,``; other lines (for
            example the ``[`` / ``]`` array brackets or blank lines) are
            ignored.

    Returns:
        A list of ``Row(qid, site, title)``, one per sitelink whose site id
        ends with ``'wiki'``. An empty list is returned when the line

        * mentions any of the excluded QIDs anywhere in its text (a plain
          substring check, so it also fires when the QID is merely referenced
          by the entity or is a prefix of a longer QID),
        * is not a JSON object or cannot be parsed,
        * describes something whose ``type`` is not ``'item'``, or
        * has a sitelink entry that is malformed (the whole entity is dropped).

    The function never raises for malformed input, so a single bad line cannot
    abort the Spark job.
    """
    # Cheap textual pre-filter, done before paying for JSON parsing.
    if DISAMBIGUATION in line or LIST in line or INTERNAL_ITEM in line or CATEGORY in line:
        return []

    # Remove only the array separator. Blindly dropping the last character
    # would cut off the closing brace of a line that has no trailing comma
    # (the final entity of the dump) and silently lose that entity.
    payload = line.strip().rstrip(',')
    if not payload.startswith('{'):
        # Array brackets, blank lines and anything else that is not an object.
        return []

    try:
        row = json.loads(payload)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass: truncated/corrupt line.
        return []

    if not isinstance(row, dict) or row.get('type') != 'item':
        return []

    titles = []
    try:
        for v in row.get('sitelinks', {}).values():
            site = v['site']
            # NOTE(review): the suffix test also matches non-Wikipedia sites
            # whose id ends in 'wiki' — confirm that this is intended.
            if site.endswith('wiki'):
                titles.append(Row(qid=row['id'], site=site, title=v['title']))
    except (AttributeError, KeyError, TypeError):
        # Unexpected sitelink structure: skip the entity rather than fail.
        return []
    return titles


In [5]:
# Lazily expand every dump line into zero or more Row(qid, site, title) records.
articles = wikidata_all.flatMap(f=get_entity_info)

In [6]:
# Give Spark the schema up front instead of letting it infer one: inference has
# to run a job just to fetch the first Row and raises when the RDD is empty.
# Column names match the Row fields built in get_entity_info; the values are
# expected to be plain strings taken from the dump's JSON.
all_entities = spark.createDataFrame(articles, schema="qid string, site string, title string")

In [7]:
# Save the table as Parquet; "overwrite" replaces any output left by an earlier run.
all_entities.write.parquet("WikidataInterlanguageLinks.parquet", mode="overwrite")