In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

In [3]:
# The MongoDB connection string embeds credentials, so it is taken from the
# MONGO_URI environment variable when that is set. The literal below is kept only
# as a fallback so existing local runs behave exactly as before.
MONGO_URI = os.environ.get(
    "MONGO_URI",
    "mongodb://root:helloworld@127.0.0.1/wiki.starwars?authSource=admin",
)

# Build (or reuse) the Spark session for importing the wiki XML dump.
# spark.jars.packages pulls in spark-xml (the 'xml' reader format) and the MongoDB
# connector (the 'mongo' writer format); both formats are used by later cells.
spark = (SparkSession.builder
         .appName('Import sw data')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.13.0,org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
         # Default destination (database "wiki", collection "starwars") for df.write.format("mongo").
         .config("spark.mongodb.output.uri", MONGO_URI)
         .getOrCreate())



:: loading settings :: url = jar:file:/data/pella/projects/University/Thesis/Thesis/code/env/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/egordm/.ivy2/cache
The jars for the packages stored in: /home/egordm/.ivy2/jars
com.databricks#spark-xml_2.12 added as a dependency
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-001b6935-c59b-4590-a282-19cf632eac94;1.0
	confs: [default]
	found com.databricks#spark-xml_2.12;0.13.0 in central
	found commons-io#commons-io;2.8.0 in central
	found org.glassfish.jaxb#txw2;2.3.4 in central
	found org.apache.ws.xmlschema#xmlschema-core;2.2.5 in central
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 299ms :: artifacts dl 14ms
	:: modules in use:
	com.databricks#spark-xml_2.12;0.13.0 from central in [default]
	commons-io#commons-io;2.8.0 from central in [default]
	org.apache.

In [4]:
# Read the wiki XML export with one row per <page> element and cache the result,
# since the DataFrame is used again by the preview and the MongoDB write below.
# NOTE(review): this is an absolute, machine-specific path.
DUMP_PATH = "/data/pella/projects/University/Thesis/Thesis/code/storage/datasets/raw/star-wars/starwars_pages_current.xml"
page_reader = spark.read.format('xml').option("rowTag", "page")
df = page_reader.load(DUMP_PATH).cache()

                                                                                

In [5]:
# Preview the first five parsed pages as a quick check of the inferred row structure.
df.head(5)

                                                                                

[Row(id=1, ns=6, redirect=None, restrictions=None, revision=Row(comment=None, contributor=Row(id=116899, ip=None, username='Imperators II'), format='text/x-wiki', id=10712747, minor=None, model='wikitext', parentid=10210213, sha1='t4x8ueo47t5vq1fou4wiux1p0a39o76', text=Row(_VALUE='{{Top|fprot|uprot}}\n==Summary==\n{{Information\n|attention=\n|description=The logo of [[Wookieepedia]]. Cropped and modified from a picture of the [[Death Star II/Legends|second Death Star]].\n|source=Take a guess.\n|artist=*[[Tracy Duncan]] provided original design\n*[[User:Jaden Kenobi|Tyber]] provided higher-resolution version\n|filespecs=\n|licensing={{GFDL}}\n{{Cc-by-sa|3.0}}\n|other versions=[[:File:Wiki-shrinkable.png]]\n|cat artist=skip\n|cat licensee=skip\n|cat subject=skip\n|cat type=[[Category:Wookieepedia icons]]\n}}', _bytes=519, _xml:space='preserve'), timestamp=datetime.datetime(2021, 8, 15, 11, 26, 12)), title='File:Wiki.png'),
 Row(id=2, ns=2, redirect=None, restrictions=None, revision=Row(c

In [6]:
# Append every parsed page to MongoDB. The destination comes from the
# `spark.mongodb.output.uri` setting on the Spark session.
# Because the mode is "append", re-running this cell adds the pages again.
mongo_writer = df.write.format("mongo").mode("append")
mongo_writer.save()

                                                                                

In [80]:
from pymongo import MongoClient
import wikitextparser as wtp
from lxml import html

In [85]:
# Connect to MongoDB on 127.0.0.1. The URI carries credentials, so the MONGO_URI
# environment variable is preferred; the literal is only a fallback that keeps
# existing local runs working unchanged.
client = MongoClient(os.environ.get(
    "MONGO_URI",
    "mongodb://root:helloworld@127.0.0.1/wiki.starwars?authSource=admin",
))

In [86]:
# Collection "wookiepedia.characters" in database "wiki". Subscript access spells out
# the dotted collection name that chained attribute access would build implicitly.
collection = client["wiki"]["wookiepedia.characters"]

In [89]:
def select_character(parsed_item):
    """Return the first template whose name starts with ``Character`` or ``Droid``.

    Templates are examined in the order ``parsed_item.templates`` yields them.
    Returns ``None`` when no template name has either prefix.
    """
    wanted_prefixes = ('Character', 'Droid')
    matching = (
        candidate
        for candidate in parsed_item.templates
        if candidate.name.startswith(wanted_prefixes)
    )
    return next(matching, None)

def clean_value(value):
    """Reduce a raw template argument value to plain text.

    The value is parsed as HTML and reduced to its text content, wiki markup is
    removed, and if the remaining text contains at least one wiki list, the items
    of the first list (each stripped) are joined with commas and returned instead.
    """
    # Presumably wrapped in <span> so lxml gets a single root element - TODO confirm.
    root = html.fromstring(f'<span>{value}</span>')
    text = wtp.remove_markup(root.text_content().strip())

    wiki_lists = wtp.parse(text).get_lists()
    if not wiki_lists:
        return text

    # Only the first list is considered; its items replace the whole value.
    return ','.join(item.strip() for item in wiki_lists[0].items)

def extract_args(template):
    """Map each argument name of ``template`` to its cleaned, markup-free value.

    Names and raw values are stripped of surrounding whitespace. An object with no
    ``arguments`` attribute (for example ``None``) produces an empty dict.
    """
    arguments = getattr(template, 'arguments', [])
    return {
        argument.name.strip(): wtp.remove_markup(clean_value(argument.value.strip()))
        for argument in arguments
    }

In [92]:
from tqdm import tqdm

# For every stored page: parse its wikitext, take the arguments of its
# Character/Droid template and save them on the document under "properties".
for record in tqdm(collection.find()):
    parsed = wtp.parse(record['text'])

    character = select_character(parsed)
    properties = extract_args(character)
    # select_character returns None when no template matches. Testing for that
    # explicitly keeps is_droid a real boolean; the previous `character and ...`
    # evaluated to None in that case, which was stored as null.
    properties['is_droid'] = character is not None and character.name.startswith('Droid')

    collection.update_one(
        {'_id': record['_id']},
        {'$set': {'properties': properties}}
    )

9979it [01:00, 163.59it/s]


# Mongo Queries

```javascript
// Mongo shell statements, run after the Spark import has populated `starwars`.
// Copy the nested revision text into a top-level `text` field on every document.
// The update is an aggregation pipeline, so "$revision.text._VALUE" is a field reference.
db.starwars.updateMany(
    {},
    [
        {"$set": {"text": "$revision.text._VALUE"}}
    ]
)



// Seed wookiepedia.characters with every page whose text contains "{{Character" or "{{Droid".
// toArray() materialises all matching documents in the shell before they are inserted.
db.wookiepedia.characters.insertMany(
    db.getCollection("starwars")
            .find({'text': /\{\{(Character|Droid)/}).toArray()
)

// Remove pages whose title starts with "Talk:", "File:", "User:" or "Forum:".
db.wookiepedia.characters.deleteMany(
{
title: /^(Talk:|File:|User:|Forum:)/
}
)

// Remove pages whose title ends with "/Legends".
db.wookiepedia.characters.deleteMany(
{
title: /\/Legends$/
}
)

// Remove pages whose text begins with exactly "{{Top|leg}}".
db.wookiepedia.characters.deleteMany(
{text: /^\{\{Top\|leg\}\}/}
)

// Remove pages whose text contains a "{{Noncanon|" template.
db.wookiepedia.characters.deleteMany(
{
text: /\{\{Noncanon\|/
}
)

// Remove pages whose title contains "lightsaber" (case-sensitive match).
db.wookiepedia.characters.deleteMany(
{
title: /lightsaber/
}
)

// Remove pages with a "{{Top|...}}" template whose arguments contain "real".
// `.` does not match newlines here, so the whole template must sit on one line.
db.wookiepedia.characters.deleteMany(
{text: /\{\{Top\|.*real.*\}\}/}
)

// Remove pages with a "{{Top|...leg...}}" template, unless the text also has a
// "{{Top|...legends=...}}" or "{{Top|...canon...}}" template.
// NOTE(review): presumably this keeps canon pages that merely link to a Legends counterpart - confirm.
db.wookiepedia.characters.deleteMany(
{$and: [{text: /\{\{Top\|.*leg.*\}\}/}, {text: {$not: /\{\{Top\|.*legends=.*\}\}/}}, {text: {$not: /\{\{Top\|.*canon.*\}\}/}}]}
)
```