In [1]:
import wptools
import pyspark
import pyspark.sql
from pyspark.sql import *
import os.path
from pyspark.sql.functions import desc

import findspark
findspark.init()

from pyspark.sql import dataframe
from pyspark.sql import functions as F

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Loading data 

In [2]:
DATA_DIR = '../' 
WIKIPEDIA_CONFLICTS_PARQUET = DATA_DIR + 'selectedAllConflict.parquet'

# loading the saved parquet files
wikipedia = spark.read.parquet(WIKIPEDIA_CONFLICTS_PARQUET)

---

# Descriptive analysis

---

# Infobox per category 

Infobox functions `get_infobox_civilian_attack`, `get_infobox_civil_conflict`, `get_infobox_military_conflict`
to get relevant information and views for each category: `civilian attack`, `civil conflict`, `military conflict`. Relevant information is chosen based on the fields found on [List of infoboxes and fields](https://en.wikipedia.org/wiki/Wikipedia:List_of_infoboxes#Event) 

Extract info for each category:
* `civilian attack`
    * location
    * date 
    * fatalities
    * injuries
* `civil conflict`
    * place
    * date
    * casualities1, casualities2
    * fatalities
    * injuries
    * leadfigures1, leadfigures2
* `military conflict`
    * place
    * date 
    * status 
    * casualities1, casualities2
    * combatant1, combatant2
    
We use an external library [wptools wiki](https://github.com/siznax/wptools/wiki). 

### Infobox `civilian attack`

In [None]:
infobox_civil_attack = 'civilian attack'
# find all pages that have category civilian attack
wiki_civil_attack = wikipedia.where("categories like '%{}%'".format(infobox_civil_attack)) 
# show file schema
wiki_civil_attack.printSchema()

In [None]:
wiki_civil_attack.filter("title like '%Fraunces Tavern%'").select("categories").collect()

In [None]:
wiki_civil_attack_df = sqlContext.createDataFrame(wiki_civil_attack.rdd.map(get_infobox_civilian_attack))

In [None]:
wiki_civil_attack_df.take(3)

In [None]:
# GET INFO FROM INFOBOX + VIEWS for different categories
def get_infobox_civilian_attack(entity):
    # get page
    page = wptools.page(entity.title)
    page.get_parse()
    page.get_more()
    # extract relevant information and put in dictionary
    info = {'views': None, 'location': None, 
            'date': None, 'fatalities': None, 'injuries': None }
    try:
        info['views'] = page.data['views']
    except KeyError:
        info['views'] = None
    
    for ele in list(info.keys())[1:]:
        try:
            info[ele] = page.data['infobox'][ele]
        except KeyError:
            pass
    
    return Row(id=entity.id, title=entity.title, location=info['location'], views=info['views'], date=info['date'], 
                fatalities=info['fatalities'], injuries=info['injuries'])

def get_infobox_civil_conflict(entity):
    # get page
    page = wptools.page(entity.title)
    page.get_parse()
    page.get_more()
    # extract relevant information and put in dictionary
    info = {'views': None, 'place': None, 'injuries': None,
            'date': None, 'fatalities': None, 'casualties1': None, 'casualties2': None,
            'leadfigures1': None, 'leadfigures2': None} 
    try:
        info['views'] = page.data['views']
    except KeyError:
        info['views'] = None
    
    for ele in list(info.keys())[1:]:
        try:
            info[ele] = page.data['infobox'][ele]
        except KeyError:
            pass
    
    return Row(id=entity.id, title=entity.title, location=info['place'], views=info['views'], date=info['date'], 
               fatalities=info['fatalities'], casualties1=info['casualties1'], casualties2=info['casualties2'],
               injuries=info['injuries'], leadfigures1=info['leadfigures1'], leadfigures2=info['leadfigures2'])

def get_infobox_military_conflict(entity):
    # get page
    page = wptools.page(entity.title)
    page.get_parse()
    page.get_more()
    # extract relevant information and put in dictionary
    info = {'views': None, 'place': None, 
            'date': None, 'casualties1': None, 'casualties2': None}#, 'status': None}
            #'combatant1': None, 'combatant2': None, 'status': None} 
    try:
        info['views'] = page.data['views']
    except KeyError:
        info['views'] = None
    
    for ele in list(info.keys())[1:]:
        try:
            info[ele] = page.data['infobox'][ele]
        except KeyError:
            pass
    
    return Row(id=entity.id, title=entity.title, location=info['place'], views=info['views'], date=info['date'], 
               casualties1=info['casualties1'], casualties2=info['casualties2']) #, status=info['status'])
               #combatant1=info['combatant1'], combatant2=info['combatant2'])

In [None]:
DATA_DIR_FILTERED = '../clean_data/'

In [None]:
# saving binary file to future uses
wiki_civil_attack_df.write.parquet(DATA_DIR_FILTERED+"{}.parquet".format(infobox_civil_attack));
# loading the saved parquet files
wiki_civil_attack_df_reload = spark.read.parquet(DATA_DIR_FILTERED+"{}.parquet".format(infobox_civil_attack));

In [None]:
wiki_civil_attack_df = sqlContext.createDataFrame(wiki_civil_attack_RDD)

### Infobox `civil conflict`

In [None]:
infobox_civil_conflict = 'civil conflict'
# find all pages that have category civil conflict
wiki_civil_conflict = wikipedia.where("categories like '%{}%'".format(infobox_civil_conflict)) 
# show file schema
wiki_civil_conflict.printSchema()

In [None]:
wiki_civil_conflict_df = sqlContext.createDataFrame(wiki_civil_conflict.rdd.map(get_infobox_civil_conflict))

In [None]:
wiki_civil_conflict_df.take(3)

In [None]:
# saving binary file to future uses
wiki_civil_conflict_df.write.parquet(DATA_DIR_FILTERED+"{}.parquet".format(infobox_civil_conflict));
# loading the saved parquet files
wiki_civil_conflict_df_reload = spark.read.parquet(DATA_DIR_FILTERED+"{}.parquet".format(infobox_civil_conflict));

### Infobox `military conflict `

In [None]:
infobox_military_conflict = 'military conflict'
# find all pages that have category military conflict
wiki_military_conflict = wikipedia.where("categories like '%{}%'".format(infobox_military_conflict)) 
# show file schema
wiki_military_conflict.printSchema()

In [None]:
wiki_military_conflict_df = sqlContext.createDataFrame(wiki_military_conflict.rdd.map(get_infobox_military_conflict))

In [None]:
wiki_military_conflict_df.take(3)

In [None]:
# saving binary file to future uses
wiki_military_conflict_df.write.parquet(DATA_DIR_FILTERED+"{}.parquet".format(infobox_military_conflict));
# loading the saved parquet files
wiki_military_conflict_df_reload = spark.read.parquet(
    DATA_DIR_FILTERED+"{}.parquet".format(infobox_military_conflict));

### Functions

In [51]:
def get_wiki_military_conflict(entity):
    page = wptools.page(entity.title)
    # extract relevant information and put in dictionary
    info = {'death': None, 'end_date': None}#, 'location': None} #
    
    try: 
        page.get_wikidata()
        info['death'] = page.data['wikidata']['number of deaths (P1120)']['amount']
        info['end_date'] = page.date['wikidata']['end time (P582)']
        #info['location'] = page.date['wikidata']['location (P276)']['amount']
    
    except KeyError:
        try:
            page.get_parse()
            info['death'] = poly_page.data['infobox']['casualties1']
        except KeyError:
            info['death'] = None
        try:
            info['end_date'] = poly_page.data['infobox']['date']
        except KeyError:
            info['end_date'] = None
        #info['location'] = poly_page.data['infobox']['place']
    
    return Row(id=entity.id, title=entity.title, death=info['death'], date=info['end_date'])#, location=info['location'])    


### UNICORN ON THE GOOO

In [34]:
## trying access to infobox
poly_page = wptools.page('World War II')
poly_page.get_parse()
poly_page.get_wikidata()

en.wikipedia.org (parse) World War II
en.wikipedia.org (imageinfo) File:Infobox collage for WWII.PNG
World War II (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Infobox c...
  infobox: <dict(15)> conflict, image, image_size, caption, date, ...
  iwlinks: <list(11)> https://commons.wikimedia.org/wiki/Special:S...
  pageid: 32927
  parsetree: <str(436226)> <root><template><title>pp-semi-indef</t...
  requests: <list(2)> parse, imageinfo
  title: World War II
  wikibase: Q362
  wikidata_url: https://www.wikidata.org/wiki/Q362
  wikitext: <str(224849)> {{pp-semi-indef}}{{Pp-move-indef}}{{shor...
}
www.wikidata.org (wikidata) Q362
www.wikidata.org (labels) P6104|Q329888|Q5928186|P902|Q27643331|P...
www.wikidata.org (labels) P3509|P4212|P527|P1150|P906|Q11708|P141...
World War II (en) data
{
  aliases: <list(7)> WW2, World War Two, 2nd World War, Second Wor...
  claims: <dict(57)> P1478, P1151, P1150, P1120, P227, P349, P244,...
  description: 1939–1945 global war betwe

<wptools.page.WPToolsPage at 0x11f877cc0>

In [9]:
poly_page.data['infobox']

{'conflict': 'World War&nbsp;II',
 'image': 'Infobox collage for WWII.PNG',
 'image_size': '300px',
 'caption': '(clockwise from top left) {{flatlist|\n* Chinese forces in the [[Battle of Wanjialing]]\n* Australian [[25-pounder]] guns during the [[First Battle of El Alamein]]\n* German [[Junkers Ju 87|Stuka]] dive bombers on the [[Eastern Front (World War II)|Eastern Front]] in December 1943\n* American naval force in the [[Invasion of Lingayen Gulf|Lingayen Gulf]]\n* [[Wilhelm Keitel]] signing the [[German Instrument of Surrender]]\n* Soviet troops in the [[Battle of Stalingrad]]}}',
 'date': '{{ubl|start and end dates|1939|9|1|1945|9|2|df|=|yes|\n                  |(|Age in years and days|1 September 1939|2 September 1945|sep|=|and|)|efn| While [[World War II#Chronology|various other dates]] have been proposed as the date on which World War II began or ended, this is the time span most frequently cited.}} {{start and end dates|1939|9|1|1945|9|2|df|=|yes}} {{Age in years and days|1 Se

In [14]:
poly_page.data['wikidata']

{'has immediate cause (P1478)': 'Invasion of Poland (Q150812)',
 "topic's main Wikimedia portal (P1151)": 'Portal:World War II (Q3247957)',
 'Regensburg Classification (P1150)': 'NQ 2545 - NQ 2795',
 'number of deaths (P1120)': {'amount': '+73000000',
  'unit': '1',
  'upperBound': '+86000000',
  'lowerBound': '+60000000'},
 'GND ID (P227)': '4079167-1',
 'NDL Auth ID (P349)': '00570524',
 'Library of Congress authority ID (P244)': 'sh85148273',
 'Commons category (P373)': 'World War II',
 'start time (P580)': '+1939-09-01T00:00:00Z',
 'end time (P582)': '+1945-09-02T00:00:00Z',
 'NKCR AUT ID (P691)': 'ph117270',
 'instance of (P31)': 'world war (Q103495)',
 "topic's main category (P910)": 'Category:World War II (Q6816704)',
 'Freebase ID (P646)': '/m/081pw',
 'Gran Enciclopèdia Catalana ID (P1296)': '0031535',
 'has cause (P828)': ['Treaty of Versailles (Q8736)',
  'causes of World War II (Q714999)'],
 'LIR (P886)': '1129',
 'Commons gallery (P935)': 'World War II',
 'Encyclopædia Bri

In [52]:
ww2 = wikipedia.filter("title like '%World War II%'")
ww2_df = sqlContext.createDataFrame(ww2.rdd.map(get_wiki_military_conflict))

In [53]:
ww2_df.show(2)

Py4JJavaError: An error occurred while calling o705.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 25.0 failed 1 times, most recent failure: Lost task 0.0 in stage 25.0 (TID 52, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/nasrine/spark/python/lib/pyspark.zip/pyspark/worker.py", line 253, in main
    process()
  File "/Users/nasrine/spark/python/lib/pyspark.zip/pyspark/worker.py", line 248, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/nasrine/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 379, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/nasrine/spark/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-51-b840832b57f8>", line 9, in get_wiki_military_conflict
AttributeError: 'WPToolsPage' object has no attribute 'date'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:330)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:470)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:453)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:284)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1651)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1639)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1638)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1638)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1872)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1821)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1810)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3278)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3259)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3258)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2703)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/nasrine/spark/python/lib/pyspark.zip/pyspark/worker.py", line 253, in main
    process()
  File "/Users/nasrine/spark/python/lib/pyspark.zip/pyspark/worker.py", line 248, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/nasrine/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 379, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/nasrine/spark/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-51-b840832b57f8>", line 9, in get_wiki_military_conflict
AttributeError: 'WPToolsPage' object has no attribute 'date'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:330)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:470)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:453)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:284)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
ww2_df.select('end_date')

In [43]:
ww2_df.select('title').collect()

[Row(title='Bombing of Dresden in World War II'),
 Row(title='Balkans Campaign (World War II)'),
 Row(title='World War II'),
 Row(title='Mediterranean U-boat Campaign (World War II)'),
 Row(title='Repatriation of Cossacks after World War II'),
 Row(title='Siege of Malta (World War II)'),
 Row(title='East African Campaign (World War II)'),
 Row(title='Arctic convoys of World War II'),
 Row(title='South-East Asian theatre of World War II'),
 Row(title='European theatre of World War II'),
 Row(title='Bombing of Vienna in World War II'),
 Row(title='Bombing of Königsberg in World War II'),
 Row(title='Bombing of Warsaw in World War II'),
 Row(title='Bombing of Berlin in World War II'),
 Row(title='Polish resistance movement in World War II'),
 Row(title='Mediterranean and Middle East theatre of World War II'),
 Row(title='Eastern Front (World War II)'),
 Row(title='Western Front (World War II)'),
 Row(title='Strategic bombing during World War II'),
 Row(title='Italian Campaign (World War I

In [44]:
ww2_df.select('death').collect()

[Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''Civilian dead:'''\n* Over 45,000,000\n* '''Total dead:'''\n* Over 61,000,000\n* (1937–1945)\n* [[World War II casualties|...''further details'']]}}"),
 Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''Civilian dead:'''\n* Over 45,000,000\n* '''Total dead:'''\n* Over 61,000,000\n* (1937–1945)\n* [[World War II casualties|...''further details'']]}}"),
 Row(death='+73000000'),
 Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''Civilian dead:'''\n* Over 45,000,000\n* '''Total dead:'''\n* Over 61,000,000\n* (1937–1945)\n* [[World War II casualties|...''further details'']]}}"),
 Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''Civilian dead:'''\n* Over 45,000,000\n* '''Total dead:'''\n* Over 61,000,000\n* (1937–1945)\n* [[World War II casualties|...''further details'']]}}"),
 Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''C

In [45]:
ww2_df.select('id').collect()

[Row(id=64692),
 Row(id=627326),
 Row(id=32927),
 Row(id=8027546),
 Row(id=3498598),
 Row(id=776074),
 Row(id=988219),
 Row(id=998807),
 Row(id=902040),
 Row(id=342640),
 Row(id=7439614),
 Row(id=7468801),
 Row(id=927764),
 Row(id=936661),
 Row(id=4329310),
 Row(id=1779568),
 Row(id=519489),
 Row(id=519516),
 Row(id=730658),
 Row(id=493696),
 Row(id=5352468),
 Row(id=4149594),
 Row(id=54639200),
 Row(id=52997544),
 Row(id=7668163),
 Row(id=7148715),
 Row(id=11413940),
 Row(id=13669170),
 Row(id=8814971),
 Row(id=20537542),
 Row(id=6820802),
 Row(id=54956721),
 Row(id=537817),
 Row(id=22573476),
 Row(id=22619466),
 Row(id=22752221),
 Row(id=22878403),
 Row(id=25536548),
 Row(id=291341),
 Row(id=2728998),
 Row(id=10805870),
 Row(id=33247384),
 Row(id=32732761),
 Row(id=15873865),
 Row(id=2823356),
 Row(id=10262809),
 Row(id=4397117),
 Row(id=30281564),
 Row(id=30319330),
 Row(id=17426585),
 Row(id=36240634),
 Row(id=27495752)]