In [1]:
import wptools
import pyspark
import pyspark.sql
from pyspark.sql import *
import os.path
from pyspark.sql.functions import desc

import findspark
findspark.init()

from pyspark.sql import dataframe
from pyspark.sql import functions as F

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Loading data 

**File schema** is shown.

In [86]:
DATA_DIR = '../' 
WIKIPEDIA_CONFLICTS_PARQUET = DATA_DIR + 'selectedAllConflict.parquet'

# loading the saved parquet files
wikipedia = spark.read.parquet(WIKIPEDIA_CONFLICTS_PARQUET)
# show file schema
wikipedia.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- restrictions: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _deleted: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _deleted: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _xml:space: string (nullable = true)
 |    |-- timestamp: string (nulla

---
# Quantifying *importance* of each page in each category

We want to see how *important* each page is in each category. As we are solely focusing on *'war'*-related subjects in this pilot phase, we define *page importance* by the number of deaths. 

## Infobox or Wikidata per category 

Functions `get_wiki_civilian_attack`, `get_wiki_civil_conflict`, `get_wiki_military_conflict`
to get relevant information and views for each category: `civilian attack`, `civil conflict`, `military conflict`. 
Data is obtained either using the page's wikidata is the data exists or acquired through infobox parsing.
Relevant information is chosen based on the fields found on [List of infoboxes and fields](https://en.wikipedia.org/wiki/Wikipedia:List_of_infoboxes#Event) 

Extract info for each category in **Infobox**:
* `civilian attack`
    * location
    * date 
    * fatalities
* `civil conflict`
    * place
    * date
    * casualities1
* `military conflict`
    * place
    * date 
    * casualities1
    
Extract info for each category in **Wikidata**:
* `civilian attack`
    * location
    * date 
    * fatalities
* `civil conflict`
    * place
    * date
    * casualities1
* `military conflict`
    * 'number of deaths (P1120)']['amount']
    * 'end time (P582)'
    * 'location (P276)'
    
We use an external library [wptools wiki](https://github.com/siznax/wptools/wiki) to help us parse the data. 

In [87]:
def get_wiki_military_conflict(entity):
    page = wptools.page(entity.title)
    # extract relevant information and put in dictionary
    info = {'death': None, 'end_date': None, 'location': None}
    
    try: 
        page.get_wikidata()
        info['death'] = page.data['wikidata']['number of deaths (P1120)']['amount']
    
    except KeyError:
        try:
            page.get_parse()
            info['death'] = poly_page.data['infobox']['casualties1']
        except KeyError:
            info['death'] = None
            
    try:
        page.get_wikidata()
        info['end_date'] = page.data['wikidata']['end time (P582)']
    except KeyError:
        try:
            page.get_parse()
            info['end_date'] = poly_page.data['infobox']['date']
        except KeyError:
            info['end_date'] = None
    
    try:
        page.get_wikidata()
        info['location'] = page.data['wikidata']['location (P276)']
    except KeyError:
        try:
            page.get_parse()
            info['location'] = poly_page.data['infobox']['place']
        except KeyError:
            info['location'] = None
    
    
    return Row(id=entity.id, title=entity.title, death=info['death'],
               end_date=info['end_date'], location=info['location'])

### Infobox `military conflict `

In [88]:
infobox_military_conflict = 'military conflict'
# find all pages that have category military conflict
wiki_military_conflict = wikipedia.where("categories like '%{}%'".format(infobox_military_conflict)) 

In [89]:
wiki_military_conflict_df = sqlContext.createDataFrame(wiki_military_conflict.rdd.map(get_wiki_military_conflict))

In [94]:
wiki_military_conflict_df.take(3)

[Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''Civilian dead:'''\n* Over 45,000,000\n* '''Total dead:'''\n* Over 61,000,000\n* (1937–1945)\n* [[World War II casualties|...''further details'']]}}", end_date='{{ubl|start and end dates|1939|9|1|1945|9|2|df|=|yes|\n                  |(|Age in years and days|1 September 1939|2 September 1945|sep|=|and|)|efn| While [[World War II#Chronology|various other dates]] have been proposed as the date on which World War II began or ended, this is the time span most frequently cited.}} {{start and end dates|1939|9|1|1945|9|2|df|=|yes}} {{Age in years and days|1 September 1939|2 September 1945|sep|=|and}} ) {{efn| While [[World War II#Chronology|various other dates]] have been proposed as the date on which World War II began or ended, this is the time span most frequently cited.}}', id=655845, location='Athens (Q3292481)', title='Battle of Athens (1946)'),
 Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n*

In [None]:
# saving binary file to future uses
wiki_military_conflict_df.write.parquet(DATA_DIR +"{}.parquet".format(infobox_military_conflict));
# loading the saved parquet files
wiki_military_conflict_df_reload = spark.read.parquet(
    DATA_DIR_FILTERED+"{}.parquet".format(infobox_military_conflict));

In [None]:
def get_wiki_civilian_attack(entity):
    page = wptools.page(entity.title)
    # extract relevant information and put in dictionary
    info = {'death': None, 'end_date': None, 'location': None}
    
    try: 
        page.get_wikidata()
        info['death'] = page.data['wikidata']['number of deaths (P1120)']['amount']
    
    except KeyError:
        try:
            page.get_parse()
            info['death'] = poly_page.data['infobox']['casualties1']
        except KeyError:
            info['death'] = None
            
    try:
        page.get_wikidata()
        info['end_date'] = page.data['wikidata']['end time (P582)']
    except KeyError:
        try:
            page.get_parse()
            info['end_date'] = poly_page.data['infobox']['date']
        except KeyError:
            info['end_date'] = None
    
    try:
        page.get_wikidata()
        info['location'] = page.data['wikidata']['location (P276)']
    except KeyError:
        try:
            page.get_parse()
            info['location'] = poly_page.data['infobox']['location']
        except KeyError:
            info['location'] = None
    
    
    return Row(id=entity.id, title=entity.title, death=info['death'],
               end_date=info['end_date'], location=info['location'])

# ===== TO DELETE ========

### Infobox `civilian attack`

In [None]:
infobox_civil_attack = 'civilian attack'
# find all pages that have category civilian attack
wiki_civil_attack = wikipedia.where("categories like '%{}%'".format(infobox_civil_attack)) 
# show file schema
wiki_civil_attack.printSchema()

In [None]:
wiki_civil_attack.filter("title like '%Fraunces Tavern%'").select("categories").collect()

In [None]:
wiki_civil_attack_df = sqlContext.createDataFrame(wiki_civil_attack.rdd.map(get_infobox_civilian_attack))

In [None]:
wiki_civil_attack_df.take(3)

In [None]:
# GET INFO FROM INFOBOX + VIEWS for different categories
def get_infobox_civilian_attack(entity):
    # get page
    page = wptools.page(entity.title)
    page.get_parse()
    page.get_more()
    # extract relevant information and put in dictionary
    info = {'views': None, 'location': None, 
            'date': None, 'fatalities': None, 'injuries': None }
    try:
        info['views'] = page.data['views']
    except KeyError:
        info['views'] = None
    
    for ele in list(info.keys())[1:]:
        try:
            info[ele] = page.data['infobox'][ele]
        except KeyError:
            pass
    
    return Row(id=entity.id, title=entity.title, location=info['location'], views=info['views'], date=info['date'], 
                fatalities=info['fatalities'], injuries=info['injuries'])

def get_infobox_civil_conflict(entity):
    # get page
    page = wptools.page(entity.title)
    page.get_parse()
    page.get_more()
    # extract relevant information and put in dictionary
    info = {'views': None, 'place': None, 'injuries': None,
            'date': None, 'fatalities': None, 'casualties1': None, 'casualties2': None,
            'leadfigures1': None, 'leadfigures2': None} 
    try:
        info['views'] = page.data['views']
    except KeyError:
        info['views'] = None
    
    for ele in list(info.keys())[1:]:
        try:
            info[ele] = page.data['infobox'][ele]
        except KeyError:
            pass
    
    return Row(id=entity.id, title=entity.title, location=info['place'], views=info['views'], date=info['date'], 
               fatalities=info['fatalities'], casualties1=info['casualties1'], casualties2=info['casualties2'],
               injuries=info['injuries'], leadfigures1=info['leadfigures1'], leadfigures2=info['leadfigures2'])

def get_infobox_military_conflict(entity):
    # get page
    page = wptools.page(entity.title)
    page.get_parse()
    page.get_more()
    # extract relevant information and put in dictionary
    info = {'views': None, 'place': None, 
            'date': None, 'casualties1': None, 'casualties2': None}#, 'status': None}
            #'combatant1': None, 'combatant2': None, 'status': None} 
    try:
        info['views'] = page.data['views']
    except KeyError:
        info['views'] = None
    
    for ele in list(info.keys())[1:]:
        try:
            info[ele] = page.data['infobox'][ele]
        except KeyError:
            pass
    
    return Row(id=entity.id, title=entity.title, location=info['place'], views=info['views'], date=info['date'], 
               casualties1=info['casualties1'], casualties2=info['casualties2']) #, status=info['status'])
               #combatant1=info['combatant1'], combatant2=info['combatant2'])

In [None]:
DATA_DIR_FILTERED = '../clean_data/'

In [None]:
# saving binary file to future uses
wiki_civil_attack_df.write.parquet(DATA_DIR_FILTERED+"{}.parquet".format(infobox_civil_attack));
# loading the saved parquet files
wiki_civil_attack_df_reload = spark.read.parquet(DATA_DIR_FILTERED+"{}.parquet".format(infobox_civil_attack));

In [None]:
wiki_civil_attack_df = sqlContext.createDataFrame(wiki_civil_attack_RDD)

### Infobox `civil conflict`

In [None]:
infobox_civil_conflict = 'civil conflict'
# find all pages that have category civil conflict
wiki_civil_conflict = wikipedia.where("categories like '%{}%'".format(infobox_civil_conflict)) 
# show file schema
wiki_civil_conflict.printSchema()

In [None]:
wiki_civil_conflict_df = sqlContext.createDataFrame(wiki_civil_conflict.rdd.map(get_infobox_civil_conflict))

In [None]:
wiki_civil_conflict_df.take(3)

In [None]:
# saving binary file to future uses
wiki_civil_conflict_df.write.parquet(DATA_DIR_FILTERED+"{}.parquet".format(infobox_civil_conflict));
# loading the saved parquet files
wiki_civil_conflict_df_reload = spark.read.parquet(DATA_DIR_FILTERED+"{}.parquet".format(infobox_civil_conflict));

### Infobox `military conflict `

In [None]:
infobox_military_conflict = 'military conflict'
# find all pages that have category military conflict
wiki_military_conflict = wikipedia.where("categories like '%{}%'".format(infobox_military_conflict)) 
# show file schema
wiki_military_conflict.printSchema()

In [None]:
wiki_military_conflict_df = sqlContext.createDataFrame(wiki_military_conflict.rdd.map(get_infobox_military_conflict))

In [None]:
wiki_military_conflict_df.take(3)

In [None]:
# saving binary file to future uses
wiki_military_conflict_df.write.parquet(DATA_DIR_FILTERED+"{}.parquet".format(infobox_military_conflict));
# loading the saved parquet files
wiki_military_conflict_df_reload = spark.read.parquet(
    DATA_DIR_FILTERED+"{}.parquet".format(infobox_military_conflict));

### Functions

In [79]:
def get_wiki_military_conflict(entity):
    page = wptools.page(entity.title)
    # extract relevant information and put in dictionary
    info = {'death': None, 'end_date': None, 'location': None}
    
    try: 
        page.get_wikidata()
        info['death'] = page.data['wikidata']['number of deaths (P1120)']['amount']
    
    except KeyError:
        try:
            page.get_parse()
            info['death'] = poly_page.data['infobox']['casualties1']
        except KeyError:
            info['death'] = None
            
    try:
        page.get_wikidata()
        info['end_date'] = page.data['wikidata']['end time (P582)']
    except KeyError:
        try:
            page.get_parse()
            info['end_date'] = poly_page.data['infobox']['date']
        except KeyError:
            info['end_date'] = None
    
    try:
        page.get_wikidata()
        info['location'] = page.data['wikidata']['location (P276)']
    except KeyError:
        try:
            page.get_parse()
            info['location'] = poly_page.data['infobox']['place']
        except KeyError:
            info['location'] = None
    
    
    return Row(id=entity.id, title=entity.title, death=info['death'],
               end_date=info['end_date'], location=info['location']) #, location=info['location'])    


### UNICORN ON THE GOOO

In [91]:
## trying access to infobox
poly_page = wptools.page('1993 World Trade Center bombing')
poly_page.get_parse()
poly_page.get_wikidata()

en.wikipedia.org (parse) 1993 World Trade Center bombing
en.wikipedia.org (imageinfo) File:WTC 1993 ATF Commons.jpg
1993 World Trade Center bombing (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:WTC 1993 ...
  infobox: <dict(15)> title, partof, image, caption, location, coo...
  iwlinks: <list(1)> https://commons.wikimedia.org/wiki/Category:1...
  pageid: 46815127
  parsetree: <str(50244)> <root><template><title>about</title><par...
  requests: <list(2)> parse, imageinfo
  title: 1993 World Trade Center bombing
  wikibase: Q11240
  wikidata_url: https://www.wikidata.org/wiki/Q11240
  wikitext: <str(40355)> {{about|the 1993 World Trade Center bombi...
}
www.wikidata.org (wikidata) Q11240
www.wikidata.org (labels) Q51762078|P1120|P276|P373|P625|P31|Q816...
en.wikipedia.org (imageinfo) File:WTC 1993 ATF Commons.jpg
1993 World Trade Center bombing (en) data
{
  claims: <dict(11)> P373, P31, P585, P625, P646, P18, P17, P1120,...
  description: <str(84)> truck bomb deto

<wptools.page.WPToolsPage at 0x11d45e470>

In [92]:
poly_page.data['infobox']

{'title': '1993 World Trade Center bombing',
 'partof': '[[terrorism in the United States]]',
 'image': 'WTC 1993 ATF Commons.jpg',
 'caption': 'Underground damage after the bombing',
 'location': '[[World Trade Center (1973–2001)|World Trade Center]]<br />[[New York City]], [[New York (state)|New York]], U.S.',
 'coordinates': '{{coord|40.711452|N|74.011919|W|region:US-NY_type:event|display|=|title,inline}}',
 'target': '[[World Trade Center (1973-2001)|World Trade Center]]',
 'date': '{{start date and age|1993|02|26}}',
 'time': '12:17:37 p.m.',
 'timezone': '[[UTC-05:00]]',
 'type': '[[Car bomb|Truck bombing]], [[mass murder]]',
 'fatalities': '6',
 'injuries': '1,042',
 'perpetrators': '[[Ramzi Yousef]], [[Eyad Ismoil]], and co-conspirators',
 'motive': '[[Islamic fundamentalism]] <br />[[American foreign policy]]<br />[[Israel–United States military relations#Military aid and procurement|U.S. support for Israel]]'}

In [93]:
poly_page.data['wikidata']

{'Commons category (P373)': '1993 World Trade Center bombing',
 'instance of (P31)': 'attack (Q81672)',
 'point in time (P585)': '+1993-02-26T00:00:00Z',
 'coordinate location (P625)': {'latitude': 40.71166667,
  'longitude': -74.01361111,
  'altitude': None,
  'precision': 0.00027777777777778,
  'globe': 'http://www.wikidata.org/entity/Q2'},
 'Freebase ID (P646)': '/m/08gcw',
 'image (P18)': 'WTC 1993 ATF Commons.jpg',
 'country (P17)': 'United States of America (Q30)',
 'number of deaths (P1120)': {'amount': '+6', 'unit': '1'},
 'number of injured (P1339)': {'amount': '+1042', 'unit': '1'},
 'location (P276)': 'World Trade Center (Q11235)',
 "topic's main category (P910)": 'Category:1993 World Trade Center bombing (Q51762078)'}

In [None]:
ww2 = wikipedia.filter("title like '%1993 World Trade Center bombing%'") #World War II for military, civilian attack 1993 World Trade Center bombing
ww2_df = sqlContext.createDataFrame(ww2.rdd.map(get_wiki_military_conflict))

In [81]:
ww2_df.show(2)

+--------------------+--------------------+------+---------------+--------------------+
|               death|            end_date|    id|       location|               title|
+--------------------+--------------------+------+---------------+--------------------+
|{{plainlist|
* ''...|+1945-02-15T00:00...| 64692|Dresden (Q1731)|Bombing of Dresde...|
|{{plainlist|
* ''...|+1941-06-01T00:00...|627326| Albania (Q222)|Balkans Campaign ...|
+--------------------+--------------------+------+---------------+--------------------+
only showing top 2 rows



In [82]:
ww2_df.select('location').collect()

[Row(location='Dresden (Q1731)'),
 Row(location='Albania (Q222)'),
 Row(location='[Russia (Q159), Europe (Q46), Africa (Q15), Pacific Ocean (Q98), Mediterranean Sea (Q4918), Asia (Q48), Atlantic Ocean (Q97), Middle East (Q7204), Southeast Asia (Q11708), Scandinavia (Q21195)]'),
 Row(location='Mediterranean Sea (Q4918)'),
 Row(location='Lienz (Q336250)'),
 Row(location='Malta Colony (Q6744657)'),
 Row(location='East Africa (Q27407)'),
 Row(location='[Norwegian Sea (Q47545), Arctic Ocean (Q788)]'),
 Row(location='State of Burma (Q704358)'),
 Row(location='Europe (Q46)'),
 Row(location='Vienna (Q1741)'),
 Row(location='Königsberg (Q4120832)'),
 Row(location='Warsaw (Q270)'),
 Row(location='Berlin (Q64)'),
 Row(location='[[European theatre of World War II|Europe]], [[Pacific War|Pacific]], [[Battle of the Atlantic|Atlantic]], [[South-East Asian theatre of World War II|South-East Asia]], [[Second Sino-Japanese War|China]], [[Mediterranean and Middle East theatre of World War II|Middle East]

In [83]:
ww2_df.select('end_date').collect()

[Row(end_date='+1945-02-15T00:00:00Z'),
 Row(end_date='+1941-06-01T00:00:00Z'),
 Row(end_date='+1945-09-02T00:00:00Z'),
 Row(end_date='{{ubl|start and end dates|1939|9|1|1945|9|2|df|=|yes|\n                  |(|Age in years and days|1 September 1939|2 September 1945|sep|=|and|)|efn| While [[World War II#Chronology|various other dates]] have been proposed as the date on which World War II began or ended, this is the time span most frequently cited.}} {{start and end dates|1939|9|1|1945|9|2|df|=|yes}} {{Age in years and days|1 September 1939|2 September 1945|sep|=|and}} ) {{efn| While [[World War II#Chronology|various other dates]] have been proposed as the date on which World War II began or ended, this is the time span most frequently cited.}}'),
 Row(end_date='{{ubl|start and end dates|1939|9|1|1945|9|2|df|=|yes|\n                  |(|Age in years and days|1 September 1939|2 September 1945|sep|=|and|)|efn| While [[World War II#Chronology|various other dates]] have been proposed as the

In [43]:
ww2_df.select('title').collect()

[Row(title='Bombing of Dresden in World War II'),
 Row(title='Balkans Campaign (World War II)'),
 Row(title='World War II'),
 Row(title='Mediterranean U-boat Campaign (World War II)'),
 Row(title='Repatriation of Cossacks after World War II'),
 Row(title='Siege of Malta (World War II)'),
 Row(title='East African Campaign (World War II)'),
 Row(title='Arctic convoys of World War II'),
 Row(title='South-East Asian theatre of World War II'),
 Row(title='European theatre of World War II'),
 Row(title='Bombing of Vienna in World War II'),
 Row(title='Bombing of Königsberg in World War II'),
 Row(title='Bombing of Warsaw in World War II'),
 Row(title='Bombing of Berlin in World War II'),
 Row(title='Polish resistance movement in World War II'),
 Row(title='Mediterranean and Middle East theatre of World War II'),
 Row(title='Eastern Front (World War II)'),
 Row(title='Western Front (World War II)'),
 Row(title='Strategic bombing during World War II'),
 Row(title='Italian Campaign (World War I

In [44]:
ww2_df.select('death').collect()

[Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''Civilian dead:'''\n* Over 45,000,000\n* '''Total dead:'''\n* Over 61,000,000\n* (1937–1945)\n* [[World War II casualties|...''further details'']]}}"),
 Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''Civilian dead:'''\n* Over 45,000,000\n* '''Total dead:'''\n* Over 61,000,000\n* (1937–1945)\n* [[World War II casualties|...''further details'']]}}"),
 Row(death='+73000000'),
 Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''Civilian dead:'''\n* Over 45,000,000\n* '''Total dead:'''\n* Over 61,000,000\n* (1937–1945)\n* [[World War II casualties|...''further details'']]}}"),
 Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''Civilian dead:'''\n* Over 45,000,000\n* '''Total dead:'''\n* Over 61,000,000\n* (1937–1945)\n* [[World War II casualties|...''further details'']]}}"),
 Row(death="{{plainlist|\n* '''Military dead:'''\n* Over 16,000,000\n* '''C

In [45]:
ww2_df.select('id').collect()

[Row(id=64692),
 Row(id=627326),
 Row(id=32927),
 Row(id=8027546),
 Row(id=3498598),
 Row(id=776074),
 Row(id=988219),
 Row(id=998807),
 Row(id=902040),
 Row(id=342640),
 Row(id=7439614),
 Row(id=7468801),
 Row(id=927764),
 Row(id=936661),
 Row(id=4329310),
 Row(id=1779568),
 Row(id=519489),
 Row(id=519516),
 Row(id=730658),
 Row(id=493696),
 Row(id=5352468),
 Row(id=4149594),
 Row(id=54639200),
 Row(id=52997544),
 Row(id=7668163),
 Row(id=7148715),
 Row(id=11413940),
 Row(id=13669170),
 Row(id=8814971),
 Row(id=20537542),
 Row(id=6820802),
 Row(id=54956721),
 Row(id=537817),
 Row(id=22573476),
 Row(id=22619466),
 Row(id=22752221),
 Row(id=22878403),
 Row(id=25536548),
 Row(id=291341),
 Row(id=2728998),
 Row(id=10805870),
 Row(id=33247384),
 Row(id=32732761),
 Row(id=15873865),
 Row(id=2823356),
 Row(id=10262809),
 Row(id=4397117),
 Row(id=30281564),
 Row(id=30319330),
 Row(id=17426585),
 Row(id=36240634),
 Row(id=27495752)]