In [10]:
import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql import *
import os.path
from pyspark.sql.functions import desc

from pyspark.sql import dataframe
from pyspark.sql import functions as F
from pyspark.sql.functions import isnan, when, count, col

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# SparkContext that belongs to the session above.
sc = spark.sparkContext

# NOTE(review): SQLContext is already imported a few lines above, so this import is redundant.
from pyspark.sql import SQLContext
# NOTE(review): sqlContext is not referenced by any other cell visible in this notebook;
# SQLContext is also reported as deprecated in Spark 3.x in favour of SparkSession —
# confirm the target Spark version before relying on it.
sqlContext = SQLContext(sc)

from pyspark.sql.types import *

import wptools

In [2]:
# Directory (relative to the notebook) of the parquet files this notebook reads and writes.
DATA_DIR = '../data/data_processed/intermediate/'

In [3]:
# Load the military-conflict table from its parquet file.
military = spark.read.format("parquet").load(DATA_DIR + "wiki_military_conflict_df.parquet")

In [4]:
# Rows with nothing usable: no location, no end date, and a death value that is
# either NULL or the literal string "none" (compared case-insensitively).
military_null = military.filter(
    (F.col("death").isNull() | (F.lower(F.col("death")) == "none"))
    & F.col("location").isNull()
    & F.col("end_date").isNull()
)
military_null.count()

1815

In [5]:
# Preview the rows selected above (death, end_date and location all missing).
military_null.show()

+-----+--------+--------+--------+
|death|end_date|      id|location|
+-----+--------+--------+--------+
| null|    null| 5421507|    null|
| null|    null|21845109|    null|
| null|    null|   31607|    null|
| null|    null|   91609|    null|
| null|    null| 2174819|    null|
| null|    null| 2138979|    null|
| null|    null|30269008|    null|
| null|    null| 8221689|    null|
| null|    null|23503395|    null|
| null|    null|23584843|    null|
| null|    null|36605624|    null|
| null|    null|30704529|    null|
| null|    null| 9131667|    null|
| null|    null|22515182|    null|
| null|    null|19931370|    null|
| null|    null| 1315408|    null|
| null|    null|  633952|    null|
| null|    null|  830626|    null|
| null|    null|  236843|    null|
| null|    null|  214155|    null|
+-----+--------+--------+--------+
only showing top 20 rows



In [13]:
def _first_available(data, paths):
    """Return the value at the first key path in ``paths`` that resolves in ``data``.

    Each path is a sequence of keys applied one after another. A path is
    skipped when a key is missing or when an intermediate value cannot be
    subscripted (for example ``None``). Returns ``None`` if no path resolves.
    """
    for path in paths:
        node = data
        try:
            for key in path:
                node = node[key]
        except (LookupError, TypeError):
            continue
        return node
    return None


def get_wiki_military_conflict(entity):
    """Look up death toll, end date and location for one Wikipedia page.

    The page is fetched with ``wptools`` using ``entity.id`` as the page id.
    For each field the Wikidata claim is preferred and the infobox entry is
    the fallback:

    * death    -> wikidata 'number of deaths (P1120)' / 'amount', else infobox 'casualties1'
    * end_date -> wikidata 'end time (P582)', else infobox 'date'
    * location -> wikidata 'location (P276)', else infobox 'place'

    The lookup is best effort: if the page cannot be parsed, a row with all
    three fields set to ``None`` is returned, and any field that cannot be
    found is ``None``. Only ``Exception`` subclasses are absorbed, so
    ``KeyboardInterrupt`` and ``SystemExit`` propagate and a long batch of
    lookups can still be interrupted.

    Args:
        entity: object exposing the page id as ``entity.id``.

    Returns:
        Row with the fields ``id``, ``death``, ``end_date`` and ``location``.
    """
    page = wptools.page(pageid=entity.id)

    try:
        page.get_parse()
    except Exception:
        # Without a parsed page there is nothing to extract.
        return Row(id=entity.id, death=None, end_date=None, location=None)

    try:
        page.get_wikidata()
        wikidata_fetched = True
    except Exception:
        # The infobox fallbacks below still apply.
        wikidata_fetched = False

    # The death toll is read from Wikidata only when the fetch itself succeeded;
    # otherwise it comes straight from the infobox.
    death_paths = [('infobox', 'casualties1')]
    if wikidata_fetched:
        death_paths.insert(0, ('wikidata', 'number of deaths (P1120)', 'amount'))

    death = _first_available(page.data, death_paths)
    end_date = _first_available(
        page.data, [('wikidata', 'end time (P582)'), ('infobox', 'date')])
    location = _first_available(
        page.data, [('wikidata', 'location (P276)'), ('infobox', 'place')])

    return Row(id=entity.id, death=death, end_date=end_date, location=location)

In [7]:
# Explicit schema for the fetched rows: every field is a string except the integer page id.
schema = (
    StructType()
    .add('death', StringType())
    .add('end_date', StringType())
    .add('id', IntegerType())
    .add('location', StringType())
)

In [8]:
# Only the page ids of the incomplete rows are needed for the lookups.
df = military_null.select(col("id"))

In [14]:
%%capture
wiki_military_conflict_rows = [get_wiki_military_conflict(i) for i in df.collect()];

In [15]:
# Turn the fetched rows into a DataFrame with the explicit schema, then inspect it.
wiki_military_attack_df = spark.createDataFrame(
    data=wiki_military_conflict_rows,
    schema=schema,
)

wiki_military_attack_df.printSchema()
wiki_military_attack_df.show()

root
 |-- death: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- location: string (nullable = true)

+-----+--------+--------+--------+
|death|end_date|      id|location|
+-----+--------+--------+--------+
| null|    null| 5421507|    null|
| null|    null|21845109|    null|
| null|    null|   31607|    null|
| null|    null|   91609|    null|
| null|    null| 2174819|    null|
| null|    null| 2138979|    null|
| null|    null|30269008|    null|
| null|    null| 8221689|    null|
| null|    null|23503395|    null|
| null|    null|23584843|    null|
| null|    null|36605624|    null|
| null|    null|30704529|    null|
| null|    null| 9131667|    null|
| null|    null|22515182|    null|
| null|    null|19931370|    null|
| null|    null| 1315408|    null|
| null|    null|  633952|    null|
| null|    null|  830626|    null|
| null|    null|  236843|    null|
| null|    null|  214155|    null|
+-----+--------+--------+--------+
only showing top 20 rows

In [16]:
# Save the fetched rows as a parquet file for later use, replacing any previous copy.
wiki_military_attack_df.write.parquet(
    DATA_DIR + "wiki_military_conflict_null_df.parquet",
    mode='overwrite',
)

In [17]:
# Rows that are still empty after the lookup: death NULL or "none", and no location or end date.
military_null_new = wiki_military_attack_df.filter(
    (F.col("death").isNull() | (F.lower(F.col("death")) == "none"))
    & F.col("location").isNull()
    & F.col("end_date").isNull()
)
military_null_new.count()

42

In [19]:
# Complement of the "still empty" filter: rows for which the lookup recovered something.
military_non_null_new = wiki_military_attack_df.filter(
    ~(
        (F.col("death").isNull() | (F.lower(F.col("death")) == "none"))
        & F.col("location").isNull()
        & F.col("end_date").isNull()
    )
)
military_non_null_new.show()

+--------------------+--------------------+--------+--------------------+
|               death|            end_date|      id|            location|
+--------------------+--------------------+--------+--------------------+
|                null|+1154-00-00T00:00...|  100442|[Normandy (Q15878...|
|'''U.S.''': 4,538...|25 November&nbsp;...|57869448|         North Korea|
|               3,770|+1813-09-08T00:00...|13704630|San Sebastián (Q1...|
|4 killed, 10–16 w...|      7 October 2007|13709567|Zhani-Vedeno, [[C...|
|6 killed<br>27 wo...|+1882-07-13T00:00...|13716154|    Alexandria (Q87)|
|                null|       25 April 1643|13721815|  Dartmoor (Q214823)|
|         ~330 killed|+1996-07-25T00:00...|13726970|Mullaitivu (Q507144)|
|88 killed<br> 100...|    22–23 April 2000|13727911|Elephant Pass (Q1...|
|700 killed<br>1,5...|+1997-06-25T00:00...|13729242|    Sri Lanka (Q854)|
|                null|+1966-02-23T00:00...|13731602|        Syria (Q858)|
|70 killed<br>175 ...|+2006-12-09T00:0

In [20]:
# Number of rows that are no longer missing all three of death, location and end_date.
military_non_null_new.count()

1773

In [22]:
# Check that everything was saved: reload the parquet file and recount the non-empty rows.
check_military = spark.read.format("parquet").load(DATA_DIR + "wiki_military_conflict_null_df.parquet")
check_military.filter(
    ~(
        (F.col("death").isNull() | (F.lower(F.col("death")) == "none"))
        & F.col("location").isNull()
        & F.col("end_date").isNull()
    )
).count()

1773

OK: the count read back from the saved parquet file (1773) matches `military_non_null_new.count()`.