In [1]:
from os import environ
environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell' 

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as md
import re
import time

import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql.types import *
from pyspark.sql import *
import hashlib
import os.path
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import desc
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import udf
from datetime import timedelta, date
from mwviews.api import PageviewsClient


from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

leggiamo i dati

In [3]:
WIKIPEDIA_XML_DUMP = '10million.xml'

wikipedia = spark.read.format('com.databricks.spark.xml')\
   .options(rowTag='page',mode='PERMISSIVE',charset='UTF-8').load(WIKIPEDIA_XML_DUMP)

In [4]:
wikipedia.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- redirect: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _title: string (nullable = true)
 |-- restrictions: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _deleted: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _space: string (nullable = true)
 |    |-- 

In [5]:
ref_regex = re.compile(r'<ref[^>]*[^\/]>|<ref[ ]*>');

selezioniamo solo gli **articoli** con del testo

In [6]:
articles = wikipedia.filter("ns = '0'") \
    .filter("redirect._title is null") \
    .filter("revision.text._VALUE is not null") \
    .filter("length(revision.text._VALUE) > 0");
    
articles = articles.drop('redirect');

## Lunghezza del testo

a questi articoli aggiungiamo la **lunghezza del testo**

In [None]:
articles = articles.withColumn('article_lenght', F.length(wikipedia.revision.text._VALUE));
articles.show(10);

In [None]:
articles.count()

alcuni di questi articoli sono molto brevi, alcuni saranno delle disambiguation pages.. gli altri giusto per curiosità vediamo quanti ce ne sono:

In [None]:
articles.filter("article_lenght < 2000").filter("lower(revision.text._VALUE) not like '%{disambiguation}%'").count()

## Categorie

mettiamoci pure le **categorie** prese dagli **infobox**: domanda, come si comporta se un articolo ha più di una infobox?

In [None]:
regex = r"(?<={{infobox ).[a-zA-Z0-9.-_/ ]*"; #o con \\n nel caso andasse a capo XD
ibox_regex = re.compile(regex,re.IGNORECASE);

In [None]:
def extractCategory(text):
    res = ibox_regex.findall(text)
    return ', '.join(res);

In [None]:
category_udf = udf(lambda text: extractCategory(text), StringType())

In [None]:
articles = articles.withColumn("categories", category_udf(articles.revision.text._VALUE));
articles.filter('length(categories) > 0').show(10);

In [None]:
articles.filter('length(categories) > 0').count()

Selezioniamo quelli con l'infobox inerente alla guerra

In [None]:
goodCategories = ['civil conflicts','military conflicts','civilian attack']; # civilian attack?

regex = r"("+'|'.join(goodCategories)+")(,|$)"; #military operation?
categorySelect_regex = re.compile(regex,re.IGNORECASE);

In [None]:
def goodCategory(text):
    res = categorySelect_regex.findall(text);
    if not res:
        return False;
    return True;

In [None]:
good_category_udf = udf(lambda text: goodCategory(text), BooleanType())

In [None]:
conflict_articles = articles.withColumn("good_categories", good_category_udf(articles.categories))\
                    .filter('good_categories == true');

In [None]:
conflict_articles.count()

In [None]:
# saving binary file to future uses
conflict_articles.write.mode('overwrite').parquet("selectedConflict.parquet");

In [7]:
# loading the saved parquet files
conflict_articles = spark.read.parquet("selectedConflict.parquet");

In [8]:
conflict_articles.show(12)

+-----+---+------------+--------------------+--------------------+--------------+--------------------+---------------+
|   id| ns|restrictions|            revision|               title|article_lenght|          categories|good_categories|
+-----+---+------------+--------------------+--------------------+--------------+--------------------+---------------+
|22467|  0|        null|[[Added more back...|Oklahoma City bom...|        162443|     civilian attack|           true|
|68292|  0|        null|[[ce,], [25359749...|     My Lai Massacre|        113722|     civilian attack|           true|
|30785|  0|        null|[[/* top */ WP:CL...|     Tulsa race riot|         76865|     civilian attack|           true|
|52268|  0|        null|[[/* The attack *...|Sabra and Shatila...|         75210|     civilian attack|           true|
| 5030|  0|        null|[[/* Saville Inqu...|Bloody Sunday (1972)|         82169|     civilian attack|           true|
|65626|  0|        null|[[Undid revision ...|   

get page titles

In [39]:
conflict_array = [(i.title.encode("ascii",'replace')) for i in conflict_articles.collect()];
conflict_array

['Oklahoma City bombing',
 'My Lai Massacre',
 'Tulsa race riot',
 'Sabra and Shatila massacre',
 'Bloody Sunday (1972)',
 'Peterloo Massacre',
 '?cole Polytechnique massacre',
 'Dunblane massacre',
 'Erfurt school massacre',
 'Phoolan Devi',
 "Saint Valentine's Day Massacre",
 'Oradour-sur-Glane massacre']

## Pageview

In [40]:
startingLogDate = '20150701';
today = time.strftime('%Y%m%d');

p = PageviewsClient(user_agent="<ada@epfl.ch> Applied data analysis project")

p.article_views('en.wikipedia', conflict_array, granularity='monthly', start=startingLogDate, end=today, agent='user')

defaultdict(dict,
            {datetime.datetime(2015, 7, 1, 0, 0): {'?cole_Polytechnique_massacre': None,
              'Bloody_Sunday_(1972)': 50566,
              'Dunblane_massacre': 2955,
              'Erfurt_school_massacre': 6,
              'My_Lai_Massacre': 47449,
              'Oklahoma_City_bombing': 82146,
              'Oradour-sur-Glane_massacre': 304,
              'Peterloo_Massacre': 8861,
              'Phoolan_Devi': 39268,
              'Sabra_and_Shatila_massacre': 13034,
              "Saint_Valentine's_Day_Massacre": 42913,
              'Tulsa_race_riot': 27399},
             datetime.datetime(2015, 8, 1, 0, 0): {'?cole_Polytechnique_massacre': None,
              'Bloody_Sunday_(1972)': 42093,
              'Dunblane_massacre': 4417,
              'Erfurt_school_massacre': 5,
              'My_Lai_Massacre': 46340,
              'Oklahoma_City_bombing': 83186,
              'Oradour-sur-Glane_massacre': 296,
              'Peterloo_Massacre': 25512,
         

## External links

In [None]:
# todo

## Referenze

In [None]:
# todo

## To Pandas

In [None]:
pdArticles = articles.select('id','title','article_lenght','categories').toPandas().set_index('id');

In [None]:
pdArticles.head(50)

yo