In [1]:
from os import environ
environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell' 

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as md
import re
import time

import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql.types import *
from pyspark.sql import *
import hashlib
import os.path
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import desc
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import udf
from pyspark.sql.functions import countDistinct
from datetime import timedelta, date
from mwviews.api import PageviewsClient


from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

leggiamo i dati

In [8]:
WIKIPEDIA_XML_DUMP = 'million.xml'

wikipedia = spark.read.format('com.databricks.spark.xml')\
   .options(rowTag='page',mode='PERMISSIVE',charset='UTF-8').load(WIKIPEDIA_XML_DUMP)

In [9]:
wikipedia.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- redirect: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _title: string (nullable = true)
 |-- restrictions: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _space: string (nullable = true)
 |    |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)



selezioniamo solo gli **articoli** con del testo

In [10]:
articles = wikipedia.filter("ns = '0'") \
    .filter("redirect._title is null") \
    .filter("revision.text._VALUE is not null") \
    .filter("length(revision.text._VALUE) > 0");
    
articles = articles.drop('redirect');

## Lunghezza del testo

a questi articoli aggiungiamo la **lunghezza del testo**

In [11]:
articles = articles.withColumn('article_lenght', F.length(wikipedia.revision.text._VALUE));

In [None]:
articles.count()

In [None]:
articles.show(10);

togliamo i disambuigui

In [12]:
articles = articles.filter("lower(revision.text._VALUE) not like '%{disambiguation}%'")

In [13]:
articles.count()

3258

## Categorie

mettiamoci pure le **categorie** prese dagli **infobox**: domanda, come si comporta se un articolo ha più di una infobox?

In [None]:
regex = r"(?<={{infobox ).[a-zA-Z0-9.-_/ ]*"; #o con \\n nel caso andasse a capo XD
ibox_regex = re.compile(regex,re.IGNORECASE);

In [None]:
def extractCategory(text):
    res = ibox_regex.findall(text)
    return ', '.join(res);

In [None]:
category_udf = udf(lambda text: extractCategory(text), StringType())

In [None]:
articles = articles.withColumn("categories", category_udf(articles.revision.text._VALUE));

In [None]:
articles.filter('length(categories) > 0').show(10);

In [None]:
articles.filter('length(categories) > 0').count()

Selezioniamo quelli con l'infobox inerente alla guerra

In [None]:
goodCategories = ['civil conflict','military conflict','civilian attack']; # civilian attack?

regex = r"("+'|'.join(goodCategories)+")(,|$)"; #military operation?
categorySelect_regex = re.compile(regex,re.IGNORECASE);

In [None]:
def goodCategory(text):
    res = categorySelect_regex.findall(text);
    if not res:
        return False;
    return True;

In [None]:
good_category_udf = udf(lambda text: goodCategory(text), BooleanType())

In [None]:
conflict_articles = articles.withColumn("good_categories", good_category_udf(articles.categories))\
                    .filter('good_categories == true');

In [None]:
conflict_articles.count()

In [None]:
# saving binary file to future uses
conflict_articles.write.mode('overwrite').parquet("selectedConflict.parquet");

In [3]:
# loading the saved parquet files
conflict_articles = spark.read.parquet("selectedConflict.parquet");

In [4]:
conflict_articles.show(30)

+----+---+------------+--------------------+--------------------+--------------+-----------------+---------------+
|  id| ns|restrictions|            revision|               title|article_lenght|       categories|good_categories|
+----+---+------------+--------------------+--------------------+--------------+-----------------+---------------+
|4005|  0|        null|[/* External link...| Battle of Pharsalus|         22469|military conflict|           true|
|4049|  0|        null|[, [, 91.10.58.17...|  Battle of Blenheim|         66889|military conflict|           true|
|4050|  0|        null|[, [487310,, Kint...| Battle of Ramillies|         55678|military conflict|           true|
|4160|  0|        null|[Robot - Removing...|Battle of Lostwit...|          5670|military conflict|           true|
|4283|  0|        null|[/* In popular cu...|   Battle of Peleliu|         45153|military conflict|           true|
|4284|  0|        null|[Undid revision 8...|Battle of Stalingrad|        127960|

get page titles

In [None]:
# questo fa casini per l'encoding
conflict_array = [(i.title.encode("ascii",'replace')) for i in conflict_articles.collect()];
conflict_array

## Pageview

In [None]:
# Non funziona per via dell'encoding, da studiarci o da usare unicorn?
startingLogDate = '20150701';
today = time.strftime('%Y%m%d');

p = PageviewsClient(user_agent="<ada@epfl.ch> Applied data analysis project")

#p.article_views('en.wikipedia', conflict_array, granularity='monthly', start=startingLogDate, end=today, agent='user')

## External links

visto che ci sono problemi con l'encoding faccio in un altro modo:

In [5]:
regex = r"\[\[(.*?)\]\]";
link_regex = re.compile(regex,re.IGNORECASE);

In [6]:
external_links = [];
def extr_link(text):
    global external_links
    external_links = external_links + link_regex.findall(text);

In [14]:
for i in articles.select("revision.text._VALUE").collect():
    extr_link(i[0])

In [15]:
external_links_rdd = spark.createDataFrame(external_links, StringType()).selectExpr("value as title");

In [16]:
external_links_rdd.count()

732106

In [17]:
group_links = external_links_rdd.groupBy("title").agg(countDistinct("title")).select("title",F.col("count(DISTINCT title)").alias("external_links"))

In [18]:
group_links.show()

+--------------------+--------------+
|               title|external_links|
+--------------------+--------------+
|Libertarian socia...|             1|
|  Diogenes of Sinope|             1|
|The Kingdom of Go...|             1|
|       Louise Michel|             1|
|       Silvio Gesell|             1|
| Dot (diacritic)|Ạ ạ|             1|
|Birmingham, Alaba...|             1|
|        La Louisiane|             1|
|     Atheism|Atheist|             1|
|         ServisFirst|             1|
|              ADTRAN|             1|
|Greek sea gods|wa...|             1|
|           Aeschylus|             1|
|    Thomas Corneille|             1|
|    Achille in Sciro|             1|
|Popular sovereign...|             1|
|File:The Rail Can...|             1|
|File:Lincoln and ...|             1|
|Classical element...|             1|
|       Carlo Rovelli|             1|
+--------------------+--------------+
only showing top 20 rows



In [19]:
group_links.count()

413456

In [20]:
all_info = conflict_articles.join(group_links, "title",how='left').na.fill(0);

In [21]:
all_info.select("id","title","revision","categories","external_links").show()

+----+--------------------+--------------------+-----------------+--------------+
|  id|               title|            revision|       categories|external_links|
+----+--------------------+--------------------+-----------------+--------------+
|4050| Battle of Ramillies|[, [487310,, Kint...|military conflict|             0|
|4806|  Battle of Marathon|[Reverted edits b...|military conflict|             1|
|3793|Battle of Boswort...|[Reverted edits b...|military conflict|             1|
|5003|  Battle of Bouvines|[clean up, [[WP:A...|military conflict|             0|
|6827|Cuban Missile Crisis|[Reverted 1 edit ...|military conflict|             1|
|4978|Battle of Bereste...|[Undid revision 8...|military conflict|             0|
|4928|  Battle of Świecino|[, [, 5.172.255.5...|military conflict|             0|
|4975|   Battle of Abritus|[, [17328627,, Kr...|military conflict|             0|
|4849|Battle of Gettysburg|[/* Further readi...|military conflict|             1|
|5013|      Batt

## Referenze

In [None]:
# todo

## To Pandas

In [None]:
pdArticles = articles.select('id','title','article_lenght','categories').toPandas().set_index('id');

In [None]:
pdArticles.head(50)

yo