In [None]:
from os import environ
# Ask PySpark to load the spark-xml package when it starts; the
# 'com.databricks.spark.xml' reader used further down relies on it.
# This has to run before the Spark session is created.
# NOTE(review): the "_2.10" suffix is the Scala build of the package — confirm
# that it matches the Scala version of the installed Spark.
environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell'

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as md
import re
import time

import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql.types import *
from pyspark.sql import *
import hashlib
import os.path
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import desc
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import udf
from pyspark.sql.functions import countDistinct
from datetime import timedelta, date
from mwviews.api import PageviewsClient
from collections import OrderedDict


from pyspark.sql import SparkSession
from pyspark import SparkContext

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# SparkContext that belongs to that session.
sc = spark.sparkContext

from pyspark.sql import SQLContext
# SQLContext built on the same SparkContext (used by a scratch cell further down).
sqlContext = SQLContext(sc)

leggiamo i dati

In [None]:
# Path of the Wikipedia XML dump to analyse.
WIKIPEDIA_XML_DUMP = 'first.xml'

# Read the dump through the spark-xml data source, using <page> as the row tag.
wikipedia = (
    spark.read
    .format('com.databricks.spark.xml')
    .options(rowTag='page', mode='PERMISSIVE', charset='UTF-8')
    .load(WIKIPEDIA_XML_DUMP)
)

In [None]:
# Print the schema of the loaded dump.
wikipedia.printSchema()

selezioniamo solo gli **articoli** con del testo

In [None]:
# Keep only pages in namespace 0 that are not redirects and carry non-empty
# text; the `redirect` column is dropped once it has served as a filter.
articles = (
    wikipedia
    .filter("ns = '0'")
    .filter("redirect._title is null")
    .filter("revision.text._VALUE is not null")
    .filter("length(revision.text._VALUE) > 0")
    .drop('redirect')
)

## Lunghezza del testo

a questi articoli aggiungiamo la **lunghezza del testo**

In [None]:
# Add the length of each article's text.
# The column name keeps its original spelling because a later cell selects it.
articles = articles.withColumn(
    'article_lenght',
    F.length(F.col('revision.text._VALUE')),
)

In [None]:
# Number of articles left after the filters above.
articles.count()

In [None]:
# Preview the first 10 rows.
articles.show(10);

togliamo le pagine di disambiguazione

In [None]:
# Drop disambiguation pages: every article whose lower-cased text contains
# the literal '{disambiguation}'.
# NOTE(review): only this exact spelling is matched; other disambiguation
# templates, if the dump uses any, would not be removed — confirm.
articles = articles.filter("lower(revision.text._VALUE) not like '%{disambiguation}%'")

In [None]:
# Number of articles left once disambiguation pages are removed.
articles.count()

## Categorie

mettiamoci pure le **categorie** prese dagli **infobox**. Domanda: come si comporta se un articolo ha più di un infobox? (Risposta: `findall` li trova tutti e i nomi vengono uniti in un'unica stringa separata da `, `.)

In [None]:
# Captures the infobox type that follows "{{infobox " (case-insensitive), e.g.
# "military conflict" in "{{Infobox military conflict".
# The hyphen is placed last in the character class so that it is a literal '-':
# written as ".-_" it formed a range from '.' to '_', which accepted characters
# such as '<', ':' and '[' while rejecting '-' itself.
# Matching stops at the end of the line; add "\n" to the class if the infobox
# name may wrap onto the next line.
regex = r"(?<={{infobox ).[a-zA-Z0-9._/ -]*"
ibox_regex = re.compile(regex, re.IGNORECASE)

In [None]:
def extractCategory(text):
    """Return the infobox types found in ``text`` as one comma-separated string.

    Every match of ``ibox_regex`` is kept, so an article with several
    infoboxes yields several entries joined by ``', '``.

    Each entry is stripped of surrounding whitespace: the pattern accepts
    spaces, so ``{{Infobox military conflict | ...`` would otherwise produce
    ``'military conflict '``, which the category selector used later in the
    notebook (it requires the name to be followed by ``,`` or the end of the
    string) would not recognise.

    Args:
        text: wikitext of an article; ``None`` or an empty string is accepted.

    Returns:
        The comma-separated infobox types, or ``''`` when there is none.
    """
    if not text:
        return ''
    names = (match.strip() for match in ibox_regex.findall(text))
    # Drop entries that were whitespace only.
    return ', '.join(name for name in names if name)

In [None]:
# Spark UDF around extractCategory; it returns a string column.
category_udf = udf(extractCategory, StringType())

In [None]:
# Attach the comma-separated infobox types of each article as `categories`.
articles = articles.withColumn(
    "categories",
    category_udf(articles["revision"]["text"]["_VALUE"]),
)

In [None]:
# Preview 10 articles for which at least one infobox type was found.
articles.filter('length(categories) > 0').show(10);

In [None]:
# Number of articles for which at least one infobox type was found.
articles.filter('length(categories) > 0').count()

Selezioniamo quelli con l'infobox inerente alla guerra

In [None]:
# Infobox types treated as conflict-related.
# Open questions from the author: keep 'civilian attack'? add 'military operation'?
goodCategories = ['civil conflict', 'military conflict', 'civilian attack']

# One of the types above, followed by a comma or by the end of the string
# (the `categories` column is a ', '-separated list).
regex = "({})(,|$)".format('|'.join(goodCategories))
categorySelect_regex = re.compile(regex, re.IGNORECASE)

In [None]:
def goodCategory(text):
    """Return True when ``text`` contains one of the selected categories.

    ``text`` is a comma-separated category string; the decision is made by
    ``categorySelect_regex``.
    """
    return bool(categorySelect_regex.findall(text))

In [None]:
# Spark UDF around goodCategory; it returns a boolean column.
good_category_udf = udf(goodCategory, BooleanType())

In [None]:
# Flag the articles whose categories include a conflict type and keep only those.
conflict_articles = (
    articles
    .withColumn("good_categories", good_category_udf(articles["categories"]))
    .filter('good_categories == true')
)

In [None]:
# Number of conflict-related articles selected.
conflict_articles.count()

In [None]:
# Save the selected conflict articles as Parquet for later reuse,
# overwriting any previous copy.
conflict_articles.write.mode('overwrite').parquet("selectedConflict.parquet");

In [2]:
# Load previously saved conflict articles from Parquet.
# NOTE(review): this reads "selectedAllConflict.parquet", whereas the cell above
# writes "selectedConflict.parquet" — confirm which file is intended, otherwise a
# fresh run will not pick up the data it has just saved.
conflict_articles = spark.read.parquet("selectedAllConflict.parquet");

In [None]:
# Preview the first 10 conflict articles.
conflict_articles.show(10)

In [None]:
# Number of distinct titles among the conflict articles.
conflict_articles.select("title").distinct().count()

get page titles

In [3]:
# Bring the article titles to the driver as a plain Python list.
df = conflict_articles.select("title")
# Author's note: this step causes trouble with the encoding of the titles.
conflict_array = [row.title for row in df.collect()]

## Pageview

In [5]:
def getViewsFromAPI(conflict_array, client=None, start='20150701', end=None):
    """Fetch monthly user pageviews on en.wikipedia for the given articles.

    Args:
        conflict_array: list of article titles to query.
        client: object exposing ``article_views`` (e.g. a ``PageviewsClient``).
            When omitted, the notebook-level client ``p`` is used, so ``p``
            must exist by the time the function is called.
        start: first day of the range, formatted ``YYYYMMDD``.
        end: last day of the range, formatted ``YYYYMMDD``; defaults to today.

    Returns:
        The response of ``client.article_views``, unchanged.
    """
    if client is None:
        # Fall back to the module-level client, as the original version did.
        client = p
    if end is None:
        end = time.strftime('%Y%m%d')
    return client.article_views('en.wikipedia', conflict_array, granularity='monthly',
                                start=start, end=end, agent='user')

In [6]:
# Author's note: this does not work because of the encoding of the titles;
# to be investigated (the original note suggests trying "unicorn").
startingLogDate = '20150701'
today = time.strftime('%Y%m%d')

p = PageviewsClient(user_agent="<ada@epfl.ch> Applied data analysis project")

# Monthly user pageviews of every conflict article, from startingLogDate to today.
views_dict = p.article_views(
    'en.wikipedia',
    conflict_array,
    granularity='monthly',
    start=startingLogDate,
    end=today,
    agent='user',
)

In [7]:
# Flatten the API response into one list of (key, value) pairs taken from the
# inner dicts; the outer keys are ignored.
# assumes views_dict maps each period to a {title: views} dict — TODO confirm
dizioTizio = [
    pair
    for inner in views_dict.values()
    for pair in inner.items()
]

In [18]:
# Build a (title, views) DataFrame from the flattened pairs, cast the counts
# to int and replace missing counts with 0.
views = spark.createDataFrame(dizioTizio, ["title", "views"])
views = views.select('title', views['views'].cast('int')).na.fill(0)

In [19]:
# Total views per title, summed over all rows of that title.
views_df = (
    views
    .groupBy("title")
    .agg(F.sum("views").alias("views"))
)

In [20]:
# Number of titles that have a views total.
views_df.count()

18782

In [21]:
# Preview the per-title totals.
views_df.show()

+--------------------+-------+
|               title|  views|
+--------------------+-------+
|Battle_of_Cremona...|  12815|
|   Battle_of_Brienne|  24876|
|Battle_of_Turnhou...|  15813|
|Selma_to_Montgome...|1614145|
|Battle_of_Sarikamish| 167655|
|Battle_of_Mesamávida|   1038|
|Battle_of_Heilige...|  18517|
|Battle_of_Cape_Sp...|  97808|
| Battle_of_Chaldiran| 206369|
|     Heshui_Campaign|   3091|
|Battle_of_Borovo_...|  29039|
|Battle_of_Dębe_Wi...|   2620|
|First_Anglo-Afgha...| 640505|
|Serb_uprising_of_...|    901|
|2013_"Pro_Europe"...|   6001|
|            Camisard|  50928|
|    Battle_of_Sobota|   5638|
|Battle_of_Winchelsea|  33369|
|        Count's_Feud|  34054|
| Battle_of_Nashville| 270988|
+--------------------+-------+
only showing top 20 rows



In [22]:
# Save the per-title view totals as Parquet, overwriting any previous copy.
views_df.write.mode('overwrite').parquet("totalViews.parquet");

In [23]:
# Reload the per-title view totals from the Parquet file written above.
views_df = spark.read.parquet("totalViews.parquet");

In [24]:
# Preview the totals as read back from Parquet.
views_df.show()

+--------------------+------+
|               title| views|
+--------------------+------+
|Battle_of_Landshu...| 18166|
|Northern_Virginia...| 68827|
|      Te_Kooti's_War| 26069|
|         Radical_War| 49008|
|       Convoy_HX_106|  8372|
|Operation_Rolling...|603567|
|         Watts_riots|914726|
|    Battle_of_Sarmin| 12596|
|Battle_of_Guilin–...| 17691|
|Insurgency_in_the...|109816|
|            Ifni_War|189291|
|Attacks_on_High_Wood| 26417|
|     Battle_of_Szack| 25366|
|Pruth_River_Campaign| 65017|
|Battle_of_Albert_...| 22841|
|     Al_Asad_Airbase|159577|
|Second_Battle_of_...| 34579|
|    Battle_of_Asiago| 65537|
|Siege_of_Albarrac...|  1851|
|Action_of_30_Marc...| 16435|
+--------------------+------+
only showing top 20 rows



In [None]:
# Scratch cell: builds a one-column ('numbers') DataFrame from `row_rdd` and shows it.
# NOTE(review): `row_rdd` is not defined anywhere in the visible notebook, so this
# cell fails on a fresh kernel — define it or remove the cell.
sqlContext.createDataFrame(row_rdd,['numbers']).show()

In [None]:
# Scratch cell: distribute `views_dict` and look at the first 3 elements.
# NOTE(review): iterating a dict yields its keys, so presumably only the keys
# are parallelized here — confirm the intent.
sc.parallelize(views_dict).take(3)

## External links

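Visto che ci sono problemi con l'encoding, procedo in un altro modo: estraggo dal testo degli articoli i link in sintassi wiki (`[[...]]`) e conto quante volte compare ogni titolo.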

In [None]:
# Non-greedy capture of whatever sits between "[[" and "]]" (wiki link syntax).
regex = r"\[\[(.*?)\]\]"
link_regex = re.compile(regex, flags=re.IGNORECASE)

In [None]:
# Every link target found so far; filled by extr_link.
external_links = []


def extr_link(text):
    """Append the wiki-link targets found in ``text`` to ``external_links``.

    Only the target page is stored: the display label after ``|`` and the
    section anchor after ``#`` are removed, so ``[[Foo|bar]]`` and
    ``[[Foo#History]]`` both count as a link to ``Foo`` and can be matched
    against article titles. Links with an empty target (``[[#Top]]``) are
    skipped.

    The shared list is extended in place rather than rebuilt with ``+``,
    which copied the whole list for every article.

    Args:
        text: wikitext of an article; ``None`` or an empty string is ignored.
    """
    if not text:
        return
    for raw_link in link_regex.findall(text):
        target = raw_link.split('|', 1)[0].split('#', 1)[0].strip()
        if target:
            external_links.append(target)

In [None]:
# Bring every article's text to the driver and harvest its links one by one.
for text_row in articles.select("revision.text._VALUE").collect():
    extr_link(text_row[0])

In [None]:
# One-row-per-link DataFrame with a single string column named `title`.
# (Despite its name, the variable holds a DataFrame.)
external_links_rdd = (
    spark.createDataFrame(external_links, StringType())
    .selectExpr("value as title")
)

In [None]:
# Total number of links collected.
external_links_rdd.count()

In [None]:
# Number of collected links pointing at each title.
# A plain count is used: countDistinct("title") inside groupBy("title") is 1
# for every group, which made the `external_links` column constant.
group_links = (
    external_links_rdd
    .groupBy("title")
    .agg(F.count("title").alias("external_links"))
)

In [None]:
# Preview the per-title link counts.
group_links.show()

In [None]:
# Number of distinct link targets.
group_links.count()

In [None]:
# Attach the link count to every conflict article. Articles without any
# collected link get 0; the fill is limited to `external_links` so that missing
# values in other numeric columns are not silently turned into zeros.
all_info = (
    conflict_articles
    .join(group_links, "title", how='left')
    .na.fill(0, subset=["external_links"])
)

In [None]:
# Preview the joined result, restricted to the main columns.
all_info.select("id","title","revision","categories","external_links").show()

## Referenze

In [None]:
# todo

## To Pandas

In [None]:
# Convert the main article columns to a pandas DataFrame indexed by article id.
pdArticles = (
    articles
    .select('id', 'title', 'article_lenght', 'categories')
    .toPandas()
    .set_index('id')
)

In [None]:
# Show the first 50 rows of the pandas DataFrame.
pdArticles.head(50)

yo