In [1]:
from os import environ
# Extra arguments for the PySpark launcher: request the spark-xml package,
# whose 'com.databricks.spark.xml' reader format is used when loading the dump.
environ.update(PYSPARK_SUBMIT_ARGS='--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell')

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as md
import re

import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql.types import *
from pyspark.sql import *
import hashlib
import os.path
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import desc
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import udf
from datetime import timedelta, date

from pyspark.sql import SparkSession
from pyspark import SparkContext

# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
# SparkContext that backs the session above.
sc = spark.sparkContext

from pyspark.sql import SQLContext
# SQLContext built on the same SparkContext.
sqlContext = SQLContext(sc)

Leggiamo i dati dal dump XML di Wikipedia.

In [3]:
# Path of the Wikipedia XML dump to load.
WIKIPEDIA_XML_DUMP = 'first.xml'

# Read the dump with the spark-xml reader, using <page> as the row tag.
wikipedia = (
    spark.read
    .format('com.databricks.spark.xml')
    .options(rowTag='page', mode='PERMISSIVE', charset='UTF-8')
    .load(WIKIPEDIA_XML_DUMP)
)

In [4]:
# Print the schema of the loaded `wikipedia` DataFrame.
wikipedia.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- redirect: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _title: string (nullable = true)
 |-- restrictions: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _space: string (nullable = true)
 |    |-- timestamp: string (nullable = true)
 |-- title: str

In [5]:
# Matches an opening <ref> tag, with or without attributes, while skipping
# self-closing tags such as <ref name="x" />.
# The \b after "ref" stops the first alternative from also matching the
# <references> tag, which would otherwise be treated as a reference.
ref_regex = re.compile(r'<ref\b[^>]*[^\/]>|<ref[ ]*>')

Selezioniamo solo gli **articoli** (namespace 0, non redirect) che contengono del testo.

In [6]:
# Keep only well-formed, non-redirect pages of namespace 0 that have
# non-empty text, then drop the helper columns used only for filtering.
articles = (
    wikipedia
    .filter("ns = '0'")
    .filter("_corrupt_record is null")
    .filter("redirect._title is null")
    .filter("revision.text._VALUE is not null")
    .filter("length(revision.text._VALUE) > 0")
    .drop('_corrupt_record')
    .drop('redirect')
)

## Lunghezza del testo

A questi articoli aggiungiamo la **lunghezza del testo**.

In [7]:
# Add the character length of each article's text.
# The column is resolved through `articles` itself (not the parent
# `wikipedia` frame), in line with how the `categories` column is built.
# The misspelled column name is kept because later cells refer to it.
articles = articles.withColumn('article_lenght', F.length(articles.revision.text._VALUE))
# Preview the first 10 rows.
articles.show(10)

+---+---+------------+--------------------+--------------------+--------------+
| id| ns|restrictions|            revision|               title|article_lenght|
+---+---+------------+--------------------+--------------------+--------------+
| 12|  0|        null|[, [, 108.34.186....|           Anarchism|        193812|
| 25|  0|        null|[needing clarific...|              Autism|        154548|
| 39|  0|        null|[Added table of p...|              Albedo|         42177|
|290|  0|        null|[/* Other uses */...|                   A|         24844|
|303|  0|        null|[Adding {{pp-vand...|             Alabama|        187311|
|305|  0|        null|[Achilles' heel<A...|            Achilles|         68901|
|307|  0|        null|[, [589223,, Good...|     Abraham Lincoln|        196309|
|308|  0|        null|[/* top */ ce, [2...|           Aristotle|        131089|
|309|  0|        null|[copy editing, ap...|An American in Paris|         19109|
|316|  0|        null|[/* 1940s */, [31.

Alcuni di questi articoli sono molto brevi: alcuni saranno delle disambiguation pages; per gli altri, giusto per curiosità, vediamo quanti ce ne sono:

In [8]:
# Count articles shorter than 2000 characters whose text does not contain
# the literal "{disambiguation}" (case-insensitive).
(
    articles
    .filter("article_lenght < 2000")
    .filter("lower(revision.text._VALUE) not like '%{disambiguation}%'")
    .count()
)

2

## Categorie

Aggiungiamo anche le **categorie** prese dagli **infobox**. Domanda: come si comporta se un articolo ha più di un infobox?

In [9]:
# Captures the infobox type that follows "{{infobox " (case-insensitive).
# The first character may be anything except a newline; the rest must be
# letters, digits, '.', '_', '/', space or '-'.
# The hyphen sits at the end of the class so it is a literal: written as
# ".-_" it formed a range that excluded '-' and wrongly accepted characters
# such as ':', '<', '=', '>' and '['.
# A "\n" could be added to the class if the name ever wraps onto a new line.
regex = r"(?<={{infobox ).[a-zA-Z0-9._/ -]*"
ibox_regex = re.compile(regex, re.IGNORECASE)

In [10]:
def provaFunzione(text):
    """Return all infobox types found in ``text`` as one string.

    Every match of ``ibox_regex`` in ``text`` is collected, in order of
    appearance, and the matches are joined with ``', '``. The result is an
    empty string when there is no match.
    """
    return ', '.join(match.group(0) for match in ibox_regex.finditer(text))

In [11]:
# Spark UDF wrapping `provaFunzione`; it returns a string column.
# The function is passed directly: an extra lambda around it adds nothing.
category_udf = udf(provaFunzione, StringType())

In [12]:
# Add a `categories` column holding the infobox types extracted from the text.
articles = articles.withColumn(
    "categories",
    category_udf(articles.revision.text._VALUE),
)
# Preview 10 articles that have at least one infobox type.
articles.filter('length(categories) > 0').show(10)

+---+---+------------+--------------------+--------------------+--------------+--------------------+
| id| ns|restrictions|            revision|               title|article_lenght|          categories|
+---+---+------------+--------------------+--------------------+--------------+--------------------+
| 25|  0|        null|[needing clarific...|              Autism|        154548|  medical condition |
|290|  0|        null|[/* Other uses */...|                   A|         24844|            grapheme|
|303|  0|        null|[Adding {{pp-vand...|             Alabama|        187311|U.S. state, U.S. ...|
|307|  0|        null|[, [589223,, Good...|     Abraham Lincoln|        196309|officeholder, U.S...|
|308|  0|        null|[/* top */ ce, [2...|           Aristotle|        131089|         philosopher|
|316|  0|        null|[/* 1940s */, [31...|Academy Award for...|         94762|               award|
|324|  0|        null|[/* Accusations o...|      Academy Awards|         95598|     award, 

Selezioniamo quelli con l'infobox inerente alla guerra

In [13]:
# TODO: select the articles whose infobox is war-related

## Referenze

In [14]:
# Collect the summary columns into a pandas DataFrame indexed by `id`.
pdArticles = (
    articles
    .select('id', 'title', 'article_lenght', 'categories')
    .toPandas()
    .set_index('id')
)

In [15]:
# Display the first 50 rows of the pandas summary.
pdArticles.head(n=50)

Unnamed: 0_level_0,title,article_lenght,categories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12,Anarchism,193812,
25,Autism,154548,medical condition
39,Albedo,42177,
290,A,24844,grapheme
303,Alabama,187311,"U.S. state, U.S. state symbols"
305,Achilles,68901,
307,Abraham Lincoln,196309,"officeholder, U.S. Cabinet"
308,Aristotle,131089,philosopher
309,An American in Paris,19109,
316,Academy Award for Best Production Design,94762,award


yo