# Wikipedia data transformations

In [None]:
# load IPython's autoreload extension and reload all modules before each cell
# runs, so edits to the src directory are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [162]:
# show the layout of the data directory
!tree ../data/

[38;5;33m../data/[0m
├── [38;5;33mexternal[0m
├── [38;5;33minterim[0m
├── [38;5;33mprocessed[0m
└── [38;5;33mraw[0m
    └── [38;5;40menwiki-20080103.main.bz2[0m

4 directories, 1 file


Read data into Spark. Reading through all the data by counting takes ~2 hours with 8 cores.

In [26]:
from pyspark.sql import SparkSession, functions as F
import os

# Reuse the active Spark session (or start one) and keep a handle on its context.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Split the input into records on blank lines ("\n\n") instead of on single
# newlines. This must be set before the RDD below is created.
conf = sc._jsc.hadoopConfiguration()
conf.set("textinputformat.record.delimiter", "\n\n")

# Point Spark at the compressed dump under the raw data directory.
input_file = 'enwiki-20080103.main.bz2'
input_path = os.path.join("../data/raw", input_file)
rdd = sc.textFile(input_path)

In [None]:
# time a full pass over the input by counting its records
%time rdd.count()

Spark has DataFrame support similar to dplyr or pandas. We add typing and partitioning to reduce the overhead of processing this data a second time.

We can programmatically access the schema, which is useful when coercing string values into the correct type.

In [147]:
from src.data import import_wikipedia as impwiki
from pyspark.sql import types as T

# how to access the datatype of a schema
schema = impwiki.wikipedia_schema
schema['article_id'].dataType

# how we can add typing easily to new string data
# TODO: test case

# Map each Spark SQL type class to the Python callable that coerces a value
# into that type.
# NOTE(review): bool() returns True for any non-empty string (including
# "false"), so string input needs a real parser before BooleanType fields are
# cast with this map -- confirm how process_edit handles booleans.
cast_map = {
    T.IntegerType: int,
    T.BooleanType: bool,
    T.StringType: str,
}

# Smoke-check that every field's type has an entry in cast_map; a type that is
# missing from the map raises KeyError here.
for name in schema.fieldNames():
    cast_map[type(schema[name].dataType)](0)

IntegerType

Apply the schema and create a temporary table.

In [175]:
# Parse every raw record, then attach the typed schema to build the DataFrame.
edits_rdd = rdd.map(impwiki.process_edit)
enwiki_df = spark.createDataFrame(edits_rdd, schema=impwiki.wikipedia_schema)

# Register the frame for Spark SQL under the name `enwiki` and peek at one
# record, laid out vertically without truncating values.
enwiki_df.createOrReplaceTempView('enwiki')
enwiki_df.show(n=1, truncate=False, vertical=True)

-RECORD 0-----------------------------
 article_id    | 6                    
 rev_id        | 233188               
 article_title | AmericanSamoa        
 timestamp     | 2001-01-19T01:12:51Z 
 username      | ip:office.bomis.com  
 user_id       | ip:office.bomis.com  
 category      | null                 
 image         | null                 
 main          | null                 
 talk          | null                 
 user          | null                 
 user_talk     | null                 
 other         | null                 
 external      | null                 
 template      | null                 
 comment       | *                    
 minor         | true                 
 textdata      | 1516                 
only showing top 1 row



Let there be light. Here's the beginning of Wikipedia using Spark as a query engine. But if we take a second look, the timestamps are out of order and we're starting off at revision 233188.

In [168]:
# Project the identifying columns of each revision out of the `enwiki` view.
revision_query = """
SELECT 
    timestamp,
    article_id, 
    rev_id,
    user_id,
    username,
    minor,
    textdata
FROM 
    enwiki
"""
spark.sql(revision_query).show()

+--------------------+----------+---------+--------------------+--------------------+-----+--------+
|           timestamp|article_id|   rev_id|             user_id|            username|minor|textdata|
+--------------------+----------+---------+--------------------+--------------------+-----+--------+
|2001-01-19T01:12:51Z|         6|   233188| ip:office.bomis.com| ip:office.bomis.com| true|    1516|
|2007-05-24T14:41:33Z|         6|133180191|             4477979|            Ngaiklin| true|       5|
|2001-01-20T15:01:12Z|         8|   233189|ip:pD950754B.dip....|ip:pD950754B.dip....| true|       9|
|2007-05-24T14:41:48Z|         8|133180238|             4477979|            Ngaiklin| true|       6|
|2001-01-21T02:12:21Z|        10|   233192|                  99|           RoseParks| true|       8|
|2007-05-24T14:41:58Z|        10|133180268|             4477979|            Ngaiklin| true|       6|
|2002-02-25T15:00:22Z|        12|    18201|ip:Conversion_script|ip:Conversion_script| true|

Double check that the timestamps are actually out of order after casting into a timestamp.

In [172]:
# Cast the string column to a real timestamp and display the first six rows.
# NOTE(review): the displayed values appear shifted from the raw "Z" strings,
# presumably because show() renders in the session time zone -- confirm.
casted_ts = spark.sql("SELECT cast(timestamp as TIMESTAMP) FROM enwiki")
casted_ts.show(n=6)

+-------------------+
|          timestamp|
+-------------------+
|2001-01-18 17:12:51|
|2007-05-24 07:41:33|
|2001-01-20 07:01:12|
|2007-05-24 07:41:48|
|2001-01-20 18:12:21|
|2007-05-24 07:41:58|
+-------------------+
only showing top 6 rows



In [181]:
from pyspark.sql import functions as F

# Take at most 1000 rows, cast their timestamps, and report the earliest and
# latest values within that sample.
sample_ts = enwiki_df.select(F.col('timestamp').cast('timestamp')).limit(1000)
sample_ts.select(F.min('timestamp'), F.max('timestamp')).show()

+-------------------+-------------------+
|     min(timestamp)|     max(timestamp)|
+-------------------+-------------------+
|2001-01-18 17:12:51|2007-05-24 07:41:58|
+-------------------+-------------------+

