## Wikipedia Recent Changes Stream Consumer

In [1]:
import pyspark
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import re
from bs4 import BeautifulSoup # \ For parsing out contents of actual changes
import requests               # /

In [2]:
sc.version

'2.1.1.2.6.1.0-129'

In [3]:
sc.setLogLevel("ERROR")

### 0. Functions to make tasks below more reusable

In [4]:
def prune_event(df_IN, filter_IN):
    
    '''Take in a dataframe of wikipedia recent changes and produce 
       only the properties we want going forward
       
       filter_IN is the wikipedia domain we want to filter on (e.g., en.wikipedia.org)
    '''
    df_OUT = df_IN.select("parsed_wiki_values.*") \
            .select("id", \
                    "user", \
                    "timestamp", \
                    "bot", \
                    "comment", \
                    "server_name", \
                    "wiki", \
                    "title", \
                    "type", \
                    "log_action", \
                    "log_action_comment", \
                    "log_type", \
                    "minor", \
                    "namespace", \
                    "parsedcomment", \
                    "patrolled", \
                    col("meta.dt").alias("event_date"), \
                    col("meta.schema_uri").alias("event_schema_uri"), \
                    col("meta.uri").alias("wikipage_uri"), \
                    col("meta.domain").alias("event_domain"), \
                    col("length.old").alias("len_old"), \
                    col("length.new").alias("len_new"), \
                    col("revision.old").alias("rev_old"), \
                    col("revision.new").alias("rev_new"), \
                   ).where(col("server_name")==filter_IN)
    return df_OUT

In [5]:
def write_to_hdfs(stream, location):
    '''Provided a stream and a location in hdfs to write out to parquet file
    '''
    pathout = "hdfs://sandbox.hortonworks.com:8020/tmp/{}".format(location)
    return stream.writeStream \
    .format("parquet") \
    .option("startingOffsets", "earliest") \
    .option("path", pathout) \
    .option("checkpointLocation", pathout) \
    .start()

In [6]:
def windowed_counts(df_IN, col_IN):
    '''Give a data frame and a datetime column to create a window on.
       only for non-bot users.
       Window is set to 5 minutes for now with 2 minutes slide
    '''
    return df_IN.where(col("bot")==False).groupBy(
        window(df_IN[col_IN], "5 minutes", "2 minutes"),
        df_IN.user
    ).count()

In [7]:
def match_anonymous(userid):
    '''It is assumed in wikipedia that anonymous users are
       given the userid of the IP address from which their
       traffic is coming from. IPs can be ipv4 or ipv6
       
       This function checks to see whether a username is 
       an IP address and returns true if it is.
       '''
    
    ANONYMOUS = False
    
    # ipv4 and ipv6 expressions used from: http://nbviewer.jupyter.org/github/rasbt/python_reference/blob/master/tutorials/useful_regex.ipynb
    ipv4_pattern = r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
    ipv6_pattern = r'^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$'
    ipv4_ans = bool(re.match(ipv4_pattern, str(userid)))
    ipv6_ans = bool(re.match(ipv6_pattern, str(userid)))
    if ipv4_ans is True:
        ANONYMOUS = True
    elif ipv6_ans is True:
        ANONYMOUS = True
    else:
        ANONYMOUS = False
    return ANONYMOUS

matchAnonUDF = udf(match_anonymous, BooleanType())

In [8]:
def grab_raw_changes(old, curr):
    text = None
    url = "https://en.wikipedia.org/w/index.php?diff={}&oldid={}".format(str(curr), str(old))
    xpath = 'table.diff'
    res = requests.get(url)
    if res.status_code == 200:
        try:
            soup = BeautifulSoup(res.content, "html.parser")
            diff = soup.select(xpath)
            if diff is not []:
                text = str(diff)

        except Exception as e:
            print("Could not grab changes from {}: {}".format(url, e))
    else:
        print("Could not fetch resource at {} - status: {}".format(url, str(res.status_code)))
    
    return text



rawChangesUDF = udf(grab_raw_changes, StringType())

### 1. Read in Raw Stream

In [9]:
df = spark.readStream.format("kafka") \
                     .option("kafka.bootstrap.servers","sandbox.hortonworks.com:6667") \
                     .option("subscribe", "wiki-rc-stream") \
                     .option("startingOffsets", "earliest") \
                     .load()

In [10]:
df.printSchema 

<bound method DataFrame.printSchema of DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]>

In [11]:
data = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

In [12]:
data

DataFrame[key: string, value: string]

### 2. Provide a schema

Not all properties will be in every event coming through the stream. Those that aren't present get a null. We are not concerned about making any properties non-nullable so we'll leave it as such for now.

In [13]:
jsonschema = StructType().add("bot", BooleanType()) \
                         .add("comment", StringType()) \
                         .add("id", IntegerType()) \
                         .add("length", StructType() \
                            .add("new", IntegerType()) \
                            .add("old", IntegerType())) \
                         .add("meta", StructType() \
                            .add("domain", StringType()) \
                            .add("dt", StringType()) \
                            .add("id", StringType()) \
                            .add("request_id", StringType()) \
                            .add("schema_uri", StringType()) \
                            .add("topic", StringType()) \
                            .add("partition", IntegerType()) \
                            .add("uri", StringType()) \
                            .add("offset", IntegerType())) \
                         .add("minor",  BooleanType()) \
                         .add("namespace", IntegerType()) \
                         .add("parsedcomment", StringType()) \
                         .add("patrolled", BooleanType()) \
                         .add("revision", StructType() \
                            .add("new", IntegerType()) \
                            .add("old", IntegerType())) \
                         .add("server_name", StringType()) \
                         .add("server_script_path", StringType()) \
                         .add("server_url", StringType()) \
                         .add("timestamp", StringType()) \
                         .add("title", StringType()) \
                         .add("type", StringType()) \
                         .add("user", StringType()) \
                         .add("wiki", StringType()) \
                         .add("log_action", StringType()) \
                         .add("log_action_comment", StringType()) \
                         .add("log_id", IntegerType()) \
                         .add("log_params", StructType()) \
                         .add("log_type", StringType())

### 3. Get the json from the value and cast as string using the schema above

In [14]:
wiki_raw = df.select(from_json(col("value") \
                                .cast("string"), jsonschema) \
                                .alias("parsed_wiki_values"))

In [15]:
wiki_raw.printSchema 

<bound method DataFrame.printSchema of DataFrame[parsed_wiki_values: struct<bot:boolean,comment:string,id:int,length:struct<new:int,old:int>,meta:struct<domain:string,dt:string,id:string,request_id:string,schema_uri:string,topic:string,partition:int,uri:string,offset:int>,minor:boolean,namespace:int,parsedcomment:string,patrolled:boolean,revision:struct<new:int,old:int>,server_name:string,server_script_path:string,server_url:string,timestamp:string,title:string,type:string,user:string,wiki:string,log_action:string,log_action_comment:string,log_id:int,log_params:struct<>,log_type:string>]>

### 4. Filter raw dataframe and only get English Wikipedia events

In [16]:
en_wiki = prune_event(wiki_raw, "en.wikipedia.org") \
                        .withColumn("anonymous", matchAnonUDF("user")) \
                        .withColumn("timestamp_dt", from_unixtime("timestamp", "yyyy-MM-dd HH:mm:ss.SSS"))

In [17]:
en_wiki.printSchema 

<bound method DataFrame.printSchema of DataFrame[id: int, user: string, timestamp: string, bot: boolean, comment: string, server_name: string, wiki: string, title: string, type: string, log_action: string, log_action_comment: string, log_type: string, minor: boolean, namespace: int, parsedcomment: string, patrolled: boolean, event_date: string, event_schema_uri: string, wikipage_uri: string, event_domain: string, len_old: int, len_new: int, rev_old: int, rev_new: int, anonymous: boolean, timestamp_dt: string]>

### 5. Filter dataframe and only get Wikidata events

In [18]:
wikidata = prune_event(wiki_raw, "www.wikidata.org") \
                        .withColumn("anonymous", matchAnonUDF("user")) \
                        .withColumn("timestamp_dt", from_unixtime("timestamp", "yyyy-MM-dd HH:mm:ss.SSS"))

In [19]:
wikidata.printSchema

<bound method DataFrame.printSchema of DataFrame[id: int, user: string, timestamp: string, bot: boolean, comment: string, server_name: string, wiki: string, title: string, type: string, log_action: string, log_action_comment: string, log_type: string, minor: boolean, namespace: int, parsedcomment: string, patrolled: boolean, event_date: string, event_schema_uri: string, wikipage_uri: string, event_domain: string, len_old: int, len_new: int, rev_old: int, rev_new: int, anonymous: boolean, timestamp_dt: string]>

### 6. Write to console for 10 second to verify we have a stream as data frame

In [20]:
enWikiQuery = en_wiki.writeStream.outputMode("append").format("console") \
                                             .start()
enWikiQuery.awaitTermination(timeout=10)

False

### 7. Create windows for English Wikipedia and Wikidata

In [21]:
windowedEnWikiCounts = windowed_counts(en_wiki, 'timestamp_dt')

In [22]:
windowedWikidataCounts = windowed_counts(wikidata, 'timestamp_dt')

### 8. Write the window counts each to their own kafka topics

In [23]:
# wikidataCounts

streamQuery1 = windowedWikidataCounts.select(
    to_json(struct("window")).alias("key"),
    to_json(struct("window","user", "count")).alias("value")) \
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "sandbox.hortonworks.com:6667") \
    .option("topic", "wikiDataCounts") \
    .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/wikiDataCounts") \
    .outputMode("update") \
    .start()


streamQuery1.awaitTermination(timeout=10)

False

In [24]:
# enWikiCounts

streamQuery2 = windowedEnWikiCounts.select(
    to_json(struct("window")).alias("key"),
    to_json(struct("window","user","count")).alias("value")) \
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "sandbox.hortonworks.com:6667") \
    .option("topic", "enWikiCounts") \
    .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/enWikiCounts") \
    .outputMode("update") \
    .start()


streamQuery2.awaitTermination(timeout=10)

False

###  9. Write streams each to their own location on HDFS

In [25]:
query_en_wiki_hdfs = write_to_hdfs(en_wiki, "en_wiki")
query_en_wiki_hdfs.awaitTermination(timeout=10)

False

In [26]:
query_wikidata_hdfs = write_to_hdfs(wikidata, "wikidata")
query_wikidata_hdfs.awaitTermination(timeout=10)

False

### 10. Write anonymous english wikipedia entries to a topic called anon-en-wiki-hydrate to notify it's time to add onto that with raw change text and push to HDFS

In [27]:
streamQuery3 = en_wiki.where(col("anonymous")==True).select(
    to_json(struct("id")).alias("key"), 
    to_json(struct([col(c).alias(c) for c in en_wiki.columns])).alias("value")) \
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "sandbox.hortonworks.com:6667") \
    .option("topic", "anon-en-wiki-hydrate") \
    .option("checkpointLocation", "hdfs://sandbox.hortonworks.com:8020/tmp/anon-en-wiki-hydrate") \
    .outputMode("append") \
    .start()


streamQuery3.awaitTermination(timeout=30)

False

### 11. Read from anon-en-wiki hydrate and try to grab the raw html diff for later parsing and analysis

In [28]:
hydrate_stream = spark.readStream.format("kafka") \
                     .option("kafka.bootstrap.servers","sandbox.hortonworks.com:6667") \
                     .option("subscribe", "anon-en-wiki-hydrate") \
                     .option("startingOffsets", "earliest") \
                     .load()

In [29]:
prunedSchema = StructType().add("bot", BooleanType()) \
						   .add("comment", StringType()) \
						   .add("id", IntegerType()) \
						   .add("len_old", IntegerType()) \
						   .add("len_new", IntegerType()) \
						   .add("rev_new", IntegerType()) \
						   .add("rev_old", IntegerType()) \
						   .add("parsedcomment", StringType()) \
						   .add("patrolled", BooleanType()) \
						   .add("title", StringType()) \
						   .add("type", StringType()) \
						   .add("user", StringType()) \
						   .add("wiki", StringType()) \
						   .add("log_action", StringType()) \
						   .add("log_action_comment", StringType()) \
						   .add("log_id", IntegerType()) \
						   .add("log_type", StringType()) \
						   .add("server_name", StringType()) \
						   .add("timestamp", StringType()) \
						   .add("timestamp_dt", DateType()) \
						   .add("minor",  BooleanType()) \
						   .add("anonymous", BooleanType()) \
						   .add("event_date", DateType()) \
						   .add("event_schema_uri", StringType()) \
						   .add("wikipage_uri", StringType()) \
						   .add("event_domain", StringType()) \
						   .add("namespace", IntegerType())

In [30]:
# note there might be an issue with how these values are coming back, i noticed far more null values coming back than expected
# even though confirming that values going into stream look alright.
hydrate_raw = hydrate_stream.select(from_json(col("value") \
                                .cast("string"), prunedSchema) \
                                .alias("hydrate_values"))

In [31]:
hydrate_df = hydrate_raw.select("hydrate_values.*")

In [32]:
hydrateQuery = hydrate_df.writeStream.outputMode("append").format("console") \
                                             .start()
hydrateQuery.awaitTermination(timeout=10)

False

In [33]:
hydrated_df = hydrate_df.withColumn("change_text", rawChangesUDF("rev_old", "rev_new")) \
                        .where(col("rev_old") != None) \
                        .where(col("rev_new") != None)

In [35]:
hydrated_df_to_hdfs = hydrated_df.where(col("change_text") != None)

In [36]:
query_hydrated_df_hdfs = write_to_hdfs(hydrated_df_to_hdfs, "hydrated_en_wiki")
query_hydrated_df_hdfs.awaitTermination(timeout=10)

False

### Conclusion

In theory the last part should work to grab any records and add in a field "change_text" that contains the HTML string which is the text diff between wikipedia changes for English language wikipedia. Though it is fairly untested how well it might perform on a running stream. To be conservative, it just bails if a scrape did not work and only records with found changes are sent to the HDFS store. 

Were there more time, I would probably test this further and also try to tweak things on the kafka side to make sure that change texts were more reliably being grabbed and stored.

Overall, I think maybe I bit off more than I could chew and it might have been good just to focus on this one aspect of getting English Wikipedia changes with their original change text into a storage point for later analysis and not worry too much about the windowed bot count.
