In [2]:
import findspark
findspark.init()

import wptools
import pyspark
import pyspark.sql
from pyspark.sql import *
import os.path
from pyspark.sql.functions import desc

from pyspark.sql import dataframe
from pyspark.sql import functions as F

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# NOTE(review): SQLContext is already imported a few lines above; this second import is redundant.
from pyspark.sql import SQLContext
# SQLContext bound to the same SparkContext; later cells call sqlContext.createDataFrame on it.
sqlContext = SQLContext(sc)

import re
import pandas as pd
import matplotlib.pyplot as plt
import hashlib

%matplotlib inline

import os
import sys
# Put the parent directory on sys.path (once) before importing get_ref_info from it.
my_fun = '../'
if my_fun not in sys.path:
    sys.path.append(my_fun)
    
from get_ref_info import *
# Same pattern for the cleaning helpers; my_fun is reused as a scratch path variable.
# NOTE(review): this directory ('../utils/cleaning/') differs from the one given to
# sc.addPyFile further down ('../cleaning/cleaning.py') - confirm which location is correct.
my_fun = '../utils/cleaning/'
if my_fun not in sys.path:
    sys.path.append(my_fun)
    
from cleaning import *
from operator import add

try:
    from urllib.parse import urlparse
except ImportError:
     from urlparse import urlparse

import seaborn as sns
import numpy as np
from pyspark.sql.functions import regexp_replace, col

from sklearn.preprocessing import MinMaxScaler

# Root folder of the processed data. Later cells build paths with `DATA_DIR + name`,
# so the trailing slash is required.
DATA_DIR = '../../data/data_processed/' 

# Register the helper modules with the SparkContext so that tasks can import them.
# NOTE(review): cleaning.py is taken from '../cleaning/' here, while sys.path above was
# extended with '../utils/cleaning/' - confirm the two locations agree.
sc.addPyFile("../get_ref_info.py")
sc.addPyFile("../cleaning/cleaning.py")

In [3]:
# Loading the data
# Path of the parquet dataset, relative to DATA_DIR.
WIKIPEDIA_REFERENCES_PARQUET = DATA_DIR + "intermediate/wikipedia_no_outliers.parquet"

# Read the saved parquet files into a Spark DataFrame and preview the first 10 rows.
wikipedia_ref = spark.read.parquet(WIKIPEDIA_REFERENCES_PARQUET)
wikipedia_ref.show(10)

+------+--------------------+-----------------+--------------------+
|    id|               title|       categories|            revision|
+------+--------------------+-----------------+--------------------+
|655845|Battle of Athens ...|military conflict|[, [,, 5175837,, ...|
|656035|  Battle of Calabria|military conflict|[[link,], [,, 268...|
|656087|   Italo-Turkish War|military conflict|[, [,,, 96.77.37....|
|656173|Third Anglo-Afgha...|military conflict|[[Formatting.,], ...|
|656175|Second Anglo-Afgh...|military conflict|[[/* 1879 */The B...|
|656473|   Illinois campaign|military conflict|[[Moved images.,]...|
|659156|Battle of Kock (1...|military conflict|[, [,, 29980587,,...|
|659506|Battle of Peachtr...|military conflict|[[c/e.,], [,, 193...|
|661656|Nicaraguan Revolu...|military conflict|[, [,, 33124044,,...|
|661864|       S11 (protest)|   civil conflict|[[Rescuing 3 sour...|
+------+--------------------+-----------------+--------------------+
only showing top 10 rows



In [4]:
# Feature columns that are rescaled to [0, 1] in this cell.
# ('article_lenght' is the spelling used in the CSV header; do not "correct" it here.)
POPULARITY_FEATURES = ['article_lenght', 'views', 'refs_count', 'link_count']

# Load the per-article table; the first CSV column becomes the index and is named 'id'.
articles = pd.read_csv(DATA_DIR + 'data_processed.csv', index_col=0)
articles.index.names = ['id']
# Missing end dates are encoded as 0.
articles['end_date_clean'] = articles['end_date_clean'].fillna(0)

# Min-max scale every feature column independently.
# The explicit float cast avoids the dtype-conversion warning MinMaxScaler raises for
# integer input; the scaled values themselves are unchanged.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
articles_minmax = min_max_scaler.fit_transform(articles[POPULARITY_FEATURES].astype(float))
articles[POPULARITY_FEATURES] = articles_minmax
articles.head()


  return self.partial_fit(X, y)


Unnamed: 0_level_0,title,categories,article_lenght,views,refs_count,link_count,death,locations,end_date_clean,ongoing
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
771,American Revolutionary War,military conflict,0.375011,0.259929,0.598,0.634364,70000,Eastern North America|Caribbean Sea|Indian ...,1783.0,False
863,American Civil War,military conflict,0.31941,0.522999,0.212,0.532646,828000,Southern United States Q49042|Northern Uni...,1865.0,False
3793,Battle of Bosworth Field,military conflict,0.152325,0.032733,0.034,0.208935,0,Ambion Hill Q4741491|Battle of Bosworth F...,1485.0,False
4005,Battle of Pharsalus,military conflict,0.03451,0.017785,0.02,0.067354,230,Farsala Q985596,0.0,False
4049,Battle of Blenheim,military conflict,0.104371,0.013543,0.095,0.137457,24968,Blindheim Q514914,1704.0,False


In [5]:
# Popularity score: plain sum of the four feature columns, added left to right.
articles['popularity_score'] = (
    articles['article_lenght']
    .add(articles['views'])
    .add(articles['refs_count'])
    .add(articles['link_count'])
)
articles.head()

Unnamed: 0_level_0,title,categories,article_lenght,views,refs_count,link_count,death,locations,end_date_clean,ongoing,popularity_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
771,American Revolutionary War,military conflict,0.375011,0.259929,0.598,0.634364,70000,Eastern North America|Caribbean Sea|Indian ...,1783.0,False,1.867305
863,American Civil War,military conflict,0.31941,0.522999,0.212,0.532646,828000,Southern United States Q49042|Northern Uni...,1865.0,False,1.587055
3793,Battle of Bosworth Field,military conflict,0.152325,0.032733,0.034,0.208935,0,Ambion Hill Q4741491|Battle of Bosworth F...,1485.0,False,0.427993
4005,Battle of Pharsalus,military conflict,0.03451,0.017785,0.02,0.067354,230,Farsala Q985596,0.0,False,0.13965
4049,Battle of Blenheim,military conflict,0.104371,0.013543,0.095,0.137457,24968,Blindheim Q514914,1704.0,False,0.350371


In [6]:
# Persist the articles table (now including popularity_score) under DATA_DIR.
articles.to_csv(DATA_DIR + 'articles_popularity_score.csv')

### Get the domains of references

In [7]:
# Sanity check: fetch the title stored for article id 655845.
wikipedia_ref.where("id = '655845'").select('title').first()

Row(title='Battle of Athens (1946)')

In [8]:
# Expand every article row into its reference records using get_ref_info
# (presumably provided by the get_ref_info module star-imported above - verify).
references_rrd = wikipedia_ref.rdd.flatMap(get_ref_info)
# Turn the flattened records into a DataFrame and preview 30 rows.
references = sqlContext.createDataFrame(references_rrd)
references.show(30)

+------+--------------+--------------------+--------------------+
|    id|      template|               title|                 url|
+------+--------------+--------------------+--------------------+
|655845|     cite book|Battle of Athens ...|                    |
|655845|     cite book|Battle of Athens ...|https://archive.o...|
|655845|     cite book|Battle of Athens ...|                    |
|655845|     cite book|Battle of Athens ...|https://archive.o...|
|655845|      cite web|Battle of Athens ...|http://www.americ...|
|655845|     cite book|Battle of Athens ...|https://archive.o...|
|655845|     cite book|Battle of Athens ...|https://archive.o...|
|655845|cite interview|Battle of Athens ...|http://volweb.utk...|
|655845|cite interview|Battle of Athens ...|http://volweb.utk...|
|655845|cite interview|Battle of Athens ...|http://volweb.utk...|
|655845|cite interview|Battle of Athens ...|http://volweb.utk...|
|655845|cite interview|Battle of Athens ...|http://volweb.utk...|
|655845|  

**URL parsing** Parse a URL into six components, returning a 6-tuple. This corresponds to the general structure of a URL: scheme://netloc/path;parameters?query#fragment. Each tuple item is a string, possibly empty.

In [None]:
def get_domain(row):
    """Map a reference row to a ``(domain, 1)`` pair for counting.

    Parameters
    ----------
    row : mapping-like
        Reference record exposing its URL under the ``'url'`` key.

    Returns
    -------
    tuple or None
        ``(netloc, 1)``, where ``netloc`` is the network-location part of the
        URL and ``1`` is the occurrence count of that single reference, or
        ``None`` when the URL cannot be read or parsed.
    """
    try:
        return (urlparse(row['url']).netloc, 1)
    except (ValueError, TypeError, AttributeError, KeyError):
        # Unreadable or malformed URL: signal "skip this row" to the filter below.
        return None


# One (domain, 1) pair per reference that has a URL; unparsable URLs are dropped.
mapped_domains = (
    references.where("length(url)>0")
    .rdd.map(get_domain)
    .filter(lambda pair: pair is not None)
)
# For each domain count how many references there are in total to find the top domains:
# sum the ones per domain, drop empty domains, sort by count in descending order.
domains_count = (
    mapped_domains.reduceByKey(add)
    .filter(lambda pair: len(pair[0]) > 0)
    .sortBy(lambda pair: -pair[1])
)
domains_count.take(30)

In [None]:
# NOTE: this whole cell is one triple-quoted string, so nothing inside it is executed
# and get_refs_info is NOT defined by it. The text is a disabled reference extractor;
# it depends on `ref_regex`, which is not defined in the visible part of this notebook.
# The live cells call get_ref_info instead.
"""def get_refs_info(entity):
    text = entity.revision.text._VALUE
    text = re.sub("(<!--.*?-->)", "", text, flags=re.MULTILINE) # remove comments
    refs = ref_regex.findall(text)
    result = []
    for r in refs:
        ref_content = r[1].split(r"|")
        template = ref_content.pop(0).strip()
        properties = {}
        for p in ref_content:
            eq_index = p.find("=")
            p_name = p[0:eq_index].strip()
            p_value = p[eq_index+1:].strip()
            properties[p_name] = p_value
        result.append(Row(id=entity.id, 
                          template=template.lower(), 
                          template_original=template, 
                          url=properties.get("url", ""), 
                          title=properties.get("title")))
    return result

"""

In [None]:
# NOTE: this whole cell is one triple-quoted string, so nothing inside it is executed.
# The disabled code depends on `wikipedia` and `ref_regex`, neither of which is defined in
# the visible part of this notebook. If it were re-enabled as-is it would rebind `articles`,
# the name earlier cells use for the pandas DataFrame.
"""def get_refs_count(entity):
    text = entity.revision.text._VALUE
    refs = ref_regex.findall(text)
    return Row(id=entity.id, refs_count=len(refs))

articles = wikipedia.filter("ns = '0'").filter("redirect._title is null") \
    .filter("revision.text._VALUE is not null") \
    .filter("length(revision.text._VALUE) > 0")

references_count = sqlContext.createDataFrame(articles.rdd.map(get_refs_count))
references_count.show() """

In [9]:
def id_(entity):
    """Return a Row for a reference with its URL reduced to the domain.

    Parameters
    ----------
    entity : row-like
        Reference record exposing ``id``, ``title`` and ``url`` attributes.

    Returns
    -------
    Row
        Fields ``id``, ``title`` and ``url``. ``url`` holds the
        network-location part of the original URL (for example
        ``'archive.org'``), or an empty string when the reference has no URL.
    """
    # Missing or empty URL: nothing to parse, and urlparse must not be given None.
    if not entity.url:
        return Row(id=entity.id, title=entity.title, url='')
    # No per-row print here: this function runs once per reference inside Spark tasks.
    parsed_url = urlparse(entity.url)
    return Row(id=entity.id, title=entity.title, url=parsed_url.netloc)

# Keep only the references that have a URL and replace each URL by its domain via id_.
# (No counting happens here; this only previews the mapped rows.)
rdd__ = references.where("length(url)>0").rdd.map(id_)
rdd__.take(3)

[Row(id=655845, title='Battle of Athens (1946)', url='archive.org'),
 Row(id=655845, title='Battle of Athens (1946)', url='archive.org'),
 Row(id=655845, title='Battle of Athens (1946)', url='www.americanheritage.com')]

In [None]:
# Apply id_ to every reference (no URL filter here) and build a DataFrame from the result.
mapped_id_ref_df = sqlContext.createDataFrame(references.rdd.map(id_))
# Preview 30 rows.
mapped_id_ref_df.show(30)

In [None]:
# Total number of links = sum of the per-domain counts held in domains_count.
number_links = domains_count.values().reduce(add)
print("Total number of links: {}".format(number_links))

In [None]:
def percentage(rdd):
    """Build a Row with a domain's reference count and its share of all links.

    Parameters
    ----------
    rdd : sequence
        A single ``(domain, count)`` pair (despite the name, not an RDD).

    Returns
    -------
    Row
        Fields ``domain``, ``count`` and ``perc``, where ``perc`` is ``count``
        expressed as a percentage of the global ``number_links``.
    """
    domain, count = rdd[0], rdd[1]
    # float() forces true division, so the percentage is not truncated on Python 2.
    return Row(domain=domain, count=count, perc=count * 100 / float(number_links))

# Per-domain counts and percentages as a DataFrame, most-referenced domains first.
domains_distribution = (
    sqlContext.createDataFrame(domains_count.map(percentage))
    .orderBy(desc("count"))
)
domains_distribution.show()

In [None]:
# Convert the Spark DataFrame to pandas and index it by domain for plotting.
domains_distribution_pd = domains_distribution.toPandas().set_index('domain')

In [None]:
# Keep the first 15 rows of the (count-sorted) distribution.
# NOTE: the variable name says "20" but only 15 rows are kept.
domains_distribution_pd20 = domains_distribution_pd.head(15)

# Horizontal bar plot: one bar per domain, bar length = percentage of all references.
f, ax1 = plt.subplots()
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.barplot(x=domains_distribution_pd20['perc'], y=domains_distribution_pd20.index,
            palette="YlOrRd", ax=ax1, orient='h')
ax1.set_xlabel('Percentage of references [%]', fontsize=18)
ax1.set_ylabel('Domains', fontsize=18)
f.suptitle('Bar plot of percentage of references for domains', fontsize=20)
f.set_size_inches(10, 10)
plt.show()


In [None]:
# NOTE(review): this cell repeats the preceding plotting cell; consider removing one of them.
# Keep the first 15 rows of the (count-sorted) distribution.
domains_distribution_pd20 = domains_distribution_pd.head(15)

# Horizontal bar plot: one bar per domain, bar length = percentage of all references.
f, ax1 = plt.subplots()
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.barplot(x=domains_distribution_pd20['perc'], y=domains_distribution_pd20.index,
            palette="YlOrRd", ax=ax1, orient='h')
ax1.set_xlabel('Percentage of references [%]', fontsize=18)
ax1.set_ylabel('Domains', fontsize=18)
f.suptitle('Bar plot of percentage of references for domains', fontsize=20)
f.set_size_inches(10, 10)
plt.show()
