## Common Crawl data analysis with RDDs

Initialize Spark Context to read domain graph

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession, then take its SparkContext for the RDD API.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

sc = spark.sparkContext

your 131072x1 screen size is bogus. expect trouble
23/08/15 22:20:26 WARN Utils: Your hostname, DESKTOP-1UBRHFB resolves to a loopback address: 127.0.1.1; using 172.22.204.54 instead (on interface eth0)
23/08/15 22:20:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/15 22:20:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the file as an RDD of raw text lines and preview the first ten.
common_crawl_domain_counts = sc.textFile(
    name='./datafiles/Crawl_Data/cc-main-limited-domains.csv'
)
common_crawl_domain_counts.take(num=10)

                                                                                

['367855\t172-in-addr\tarpa\t1',
 '367856\taddr\tarpa\t1',
 '367857\tamphic\tarpa\t1',
 '367858\tbeta\tarpa\t1',
 '367859\tcallic\tarpa\t1',
 '367860\tch\tarpa\t1',
 '367861\td\tarpa\t1',
 '367862\thome\tarpa\t7',
 '367863\tiana\tarpa\t1',
 '367907\tlocal\tarpa\t1']

Reformatting the RDD into typed tuples

In [3]:
def fmt_domain_graph_entry(entry):
    """Parse one tab-separated line into a typed 4-tuple.

    Args:
        entry: A string with exactly four tab-separated fields:
            site id, domain, top-level domain, subdomain count.

    Returns:
        ``(int site_id, str domain, str tld, int num_subdomains)``.

    Raises:
        ValueError: If the line does not have exactly four fields, or if
            the first or last field is not an integer.
    """
    fields = entry.split('\t')
    raw_id, name, suffix, raw_count = fields
    return int(raw_id), name, suffix, int(raw_count)

# Parse every raw line into a typed tuple; the parser is handed to map() directly.
formatted_host_counts = common_crawl_domain_counts.map(fmt_domain_graph_entry)

formatted_host_counts.take(num=10)

[(367855, '172-in-addr', 'arpa', 1),
 (367856, 'addr', 'arpa', 1),
 (367857, 'amphic', 'arpa', 1),
 (367858, 'beta', 'arpa', 1),
 (367859, 'callic', 'arpa', 1),
 (367860, 'ch', 'arpa', 1),
 (367861, 'd', 'arpa', 1),
 (367862, 'home', 'arpa', 7),
 (367863, 'iana', 'arpa', 1),
 (367907, 'local', 'arpa', 1)]

Extract host counts

In [4]:
def extract_subdomain_counts(entry):
    """Return the subdomain count (fourth field) of one tab-separated line.

    Args:
        entry: A string with exactly four tab-separated fields.

    Returns:
        The last field converted to ``int``.

    Raises:
        ValueError: If the line does not have exactly four fields, or if
            the last field is not an integer.
    """
    # The first three fields are unpacked only to enforce the 4-field shape.
    _site_id, _domain, _tld, raw_count = entry.split('\t')
    return int(raw_count)

# Keep only the per-domain subdomain count from each raw line.
host_counts = common_crawl_domain_counts.map(extract_subdomain_counts)

host_counts.take(num=10)

[1, 1, 1, 1, 1, 1, 1, 7, 1, 1]

Total number of subdomains using `.reduce()`

In [5]:
# Add up all per-domain counts with a pairwise reduce.
total_host_counts = host_counts.reduce(lambda left, right: left + right)

total_host_counts

595466

In [6]:
# Alternative approach: operator.add replaces the hand-written lambda.

from operator import add

total_host_counts = host_counts.reduce(add)

total_host_counts

595466

Closing and restarting pyspark.sql session

In [7]:
# Stop the current session; the next cell obtains a session again.
spark.stop()

In [8]:
from pyspark.sql import SparkSession

# Obtain a session again after the previous one was stopped.
spark = SparkSession.builder.getOrCreate()

Read the CSV file as a Spark DataFrame

In [18]:
# Read the tab-delimited file into a DataFrame, letting Spark infer column types.
common_crawl = (
    spark.read
    .options(delimiter='\t', inferSchema=True)
    .csv('./datafiles/Crawl_Data/cc-main-limited-domains.csv')
)

common_crawl.show(n=5, truncate=False)

+------+-----------+----+---+
|_c0   |_c1        |_c2 |_c3|
+------+-----------+----+---+
|367855|172-in-addr|arpa|1  |
|367856|addr       |arpa|1  |
|367857|amphic     |arpa|1  |
|367858|beta       |arpa|1  |
|367859|callic     |arpa|1  |
+------+-----------+----+---+
only showing top 5 rows



Assign descriptive column names to the DataFrame

In [19]:
# Replace the default _c0.._c3 column names with descriptive ones.
common_crawl = common_crawl.toDF(
    'site_id',
    'domain',
    'top_level_domain',
    'num_subdomains',
)

In [21]:
# Preview the renamed columns and print the inferred schema.
common_crawl.show(n=5, truncate=False)
common_crawl.printSchema()

+-------+-----------+----------------+--------------+
|site_id|domain     |top_level_domain|num_subdomains|
+-------+-----------+----------------+--------------+
|367855 |172-in-addr|arpa            |1             |
|367856 |addr       |arpa            |1             |
|367857 |amphic     |arpa            |1             |
|367858 |beta       |arpa            |1             |
|367859 |callic     |arpa            |1             |
+-------+-----------+----------------+--------------+
only showing top 5 rows

root
 |-- site_id: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- top_level_domain: string (nullable = true)
 |-- num_subdomains: integer (nullable = true)



Saving as parquet

In [22]:
# Write the DataFrame as parquet, overwriting any existing output at this path.
common_crawl.write.mode('overwrite').parquet('./datafiles/Crawl_Data/results')

                                                                                

Checking parquet save

In [24]:
# Load the parquet output back into a DataFrame to check the save.
common_crawl_domains = spark.read.parquet('./datafiles/Crawl_Data/results/')

In [25]:
# Preview the reloaded data and print its schema.
common_crawl_domains.show(n=5, truncate=False)
common_crawl_domains.printSchema()

+-------+-----------+----------------+--------------+
|site_id|domain     |top_level_domain|num_subdomains|
+-------+-----------+----------------+--------------+
|367855 |172-in-addr|arpa            |1             |
|367856 |addr       |arpa            |1             |
|367857 |amphic     |arpa            |1             |
|367858 |beta       |arpa            |1             |
|367859 |callic     |arpa            |1             |
+-------+-----------+----------------+--------------+
only showing top 5 rows

root
 |-- site_id: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- top_level_domain: string (nullable = true)
 |-- num_subdomains: integer (nullable = true)



Creating temporary view for SQL

In [26]:
# Register the DataFrame as the temporary view 'crawl' so spark.sql() queries can select from it.
common_crawl_domains.createOrReplaceTempView('crawl')

Total number of domains for each top-level domain

In [27]:
# Count rows per top-level domain and show the largest groups first.
(
    common_crawl_domains
    .groupBy('top_level_domain')
    .count()
    .orderBy('count', ascending=False)
    .show(n=10, truncate=False)
)

+----------------+-----+
|top_level_domain|count|
+----------------+-----+
|edu             |18547|
|gov             |15007|
|travel          |6313 |
|coop            |5319 |
|jobs            |3893 |
|post            |117  |
|map             |34   |
|arpa            |11   |
+----------------+-----+



Using SQL

In [28]:
# Per-top-level-domain count of domains, expressed as SQL against the 'crawl' view.
spark.sql(
    """
    SELECT top_level_domain, COUNT(domain) AS count
    FROM crawl
    GROUP BY top_level_domain
    ORDER BY count DESC;
    """
).show(n=10, truncate=False)

+----------------+-----+
|top_level_domain|count|
+----------------+-----+
|edu             |18547|
|gov             |15007|
|travel          |6313 |
|coop            |5319 |
|jobs            |3893 |
|post            |117  |
|map             |34   |
|arpa            |11   |
+----------------+-----+



Calculating number of subdomains for each top-level domain

In [29]:
# Sum subdomain counts per top-level domain. The aggregate column is named
# 'sum(num_subdomains)' by Spark, which is the name the ordering refers to.
(
    common_crawl_domains
    .groupBy('top_level_domain')
    .sum('num_subdomains')
    .orderBy('sum(num_subdomains)', ascending=False)
    .show(n=10, truncate=False)
)

+----------------+-------------------+
|top_level_domain|sum(num_subdomains)|
+----------------+-------------------+
|edu             |484438             |
|gov             |85354              |
|travel          |10768              |
|coop            |8683               |
|jobs            |6023               |
|post            |143                |
|map             |40                 |
|arpa            |17                 |
+----------------+-------------------+



Using SQL

In [30]:
# Per-top-level-domain subdomain totals, expressed as SQL against the 'crawl' view.
spark.sql(
    """
    SELECT top_level_domain, SUM(num_subdomains) AS total_count
    FROM crawl
    GROUP BY top_level_domain
    ORDER BY total_count DESC;
    """
).show(n=10, truncate=False)

+----------------+-----------+
|top_level_domain|total_count|
+----------------+-----------+
|edu             |484438     |
|gov             |85354      |
|travel          |10768      |
|coop            |8683       |
|jobs            |6023       |
|post            |143        |
|map             |40         |
|arpa            |17         |
+----------------+-----------+



Looking up the total number of subdomains for `nps.gov`

In [32]:
# Select the row for nps.gov; both conditions are combined into a single filter.
(
    common_crawl_domains
    .select('top_level_domain', 'domain', 'num_subdomains')
    .filter(
        (common_crawl_domains.domain == 'nps')
        & (common_crawl_domains.top_level_domain == 'gov')
    )
    .show(truncate=False)
)

+----------------+------+--------------+
|top_level_domain|domain|num_subdomains|
+----------------+------+--------------+
|gov             |nps   |178           |
+----------------+------+--------------+



In [33]:
# The nps.gov lookup expressed as SQL against the 'crawl' view.
spark.sql(
    """
    SELECT top_level_domain, domain, num_subdomains
    FROM crawl
    WHERE domain = 'nps' AND top_level_domain = 'gov';
    """
).show(n=20, truncate=False)

+----------------+------+--------------+
|top_level_domain|domain|num_subdomains|
+----------------+------+--------------+
|gov             |nps   |178           |
+----------------+------+--------------+



Closing Spark Session

In [34]:
# Stop the Spark session now that the analysis is finished.
spark.stop()