In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import modules.parsers as parsers
import glob

In [2]:
import configparser
import os
# Load AWS credentials from the local config file and export them as
# environment variables (presumably consumed by Spark/Hadoop for the s3a://
# reads later in the notebook -- confirm).
CONFIG_PATH = 'dl.cfg'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty result means the config was not loaded.
# Fail here with a clear message instead of a later KeyError on config['AWS'].
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Could not read AWS settings from {CONFIG_PATH!r}")

for key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[key] = config['AWS'][key]

In [3]:
# Create (or reuse) a Hive-enabled Spark session on the standalone master.
# The hadoop-aws package is requested through spark.jars.packages; the s3a://
# paths used later presumably depend on it.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.3")
    .master("spark://pop-os.localdomain:7077")
    .enableHiveSupport()
    .getOrCreate()
)
# Handle to the underlying SparkContext.
sc = spark.sparkContext

22/04/12 14:46:14 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.2.99 instead (on interface eno1)
22/04/12 14:46:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/paul/.ivy2/cache
The jars for the packages stored in: /home/paul/.ivy2/jars
:: loading settings :: url = jar:file:/home/paul/Projects/DataEngineering/Capstone/env/lib/python3.8/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-47d55f3f-81ff-4b88-8319-5166a9e36d92;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;2.7.3 in central
	found org.apache.hadoop#hadoop-common;2.7.3 in central
	found org.apache.hadoop#hadoop-annotations;2.7.3 in central
	found com.google.guava#guava;11.0.2 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found commons-cli#commo

In [4]:
# Original note: the file is 1.5 GB and takes a couple of minutes to load into
# Spark (presumably this refers to the full RC_2018_01_01 dump, not the sample).
# The small sample file is active; swap in the commented-out URL for the full dump.
reddit_data_url = "s3a://paul-udacity-capstone/reddit/002.ndjson"  # For Testing
# reddit_data_url = "s3a://paul-udacity-capstone/reddit/RC_2018_01_01"
df_reddit = spark.read.json(path=reddit_data_url)

                                                                                

In [5]:
# Sanity check: number of reddit records loaded.
df_reddit.count()

                                                                                

1000

In [6]:
# Read the git dump JSON from S3. A single repository file is active for
# testing; the commented-out glob pattern matches every JSON file one
# directory below git-dump/.
git_data_url = "s3a://paul-udacity-capstone/git-dump/freeCodeCamp/freeCodeCamp.json"  # For Testing
# git_data_url = "s3a://paul-udacity-capstone/git-dump/*/*.json"
df_git = spark.read.json(path=git_data_url)

                                                                                

In [7]:
# Sanity check: number of git records loaded.
df_git.count()

4789

## Working with git repos

In [8]:
# Parse the "author_email" column with the project helper.
# NOTE(review): judging by the preview in the next cell, this adds
# `email_username` and `email_domain` columns -- confirm in modules/parsers.
git_repo_df = parsers.df_parse_email(df_git,  "author_email")

In [9]:
# Preview five rows as a pandas frame; limit(5) is applied before toPandas().
# NOTE(review): the rendered output contains real author names and e-mail
# addresses -- consider clearing or redacting it before sharing the notebook.
git_repo_df.limit(5).toPandas()

Unnamed: 0,author_email,author_name,commits,remote_url,email_username,email_domain
0,0nitinchauhan@gmail.com,Nitin Chauhan,3,https://github.com/freeCodeCamp/freeCodeCamp.git,0nitinchauhan,gmail.com
1,0x0936@users.noreply.github.com,Robert Richey,1,https://github.com/freeCodeCamp/freeCodeCamp.git,0x0936,users.noreply.github.com
2,10049458+dallyingllama@users.noreply.github.com,dallyingllama,2,https://github.com/freeCodeCamp/freeCodeCamp.git,10049458+dallyingllama,users.noreply.github.com
3,1010636+JeremyBlanc@users.noreply.github.com,Jeremias Blanco-Choncén,2,https://github.com/freeCodeCamp/freeCodeCamp.git,1010636+JeremyBlanc,users.noreply.github.com
4,101513652+anjiqbal@users.noreply.github.com,Anjum Iqbal,1,https://github.com/freeCodeCamp/freeCodeCamp.git,101513652+anjiqbal,users.noreply.github.com


## Working with reddit data

In [10]:
# Two-step enrichment of the reddit comments using the project helpers:
# first extract a URL from the "body" column, then parse the "url" column.
# NOTE(review): the schema printed in the next cell ends with `url` and
# `domain_name`, presumably the columns these helpers add -- confirm in
# modules/parsers.
reddit_with_urls = parsers.df_extract_url(df_reddit, "body")
reddit_df = parsers.df_parse_domainname(reddit_with_urls, "url")

In [11]:
# Inspect the columns and types of the enriched reddit frame.
reddit_df.printSchema()

root
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- url: string (nullable = true)
 |-- domain_name: string (nullable = true)



## Data Integration Time
Separately group by domain name (once for the reddit data and once for the git data), then join the two resulting DataFrames.

In [12]:
# Count reddit rows per domain, skipping rows whose url is the empty string,
# and give the count column a source-specific name.
reddit_domains_grouped = (
    reddit_df
    .where(F.col("url") != "")
    .groupBy("domain_name")
    .count()
    .withColumnRenamed("count", "reddit_domain_count")
)

In [14]:
# Print the first three rows without truncating long cell values.
reddit_df.show(n=3, truncate=False)

+-----------+----------------------+--------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+----------------+-----------+-------------+------+------+-------+------------+---------+----------+--------------------------------------------------------------------------------+------------+-----+--------+--------------+------------+-----

In [15]:
# Row count of the enriched reddit frame.
reddit_df.count()

1000

In [16]:
# Keep only `body` and `url`, restricted to rows whose url is the empty string.
# NOTE(review): this rebinds `reddit_df`, so every later cell (and any re-run
# of an earlier cell) sees only these rows and these two columns.
reddit_df = reddit_df.select("body", "url").where(F.col("url") == "")

In [17]:
# Count rows whose url is NULL.
# `F.col("url") == None` builds a SQL equality against NULL, which evaluates to
# NULL (never true) for every row, so that filter always returns zero rows no
# matter what the data holds. Column.isNull() is the correct null test.
reddit_df.filter(F.col("url").isNull()).count()

0

In [21]:
# Count git rows per e-mail domain and give the count column a
# source-specific name. This counts rows of git_repo_df, not the sum of its
# `commits` column.
git_domains_grouped = (
    git_repo_df
    .groupBy("email_domain")
    .count()
    .withColumnRenamed("count", "git_domain_count")
)

In [22]:
# reddit_domains_grouped.show(3)

In [23]:
# Print ten per-domain counts without truncating long domain names.
git_domains_grouped.show(n=10, truncate=False)

                                                                                

+---------------------------+----------------+
|email_domain               |git_domain_count|
+---------------------------+----------------+
|mowoe.com                  |1               |
|tutanota.com               |1               |
|biscui.tech                |1               |
|nl.rogers.com              |1               |
|PRASHANTs-MacBook-Air.local|1               |
|my.csun.edu                |1               |
|yandex.by                  |1               |
|coderain.co.uk             |1               |
|oxfordni.com               |1               |
|riseup.net                 |1               |
+---------------------------+----------------+
only showing top 10 rows



In [24]:
# Inner-join the two per-domain count tables on matching domain strings, so
# only domains present in both the reddit and the git data survive.
same_domain = F.col("git.email_domain") == F.col("reddit.domain_name")
joined_domains = reddit_domains_grouped.alias("reddit").join(
    git_domains_grouped.alias("git"),
    on=same_domain,
    how="inner",
)

In [25]:
# Display the domains that occur in both sources, with both counts.
joined_domains.show()



+-----------+-------------------+------------+----------------+
|domain_name|reddit_domain_count|email_domain|git_domain_count|
+-----------+-------------------+------------+----------------+
| github.com|                 13|  github.com|               3|
+-----------+-------------------+------------+----------------+



                                                                                

In [26]:
# joined_domains.write.parquet("s3a://paul-udacity-capstone/joined_domains.parquet", mode="overwrite")

In [27]:
# Align the git key column name with the reddit table (`domain_name`).
# NOTE(review): this rebinds `git_domains_grouped`; once it has run, re-running
# the join cell above fails because `git.email_domain` no longer exists.
git_domains_grouped = git_domains_grouped.withColumnRenamed("email_domain", "domain_name")

In [32]:
# Stack the git and reddit per-domain tables into one frame. Each side is first
# padded with null-valued columns for the count column it lacks, so both sides
# expose the same column names for unionByName.
# Rows are appended, not merged: a domain present in both sources yields two rows.
# (Hand-rolled alternative to the previously tried
#  reddit_domains_grouped.unionByName(git_domains_grouped, allowMissingColumns=True).)
git_columns = git_domains_grouped.columns
reddit_columns = reddit_domains_grouped.columns
diff1 = [name for name in reddit_columns if name not in git_columns]
diff2 = [name for name in git_columns if name not in reddit_columns]

git_padded = git_domains_grouped.select('*', *[F.lit(None).alias(name) for name in diff1])
reddit_padded = reddit_domains_grouped.select('*', *[F.lit(None).alias(name) for name in diff2])
df = git_padded.unionByName(reddit_padded)

In [37]:
# Display the stacked git + reddit per-domain counts (first 20 rows by default).
df.show()

[Stage 40:>                                                         (0 + 1) / 1]

+--------------------+----------------+-------------------+
|         domain_name|git_domain_count|reddit_domain_count|
+--------------------+----------------+-------------------+
|           mowoe.com|               1|               null|
|        tutanota.com|               1|               null|
|         biscui.tech|               1|               null|
|       nl.rogers.com|               1|               null|
|PRASHANTs-MacBook...|               1|               null|
|         my.csun.edu|               1|               null|
|           yandex.by|               1|               null|
|      coderain.co.uk|               1|               null|
|        oxfordni.com|               1|               null|
|          riseup.net|               1|               null|
|              usp.br|               1|               null|
|         nagarro.com|               1|               null|
|           ghosh.pro|               1|               null|
|           swedin.nu|               1| 

                                                                                