In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import modules.parsers as parsers
import glob

In [2]:
spark = SparkSession.builder\
                     .master("spark://pop-os.localdomain:7077")\
                     .enableHiveSupport()\
                     .getOrCreate()
sc = spark.sparkContext

22/04/08 14:09:15 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.2.99 instead (on interface eno1)
22/04/08 14:09:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/08 14:09:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Working with git repos

In [3]:
files = glob.glob('./out/git_out/**/*.json',  recursive=True)
first_file = files.pop()
git_repo_df = spark.read.json(first_file)
git_repo_df = parsers.df_parse_email(git_repo_df,  "author_email")
git_repo_df.first()
for tmp_df_path in files:
    tmp_git_repo_df = spark.read.json(tmp_df_path)
    tmp_git_repo_df = parsers.df_parse_email(tmp_git_repo_df,  "author_email")
    final_df = git_repo_df.unionByName(tmp_git_repo_df)

                                                                                

In [4]:
final_df.toPandas()

Unnamed: 0,author_email,author_name,commits,remote_url,email_username,email_domain
0,100390577+anitajov@users.noreply.github.com,anitajov,1,https://github.com/cardano-foundation/CIPs.git,100390577+anitajov,users.noreply.github.com
1,12563287+kevinhammond@users.noreply.github.com,Kevin Hammond,2,https://github.com/cardano-foundation/CIPs.git,12563287+kevinhammond,users.noreply.github.com
2,19835357+intricate@users.noreply.github.com,Luke,4,https://github.com/cardano-foundation/CIPs.git,19835357+intricate,users.noreply.github.com
3,31965230+katomm@users.noreply.github.com,Tommy Kammerer,1,https://github.com/cardano-foundation/CIPs.git,31965230+katomm,users.noreply.github.com
4,35738310+alessandrokonrad@users.noreply.github...,alessandrokonrad,2,https://github.com/cardano-foundation/CIPs.git,35738310+alessandrokonrad,users.noreply.github.com
...,...,...,...,...,...,...
508,z_robzizo@yahoo.com,robzizo,1,https://github.com/ethereum/EIPs.git,z_robzizo,yahoo.com
509,zaq1tomo@gmail.com,zaq1tomo,10,https://github.com/ethereum/EIPs.git,zaq1tomo,gmail.com
510,zhous1998@gmail.com,Derek周朝晖,5,https://github.com/ethereum/EIPs.git,zhous1998,gmail.com
511,zzn-github@zzn.im,xinbenlv,2,https://github.com/ethereum/EIPs.git,zzn-github,zzn.im


## Working with reddit data

In [5]:
def df_extract_url(tmp_df, tmp_col):
    return tmp_df.withColumn("url", F.regexp_extract(F.col(tmp_col), r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", 0))          

def df_parse_domainname(tmp_df, tmp_col):
    tmp_df = tmp_df.withColumn( "domain_name", F.regexp_extract(F.col(tmp_col) , r'^(?:http:\/\/|www\.|https:\/\/)([^\/]+)', 1)) 
    return tmp_df

reddit_df = spark.read.json('./data/002.ndjson')# RC_2018-01-01.ndjson')
reddit_df = df_extract_url(reddit_df, "body")
reddit_df = df_parse_domainname(reddit_df, "url")

In [6]:
reddit_df[["body", "url", "domain_name"]].filter( F.col("url") != "" ).show(3)

+--------------------+--------------------+--------------+
|                body|                 url|   domain_name|
+--------------------+--------------------+--------------+
|Good post but unf...|https://www.reddi...|www.reddit.com|
|First of all, mut...|http://gen.lib.ru...|gen.lib.rus.ec|
|May I ask why the...|https://imgur.com...|     imgur.com|
+--------------------+--------------------+--------------+
only showing top 3 rows



In [7]:
reddit_df.limit(1).toPandas()

22/04/08 14:09:22 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,archived,author,author_cakeday,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,...,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,url,domain_name
0,,brice_krispy,,1547600379,,,[],,,,...,1554273048,1,True,False,Sneakers,t5_2qrtt,r/Sneakers,public,,


## Data Integration Time
Separately grouBy the domain names, one for reddit, one for git then perform join on the dataframes 

In [8]:
reddit_domains_grouped = reddit_df.groupBy("domain_name").count()
reddit_domains_grouped = reddit_domains_grouped.withColumnRenamed("count", "reddit_domain_count")

In [None]:
git_domains_grouped = git_repo_df.groupBy("email_domain").count()
git_domains_grouped = git_domains_grouped.withColumnRenamed("count", "git_domain_count")

In [None]:
reddit_domains_grouped.show(3)

In [None]:
git_domains_grouped.show(3)

In [None]:
domains_list = reddit_domains_grouped[["domain_name", "reddit_domain_count"]].\
    withColumnRenamed("domain_name","domain_name").\
    union(git_domains_grouped[["email_domain", "git_domain_count"]])

In [None]:
joined_domains = reddit_domains_grouped.alias("reddit")\
  .join(git_domains_grouped.alias("git"), F.col("git.email_domain") == F.col("reddit.domain_name"))

In [None]:
git_domains_grouped.show()

In [None]:
joined_domains.show()

In [None]:
unioned_domains = reddit_domains_grouped.unionByName(git_domains_grouped.withColumnRenamed("email_domain", "domain_name"),  allowMissingColumns=True)

In [None]:
unioned_domains.show()

In [None]:
unioned_domains = reddit_domains_grouped.unionByName(git_domains_grouped)