In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import modules.parsers as parsers
import glob

In [2]:
spark = SparkSession.builder\
                     .master("spark://pop-os.localdomain:7077")\
                     .enableHiveSupport() \
                     .getOrCreate()
sc = spark.sparkContext

22/04/07 22:22:45 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 10.10.64.126 instead (on interface wlp59s0)
22/04/07 22:22:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/07 22:22:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
import pyspark.sql.functions as F
# Extract a URL from a string

## df_extract_url
#  * Take input of a dataframe
#  * Takes a column name to extract url from body of text
# Regex Source: https://stackoverflow.com/questions/28185064/python-infinite-loop-in-regex-to-match-url
def df_extract_url(tmp_df, tmp_col):
    return tmp_df.withColumn("url", F.regexp_extract(F.col(tmp_col), r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", 0))          


# Get domain name from URL and put it in domainname table
# Regex Source: https://stackoverflow.com/questions/25703360/regular-expression-extract-subdomain-domain
def df_parse_domainname(tmp_df, tmp_col):
    tmp_df = tmp_df.withColumn( "domain_name", F.regexp_extract(F.col(tmp_col) , r'^(?:http:\/\/|www\.|https:\/\/)([^\/]+)', 1)) 
    return tmp_df

# Get the domain name of an email from an email address
def df_parse_email(tmp_df, tmp_col):
    tmp_df = tmp_df.withColumn("email_username" , F.regexp_extract(F.col(tmp_col),   r'([^@]+)', 1)) 
    tmp_df = tmp_df.withColumn("email_domain"   , F.regexp_extract(F.col(tmp_col) ,  r'@(.*)'  , 1)) 
    return tmp_df

## Working with git repos

In [4]:
files = glob.glob('./out/git_out/**/*.json',  recursive=True)
first_file = files.pop()
git_repo_df = spark.read.json(first_file)
git_repo_df = parsers.df_parse_email(git_repo_df,  "author_email")
git_repo_df.first()
for tmp_df_path in files:
    tmp_git_repo_df = spark.read.json(tmp_df_path)
    tmp_git_repo_df = parsers.df_parse_email(tmp_git_repo_df,  "author_email")
    final_df = git_repo_df.unionByName(tmp_git_repo_df)

                                                                                

In [5]:
final_df.show(10, False) 

+----------------------------------------------+----------------+-------+------------------------------------+---------------------+------------------------+
|author_email                                  |author_name     |commits|remote_url                          |email_username       |email_domain            |
+----------------------------------------------+----------------+-------+------------------------------------+---------------------+------------------------+
|0xfawkes@protonmail.com                       |0xfawkes        |1      |https://github.com/ethereum/EIPs.git|0xfawkes             |protonmail.com          |
|12873030+mryalamanchi@users.noreply.github.com|mr.yalamanchi   |3      |https://github.com/ethereum/EIPs.git|12873030+mryalamanchi|users.noreply.github.com|
|14004106+lightclient@users.noreply.github.com |lightclient     |62     |https://github.com/ethereum/EIPs.git|14004106+lightclient |users.noreply.github.com|
|1591639+s1na@users.noreply.github.com         |Sina

In [35]:
final_df.toPandas()

Unnamed: 0,author_email,author_name,commits,remote_url,email_username,email_domain
0,0xfawkes@protonmail.com,0xfawkes,1,https://github.com/ethereum/EIPs.git,0xfawkes,protonmail.com
1,12873030+mryalamanchi@users.noreply.github.com,mr.yalamanchi,3,https://github.com/ethereum/EIPs.git,12873030+mryalamanchi,users.noreply.github.com
2,14004106+lightclient@users.noreply.github.com,lightclient,62,https://github.com/ethereum/EIPs.git,14004106+lightclient,users.noreply.github.com
3,1591639+s1na@users.noreply.github.com,Sina Mahmoodi,2,https://github.com/ethereum/EIPs.git,1591639+s1na,users.noreply.github.com
4,1641795+vikmeup@users.noreply.github.com,Viktor Radchenko,1,https://github.com/ethereum/EIPs.git,1641795+vikmeup,users.noreply.github.com
...,...,...,...,...,...,...
508,rooooooooob@users.noreply.github.com,rooooooooob,2,https://github.com/cardano-foundation/CIPs.git,rooooooooob,users.noreply.github.com
509,rphair@cosd.com,Robert Phair,2,https://github.com/cardano-foundation/CIPs.git,rphair,cosd.com
510,sebastiengllmt@gmail.com,Sebastien Guillemot,6,https://github.com/cardano-foundation/CIPs.git,sebastiengllmt,gmail.com
511,shawn_mcmurdo@yahoo.com,Shawn McMurdo,5,https://github.com/cardano-foundation/CIPs.git,shawn_mcmurdo,yahoo.com


## Working with reddit data

In [6]:
text = ['My Profile: https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles in the portal of https://www.geeksforgeeks.org/',
        "Hello world https://www.udacity.com/", 
        "Hello World www.udacity.com", 
        "What about these URLs google.com NOPE",  
]

df = spark.createDataFrame(text, T.StringType()).toDF("text")

df = df.withColumn("url", F.regexp_extract(df.text, r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", 0))          
df = df.withColumn("domain", F.regexp_extract(df.url, r'^(?:http:\/\/|www\.|https:\/\/)([^\/]+)', 0))            
df[["url", "domain"]].show(5, False)

                                                                                

+------------------------------------------------------------+------------------------------+
|url                                                         |domain                        |
+------------------------------------------------------------+------------------------------+
|https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles|https://auth.geeksforgeeks.org|
|https://www.udacity.com/                                    |https://www.udacity.com       |
|www.udacity.com                                             |www.udacity.com               |
|                                                            |                              |
+------------------------------------------------------------+------------------------------+



In [7]:
reddit_df = spark.read.json('./data/reddit-stuff/xaa')# RC_2018-01-01.ndjson')

In [8]:
reddit_df = df_extract_url(reddit_df, "body")

In [9]:
reddit_df[["body", "url"]].filter( F.col("url") != "" ).show(3)

+--------------------+--------------------+
|                body|                 url|
+--------------------+--------------------+
|We agree and that...|https://pecuniare...|
|We agree and that...|https://pecuniare...|
|We agree and that...|https://pecuniare...|
+--------------------+--------------------+
only showing top 3 rows



In [10]:
reddit_df = parsers.df_parse_domainname(reddit_df, "url")

In [11]:
reddit_df[["url", "domain_name"]].filter( F.col("domain_name") != "" ).show(5, False)

+----------------------------+-------------------+
|url                         |domain_name        |
+----------------------------+-------------------+
|https://pecuniaresearch.com/|pecuniaresearch.com|
|https://pecuniaresearch.com/|pecuniaresearch.com|
|https://pecuniaresearch.com/|pecuniaresearch.com|
|https://pecuniaresearch.com/|pecuniaresearch.com|
|https://pecuniaresearch.com/|pecuniaresearch.com|
+----------------------------+-------------------+
only showing top 5 rows



## Data Integration Time
Separately grouBy the domain names, one for reddit, one for git then perform join on the dataframes 

In [12]:
reddit_domains_grouped = reddit_df.groupBy("domain_name").count()
reddit_domains_grouped = reddit_domains_grouped.withColumnRenamed("count", "reddit_domain_count")

In [13]:
git_domains_grouped = git_repo_df.groupBy("email_domain").count()
git_domains_grouped = git_domains_grouped.withColumnRenamed("count", "git_domain_count")

In [14]:
reddit_domains_grouped.show(3)

+-------------------+-------------------+
|        domain_name|reddit_domain_count|
+-------------------+-------------------+
|         www.nhs.uk|                 13|
|           youtu.be|                 13|
|pecuniaresearch.com|                 13|
+-------------------+-------------------+
only showing top 3 rows



In [15]:
git_domains_grouped.show(3)

+--------------------+----------------+
|        email_domain|git_domain_count|
+--------------------+----------------+
|         tokenate.io|               1|
|patriciopalladino...|               1|
|          aragon.one|               2|
+--------------------+----------------+
only showing top 3 rows



In [16]:
domains_list = reddit_domains_grouped[["domain_name", "reddit_domain_count"]].\
    withColumnRenamed("domain_name","domain_name").\
    union(git_domains_grouped[["email_domain", "git_domain_count"]])

In [23]:
joined_domains = reddit_domains_grouped.alias("reddit")\
  .join(git_domains_grouped.alias("git"), F.col("git.email_domain") == F.col("reddit.domain_name"))

In [27]:
git_domains_grouped.show()

+--------------------+----------------+
|        email_domain|git_domain_count|
+--------------------+----------------+
|         tokenate.io|               1|
|patriciopalladino...|               1|
|          aragon.one|               2|
|          majoolr.io|               1|
|           zoltu.net|               1|
|        tutanota.com|               1|
|        stanford.edu|               1|
|          riseup.net|               1|
|mohamedabdulaziz.com|               1|
|       sigmaprime.io|               1|
|       paulrberg.com|               1|
|users.noreply.git...|             119|
|           swende.se|               1|
|      googlemail.com|               1|
|          twurst.com|               1|
|             fork.at|               1|
|           gmail.com|             172|
|        bitspill.net|               1|
|       mtpelerin.com|               1|
|         io.builders|               1|
+--------------------+----------------+
only showing top 20 rows



In [24]:
joined_domains.show()

+-----------+-------------------+------------+----------------+
|domain_name|reddit_domain_count|email_domain|git_domain_count|
+-----------+-------------------+------------+----------------+
|           |                435|            |               1|
+-----------+-------------------+------------+----------------+



In [33]:
unioned_domains = reddit_domains_grouped.unionByName(git_domains_grouped.withColumnRenamed("email_domain", "domain_name"),  allowMissingColumns=True)

In [34]:
unioned_domains.show()

+--------------------+-------------------+----------------+
|         domain_name|reddit_domain_count|git_domain_count|
+--------------------+-------------------+----------------+
|          www.nhs.uk|                 13|            null|
|            youtu.be|                 13|            null|
| pecuniaresearch.com|                 13|            null|
|      www.reddit.com|                 13|            null|
|                    |                435|            null|
|  bioshock.wikia.com|                 13|            null|
|         tokenate.io|               null|               1|
|patriciopalladino...|               null|               1|
|          aragon.one|               null|               2|
|          majoolr.io|               null|               1|
|           zoltu.net|               null|               1|
|        tutanota.com|               null|               1|
|        stanford.edu|               null|               1|
|          riseup.net|               nul

In [30]:
unioned_domains = reddit_domains_grouped.unionByName(git_domains_grouped)

AnalysisException: Cannot resolve column name "domain_name" among (email_domain, git_domain_count)