In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import modules.parsers as parsers
import glob

In [2]:
spark = SparkSession.builder\
                     .master("spark://pop-os.localdomain:7077")\
                     .enableHiveSupport() \
                     .getOrCreate()
sc = spark.sparkContext

22/04/07 21:28:26 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 10.10.64.126 instead (on interface wlp59s0)
22/04/07 21:28:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/07 21:28:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
import pyspark.sql.functions as F
# Extract a URL from a string

## df_extract_url
#  * Take input of a dataframe
#  * Takes a column name to extract url from body of text
# Regex Source: https://stackoverflow.com/questions/28185064/python-infinite-loop-in-regex-to-match-url
def df_extract_url(tmp_df, tmp_col):
    return tmp_df.withColumn("url", F.regexp_extract(F.col(tmp_col), r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", 0))          


# Get domain name from URL and put it in domainname table
# Regex Source: https://stackoverflow.com/questions/25703360/regular-expression-extract-subdomain-domain
def df_parse_domainname(tmp_df, tmp_col):
    tmp_df = tmp_df.withColumn( "domainname", F.regexp_extract(F.col(tmp_col) , r'^(?:http:\/\/|www\.|https:\/\/)([^\/]+)', 1)) 
    return tmp_df

# Get the domain name of an email from an email address
def df_parse_email(tmp_df, tmp_col):
    tmp_df = tmp_df.withColumn("email_username" , F.regexp_extract(F.col(tmp_col),   r'([^@]+)', 1)) 
    tmp_df = tmp_df.withColumn("email_domain"   , F.regexp_extract(F.col(tmp_col) ,  r'@(.*)'  , 1)) 
    return tmp_df

## Working with git repos

In [4]:
files = glob.glob('./out/git_out/**/*.json',  recursive=True)
first_file = files.pop()
git_repo_df = spark.read.json(first_file)
git_repo_df = parsers.df_parse_email(git_repo_df,  "author_email")
git_repo_df.first()
for tmp_df_path in files:
    tmp_git_repo_df = spark.read.json(tmp_df_path)
    tmp_git_repo_df = parsers.df_parse_email(tmp_git_repo_df,  "author_email")
    final_df = git_repo_df.unionByName(tmp_git_repo_df)
final_df.show(100, False) 

                                                                                

+--------------------------------------------------------+-------------------------+-------+------------------------------------+-------------------------------+------------------------+
|author_email                                            |author_name              |commits|remote_url                          |email_username                 |email_domain            |
+--------------------------------------------------------+-------------------------+-------+------------------------------------+-------------------------------+------------------------+
|0xfawkes@protonmail.com                                 |0xfawkes                 |1      |https://github.com/ethereum/EIPs.git|0xfawkes                       |protonmail.com          |
|12873030+mryalamanchi@users.noreply.github.com          |mr.yalamanchi            |3      |https://github.com/ethereum/EIPs.git|12873030+mryalamanchi          |users.noreply.github.com|
|14004106+lightclient@users.noreply.github.com           |lightcl

## Working with reddit data

In [5]:
text = ['My Profile: https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles in the portal of https://www.geeksforgeeks.org/',
        "Hello world https://www.udacity.com/", 
        "Hello World www.udacity.com", 
        "What about these URLs google.com NOPE",  
]

df = spark.createDataFrame(text, T.StringType()).toDF("text")

df = df.withColumn("url", F.regexp_extract(df.text, r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", 0))          
df = df.withColumn("domain", F.regexp_extract(df.url, r'^(?:http:\/\/|www\.|https:\/\/)([^\/]+)', 0))            
df[["url", "domain"]].show(5, False)

                                                                                

+------------------------------------------------------------+------------------------------+
|url                                                         |domain                        |
+------------------------------------------------------------+------------------------------+
|https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles|https://auth.geeksforgeeks.org|
|https://www.udacity.com/                                    |https://www.udacity.com       |
|www.udacity.com                                             |www.udacity.com               |
|                                                            |                              |
+------------------------------------------------------------+------------------------------+



In [6]:
reddit_df = spark.read.json('./data/reddit-stuff/RC_2018-01-01.ndjson')

                                                                                

In [7]:
reddit_df = df_extract_url(reddit_df, "body")

In [8]:
reddit_df[["body", "url"]].filter( F.col("url") != "" ).show(3)

+--------------------+--------------------+
|                body|                 url|
+--------------------+--------------------+
|We agree and that...|https://pecuniare...|
|We agree and that...|https://pecuniare...|
|We agree and that...|https://pecuniare...|
+--------------------+--------------------+
only showing top 3 rows



In [9]:
reddit_df = parsers.df_parse_domainname(reddit_df, "url")

In [10]:
reddit_df[["url", "domainname"]].filter( F.col("domainname") != "" ).show(5, False)

+----------------------------+-------------------+
|url                         |domainname         |
+----------------------------+-------------------+
|https://pecuniaresearch.com/|pecuniaresearch.com|
|https://pecuniaresearch.com/|pecuniaresearch.com|
|https://pecuniaresearch.com/|pecuniaresearch.com|
|https://pecuniaresearch.com/|pecuniaresearch.com|
|https://pecuniaresearch.com/|pecuniaresearch.com|
+----------------------------+-------------------+
only showing top 5 rows

