In [176]:
import json

from pyspark.sql import SparkSession

from pyspark.sql.types import *
import pyspark.sql.types as spark_types
import pyspark.sql.functions as sf

def spark_schema_from_json(j):
    """Build a Spark ``StructType`` from a sequence of field descriptions.

    Parameters
    ----------
    j : iterable of dict
        One dict per column. ``"name"`` is required. ``"type"`` is a type
        string handed to pyspark's ``_parse_datatype_string`` (defaults to
        ``"string"``), ``"nullable"`` defaults to ``True`` and
        ``"metadata"`` defaults to ``None``.

    Returns
    -------
    StructType
        A struct with one ``StructField`` per entry of ``j``, in order.

    Raises
    ------
    KeyError
        If an entry has no ``"name"`` key.
    """
    fields = []
    for spec in j:
        column_name = spec["name"]
        # NOTE: `_parse_datatype_string` is a private pyspark helper.
        column_type = spark_types._parse_datatype_string(spec.get("type", "string"))
        is_nullable = spec.get("nullable", True)
        column_metadata = spec.get("metadata", None)
        fields.append(
            StructField(column_name, column_type, is_nullable, column_metadata)
        )
    return StructType(fields)

# Spark session running on the "local" master; reused if one already exists.
spark = (
    SparkSession.builder
    .master("local")
    .appName("GH Users")
    .getOrCreate()
)

# Column specifications for the dump, looked up below by CSV file name
# (e.g. db_schema["projects.csv"]).
with open("ghtorrent-schema.json") as f:
    db_schema = json.load(f)

In [177]:
# Load projects.csv with the schema declared for it in db_schema.
# Fields holding the literal \N are read as NULL.
df_projects = (
    spark.read
    .schema(spark_schema_from_json(db_schema["projects.csv"]))
    .option("nullValue", "\\N")
    .csv("/home/hthuwal/Desktop/Ghtorrent_heads/projects.csv")
)

In [178]:
# Load issues.csv with the schema declared for it in db_schema.
# Fields holding the literal \N are read as NULL.
df_issues = (
    spark.read
    .schema(spark_schema_from_json(db_schema["issues.csv"]))
    .option("nullValue", "\\N")
    .csv("/home/hthuwal/Desktop/Ghtorrent_heads/issues.csv")
)

In [179]:
# Load commits.csv with the schema declared for it in db_schema.
# Fields holding the literal \N are read as NULL.
df_commits = (
    spark.read
    .schema(spark_schema_from_json(db_schema["commits.csv"]))
    .option("nullValue", "\\N")
    .csv("/home/hthuwal/Desktop/Ghtorrent_heads/commits.csv")
)

In [180]:
# Load pull_requests.csv with the schema declared for it in db_schema.
# Fields holding the literal \N are read as NULL.
df_pull_requests = (
    spark.read
    .schema(spark_schema_from_json(db_schema["pull_requests.csv"]))
    .option("nullValue", "\\N")
    .csv("/home/hthuwal/Desktop/Ghtorrent_heads/pull_requests.csv")
)

In [181]:
# Load pull_request_history.csv with the schema declared for it in db_schema.
# Fields holding the literal \N are read as NULL.
df_pull_request_history = (
    spark.read
    .schema(spark_schema_from_json(db_schema["pull_request_history.csv"]))
    .option("nullValue", "\\N")
    .csv("/home/hthuwal/Desktop/Ghtorrent_heads/pull_request_history.csv")
)

# Lifespan of projects based on commits


In [186]:
# Project lifespan from commits: days between each project's earliest and
# latest commit timestamp, longest-lived projects first.
df_commits.createOrReplaceTempView("temp")

# Rows whose created_at is not earlier than the current time, or whose
# project_id is NULL, are left out of the aggregation.
lifespan_commits = (
    spark.sql("""SELECT project_id, 
                      min(created_at) as first,
                      max(created_at) as last
                      FROM temp
                      WHERE
                      created_at < CURRENT_TIMESTAMP
                      AND project_id IS NOT NULL
                      GROUP BY project_id
                      """)
    .withColumn("duration", sf.datediff(sf.col("last"), sf.col("first")))
    .select("project_id", "duration")
    .sort(sf.desc("duration"))
)
lifespan_commits.show()

+----------+--------+
|project_id|duration|
+----------+--------+
|    646812|   15589|
|     26340|   15588|
|    688528|   15587|
|    703528|   15585|
|    331548|   15583|
|    642580|   15583|
|    149978|   15583|
|     19275|   15566|
|    193164|   15562|
|    143064|   15561|
|     59931|   15554|
|     18215|   15536|
|    202787|   15531|
|    162304|   15457|
|     50473|   15380|
|    333497|   15240|
|    447436|   15173|
|    782192|   15156|
|    633117|   14993|
|     57532|   14666|
+----------+--------+
only showing top 20 rows



# Lifespan of projects based on issues

In [196]:
# Project lifespan from issues: days between each repository's earliest and
# latest issue timestamp, longest-lived first. repo_id is exposed as
# project_id so the result lines up with the other lifespan tables.
df_issues.createOrReplaceTempView("temp")

# Rows whose created_at is not earlier than the current time, or whose
# repo_id is NULL, are left out of the aggregation.
lifespan_issues = (
    spark.sql("""SELECT repo_id as project_id,
                      min(created_at) as first,
                      max(created_at) as last
                      FROM temp
                      WHERE created_at < CURRENT_TIMESTAMP
                      AND repo_id IS NOT NULL
                      GROUP BY repo_id
                      """)
    .withColumn("duration", sf.datediff(sf.col("last"), sf.col("first")))
    .select("project_id", "duration")
    .sort(sf.desc("duration"))
)
lifespan_issues.show()

+----------+--------+
|project_id|duration|
+----------+--------+
|    825924|    1969|
|     32678|    1716|
|     32023|    1636|
|      1334|    1615|
|      1707|    1611|
|     33145|    1609|
|      1043|    1599|
|   3746639|    1598|
|      1678|    1598|
|      6584|    1598|
|     12558|    1597|
|     28304|    1597|
|       411|    1597|
|      6883|    1596|
|     57415|    1595|
|       189|    1595|
|      5245|    1594|
|    269963|    1594|
|     72295|    1593|
|      7894|    1593|
+----------+--------+
only showing top 20 rows



# Lifespan of projects based on pull request activity

In [197]:
# Most recent history entry per pull request: the latest created_at among a
# pull request's history rows, newest first.
last_act_plreq = (
    df_pull_request_history
    .groupBy("pull_request_id")
    .agg(sf.max("created_at").alias("last_action_time"))
    .sort(sf.desc("last_action_time"))
)
last_act_plreq.show()

+---------------+-------------------+
|pull_request_id|   last_action_time|
+---------------+-------------------+
|        2922704|2014-02-04 23:53:19|
|        2542092|2014-02-04 23:53:12|
|        2754127|2014-02-04 23:47:51|
|        2925089|2014-02-04 23:34:39|
|        2924069|2014-02-04 23:00:53|
|        2864099|2014-02-04 22:55:40|
|        2913490|2014-02-04 22:43:24|
|        2916280|2014-02-04 22:39:29|
|         133323|2014-02-04 22:10:28|
|        2889989|2014-02-04 22:06:13|
|        2919026|2014-02-04 21:59:10|
|        2872286|2014-02-04 21:55:03|
|        2743840|2014-02-04 21:53:25|
|        2504948|2014-02-04 21:39:08|
|        2485363|2014-02-04 21:22:29|
|        2813838|2014-02-04 21:19:57|
|        2584093|2014-02-04 21:15:13|
|        2885327|2014-02-04 21:02:52|
|        2881601|2014-02-04 20:56:44|
|        2928250|2014-02-04 20:55:50|
+---------------+-------------------+
only showing top 20 rows



In [198]:
# Aliases let the join condition name each side unambiguously.
df1 = df_pull_requests.alias("df1")
df2 = last_act_plreq.alias("df2")

# Attach each pull request's last action time to its (head repo, base repo)
# pair; pull requests missing either repository id are dropped.
df_temp = (
    df1.join(
        df2,
        on=sf.col("df1.id") == sf.col("df2.pull_request_id"),
        how="inner",
    )
    .select(df1.head_repo_id, df1.base_repo_id, df2.last_action_time)
    .where(df1.head_repo_id.isNotNull() & df1.base_repo_id.isNotNull())
    .sort(sf.desc("last_action_time"))
)

df_temp.show()

+------------+------------+-------------------+
|head_repo_id|base_repo_id|   last_action_time|
+------------+------------+-------------------+
|     6013443|     5940987|2014-02-04 23:53:19|
|     6680430|     1958118|2014-02-04 23:53:12|
|     7247086|     6031587|2014-02-04 23:47:51|
|     7749253|     7340343|2014-02-04 23:34:39|
|     7746587|       82594|2014-02-04 23:00:53|
|     7579868|     1356577|2014-02-04 22:55:40|
|     7249635|     7249635|2014-02-04 22:43:24|
|     6193996|     6184871|2014-02-04 22:39:29|
|      382315|      174554|2014-02-04 22:10:28|
|     6307316|     6307316|2014-02-04 22:06:13|
|     7730159|     4736847|2014-02-04 21:59:10|
|      795282|      139327|2014-02-04 21:55:03|
|     5049920|     5049920|2014-02-04 21:53:25|
|     6328603|     1905872|2014-02-04 21:39:08|
|     4511467|     4346078|2014-02-04 21:22:29|
|     7436039|     2418719|2014-02-04 21:19:57|
|     6421675|     6199549|2014-02-04 21:15:13|
|     2207012|     2207012|2014-02-04 21

In [200]:
# Flatten the (head repo, base repo) pairs into one project_id column so a
# pull request counts as activity for both repositories it touches.
# TODO: each UNION ALL branch reads `temp` separately (two passes); this
# should be done in a single pass.
query = """ 
            SELECT temp.head_repo_id as project_id, temp.last_action_time 
            FROM temp WHERE temp.head_repo_id IS NOT NULL
            UNION ALL
            SELECT temp.base_repo_id as project_id, temp.last_action_time 
            FROM temp WHERE temp.base_repo_id IS NOT NULL
        """

df_temp.createOrReplaceTempView("temp")

# One row per (repository, pull-request last action time).
last_pull_act_repo = spark.sql(query)
last_pull_act_repo.show()

+----------+-------------------+
|project_id|   last_action_time|
+----------+-------------------+
|   6013443|2014-02-04 23:53:19|
|   6680430|2014-02-04 23:53:12|
|   7247086|2014-02-04 23:47:51|
|   7749253|2014-02-04 23:34:39|
|   7746587|2014-02-04 23:00:53|
|   7579868|2014-02-04 22:55:40|
|   7249635|2014-02-04 22:43:24|
|   6193996|2014-02-04 22:39:29|
|    382315|2014-02-04 22:10:28|
|   6307316|2014-02-04 22:06:13|
|   7730159|2014-02-04 21:59:10|
|    795282|2014-02-04 21:55:03|
|   5049920|2014-02-04 21:53:25|
|   6328603|2014-02-04 21:39:08|
|   4511467|2014-02-04 21:22:29|
|   7436039|2014-02-04 21:19:57|
|   6421675|2014-02-04 21:15:13|
|   2207012|2014-02-04 21:02:52|
|   4179265|2014-02-04 20:56:44|
|   7756497|2014-02-04 20:55:50|
+----------+-------------------+
only showing top 20 rows



In [203]:
# Project lifespan from pull requests: days between the earliest and the
# latest last_action_time recorded for each project, longest-lived first.
last_pull_act_repo.createOrReplaceTempView("temp")

# Rows whose last_action_time is not earlier than the current time, or whose
# project_id is NULL, are left out of the aggregation.
lifespan_pulls = (
    spark.sql("""SELECT project_id,
                      min(last_action_time) as first,
                      max(last_action_time) as last
                      FROM temp
                      WHERE last_action_time < CURRENT_TIMESTAMP
                      AND project_id IS NOT NULL
                      GROUP BY project_id
                      """)
    .withColumn("duration", sf.datediff(sf.col("last"), sf.col("first")))
    .select("project_id", "duration")
    .sort(sf.desc("duration"))
)
lifespan_pulls.show()

+----------+--------+
|project_id|duration|
+----------+--------+
|    251975|    1249|
|       770|    1247|
|     18474|    1243|
|     22810|    1242|
|    499790|    1242|
|     48582|    1242|
|     23830|    1241|
|    390616|    1241|
|     50331|    1239|
|       634|    1238|
|      1920|    1237|
|      3826|    1235|
|     53603|    1235|
|     91833|    1233|
|     14073|    1233|
|       183|    1232|
|     35079|    1230|
|        37|    1230|
|      5219|    1229|
|    552190|    1225|
+----------+--------+
only showing top 20 rows



# Lifespan single table and reduction

In [217]:
# Combine the three lifespan measures into a single table with one row per
# project: url, owner, language, and the lifespan in days according to
# commits, issues and pull requests.
df_projects.createOrReplaceTempView("temp")
lifespan_commits.createOrReplaceTempView("t1")
lifespan_issues.createOrReplaceTempView("t2")
lifespan_pulls.createOrReplaceTempView("t3")

# The previous `FROM temp, t1, t2, t3` had no join condition, i.e. a
# Cartesian product pairing every project with every duration row. Each
# lifespan view is joined to its own project instead. t1/t2/t3 are grouped
# by project_id upstream, so these joins never duplicate a project row.
# LEFT JOIN keeps projects that lack one kind of activity (that duration is
# NULL); switch to INNER JOIN to keep only projects present in all three.
# NOTE(review): assumes the projects key column is named `id` (as
# pull_requests' is, see `df1.id`) -- confirm against ghtorrent-schema.json.
query = """
    SELECT temp.url, temp.owner_id, temp.language,
           t1.duration as commits, t2.duration as issues, t3.duration as pull_requests
    FROM temp
    LEFT JOIN t1 ON t1.project_id = temp.id
    LEFT JOIN t2 ON t2.project_id = temp.id
    LEFT JOIN t3 ON t3.project_id = temp.id
"""

lifespan = spark.sql(query)
