# Spark Commits Analysis

In [14]:
# initialize Spark Session
import os
import sys

# Put the parent of the current working directory on sys.path (only once) so
# that the init_spark import below can resolve -- presumably that is where the
# helper module lives; confirm against the repository layout.
parent_of_cwd = os.path.join(os.getcwd(), "../")
top_dir = os.path.abspath(parent_of_cwd)
already_importable = top_dir in sys.path
if not already_importable:
    sys.path.append(top_dir)

from init_spark import init_spark

# Keep both handles around: the session for DataFrame/SQL work and the
# underlying SparkContext.
spark = init_spark()
sc = spark.sparkContext

Initializing Spark...
Spark found in :  /home/ubuntu/dev/spark
Spark config:
	 executor.memory=2g
	some_property=some_value
	spark.app.name=TestApp
	spark.master=local[*]
	spark.sql.warehouse.dir=/tmp/tmpctke06vv
	spark.submit.deployMode=client
	spark.ui.showConsoleProgress=true
Spark UI running on port 4041


In [15]:
# Load the commit log as a DataFrame. The file is pipe-delimited and its first
# line supplies the column names; no schema inference is requested, so every
# column is read as a string.
commits = (
    spark.read
    .options(header="true", delimiter="|")
    .csv("/data/spark-commits/spark-commits.log")
)

In [16]:
# Inspect what was loaded: column names and types first, then a sample of rows.
commits.printSchema()
# Defaults of show() spelled out: first 20 rows, long cell values truncated.
commits.show(n=20, truncate=True)

root
 |-- sha: string (nullable = true)
 |-- committer: string (nullable = true)
 |-- email: string (nullable = true)
 |-- date: string (nullable = true)
 |-- comment: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 sha|           committer|               email|                date|             comment|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|68f2142cfd2ca632a...|        Dilip Biswal|  dbiswal@us.ibm.com|Sat Feb 25 23:56:...|[SQL] Duplicate t...|
|89608cf26226e28f3...|         Wenchen Fan|wenchen@databrick...|Sat Feb 25 23:01:...|[SPARK-17075][SQL...|
|6ab60542e8e803b1d...|   Joseph K. Bradley|joseph@databricks...|Sat Feb 25 22:24:...|[MINOR][ML][DOC] ...|
|410392ed75da64c69...|           Devaraj K|  devaraj@apache.org|Sat Feb 25 21:48:...|[SPARK-15288][MES...|
|fe07de9566b345c7a...|             lvdongr|lv.dongdong@zt

In [17]:
# setup a sql table
# Registering the DataFrame as a temp view lets SQL text refer to it by name;
# the "create or replace" form keeps this cell safe to re-run.
commits.createOrReplaceTempView("commits")

# Sanity check: the view should return the same rows as the DataFrame itself.
all_commits = spark.sql("select * from commits")
all_commits.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 sha|           committer|               email|                date|             comment|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|68f2142cfd2ca632a...|        Dilip Biswal|  dbiswal@us.ibm.com|Sat Feb 25 23:56:...|[SQL] Duplicate t...|
|89608cf26226e28f3...|         Wenchen Fan|wenchen@databrick...|Sat Feb 25 23:01:...|[SPARK-17075][SQL...|
|6ab60542e8e803b1d...|   Joseph K. Bradley|joseph@databricks...|Sat Feb 25 22:24:...|[MINOR][ML][DOC] ...|
|410392ed75da64c69...|           Devaraj K|  devaraj@apache.org|Sat Feb 25 21:48:...|[SPARK-15288][MES...|
|fe07de9566b345c7a...|             lvdongr|lv.dongdong@zte.c...|Sat Feb 25 21:47:...|[SPARK-19673][SQL...|
|061bcfb869fe5f64e...|          Boaz Mohar| boazmohar@gmail.com|Sat Feb 25 11:32:...|[MINOR][DOCS] Fix...|
|8f0511ed49a353fb0...|   Herman van H

In [18]:
# find commits from databricks.com
# The LIKE pattern matches any email that ends in "databricks.com".
count_query = "select count(*) from commits where email like '%databricks.com'"
rows_query = "select * from commits where email like '%databricks.com'"

# Show the total first, then a sample of the matching rows.
for query in (count_query, rows_query):
    spark.sql(query).show()


+--------+
|count(1)|
+--------+
|    4116|
+--------+

+--------------------+-----------------+--------------------+--------------------+--------------------+
|                 sha|        committer|               email|                date|             comment|
+--------------------+-----------------+--------------------+--------------------+--------------------+
|89608cf26226e28f3...|      Wenchen Fan|wenchen@databrick...|Sat Feb 25 23:01:...|[SPARK-17075][SQL...|
|6ab60542e8e803b1d...|Joseph K. Bradley|joseph@databricks...|Sat Feb 25 22:24:...|[MINOR][ML][DOC] ...|
|8f0511ed49a353fb0...|Herman van Hovell|hvanhovell@databr...|Fri Feb 24 23:05:...|[SPARK-19650] Com...|
|4fa4cf1d4ce51ce61...|      Wenchen Fan|wenchen@databrick...|Thu Feb 23 13:22:...|[SPARK-19706][PYS...|
|9bf4e2baad0e2851d...|     Shixiong Zhu|shixiong@databric...|Thu Feb 23 11:25:...|[SPARK-19497][SS]...|
|78eae7e67fd5dec0c...|Herman van Hovell|hvanhovell@databr...|Thu Feb 23 10:25:...|[SPARK-19459] Sup...|
|10c566c

In [21]:
# find top committers
# Rows are grouped by email address, so a person who committed under several
# addresses is counted once per address rather than once overall.

s = """
select email, count(*) as total  
from commits 
group by email 
order by total desc 
limit 10
"""
top_committers = spark.sql(s)
# Second argument (truncate) is False so the full email address stays visible.
top_committers.show(10, False)

+---------------------------+-----+
|email                      |total|
+---------------------------+-----+
|matei@eecs.berkeley.edu    |1341 |
|pwendell@gmail.com         |791  |
|rxin@databricks.com        |700  |
|tathagata.das1565@gmail.com|502  |
|rxin@apache.org            |430  |
|davies@databricks.com      |389  |
|meng@databricks.com        |338  |
|sowen@cloudera.com         |332  |
|joshrosen@databricks.com   |311  |
|wenchen@databricks.com     |282  |
+---------------------------+-----+

