# Chapter 3

## 3.2.2 Create Spark session and load JSON file

In [3]:
# Build (or reuse) a Spark session named 'GitHub push counter' with the 'local' master.
spark = (
    SparkSession.builder
    .appName('GitHub push counter')
    .master('local')
    .getOrCreate()
)

In [4]:
# Load the GitHub event log from a JSON file into a DataFrame.
# No explicit schema is passed here, so the columns come from the file contents.
df_git_log = spark.read.json('dataIn/2015-03-01-0.json.gz')

In [25]:
# Print the column tree (including nested struct fields) of the loaded events.
df_git_log.printSchema()

root
 |-- actor: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id: string (nullable = true)
 |-- org: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- action: string (nullable = true)
 |    |-- before: string (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- html: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- pull_request: struct (nullable = true)
 |    |    |    |    |-- href: strin

In [26]:
# Preview up to 5 rows; long cell values are truncated in the printed table.
df_git_log.show(5)

+--------------------+--------------------+----------+--------------------+--------------------+------+--------------------+-----------------+
|               actor|          created_at|        id|                 org|             payload|public|                repo|             type|
+--------------------+--------------------+----------+--------------------+--------------------+------+--------------------+-----------------+
|[https://avatars....|2015-03-01T00:00:00Z|2614896652|[https://avatars....|[null,null,null,n...|  true|[23934080,Early-M...|      CreateEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896653|                null|[null,6dda286a3a1...|  true|[31481156,bezerra...|        PushEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896654|                null|[null,6089ce1d78d...|  true|[31475673,demianb...|        PushEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896656|                null|[created,null,[nu...|  true|[31481077,chrsmit...|IssueCommentEvent|

## 3.2.3 Filter only PushEvents
### `filter` takes a boolean Column expression or a SQL expression string

In [5]:
# Variant 1: filter with a Column expression that is true for PushEvent rows.
df_push_type1 = df_git_log.filter(df_git_log['type'] == 'PushEvent')

In [18]:
# Number of rows that passed the PushEvent filter (Column-expression variant).
df_push_type1.count()

8793

In [16]:
# Preview up to 5 of the filtered rows; every row should have type PushEvent.
df_push_type1.show(5)

+--------------------+--------------------+----------+--------------------+--------------------+------+--------------------+---------+
|               actor|          created_at|        id|                 org|             payload|public|                repo|     type|
+--------------------+--------------------+----------+--------------------+--------------------+------+--------------------+---------+
|[https://avatars....|2015-03-01T00:00:00Z|2614896653|                null|[null,6dda286a3a1...|  true|[31481156,bezerra...|PushEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896654|                null|[null,6089ce1d78d...|  true|[31475673,demianb...|PushEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896663|                null|[null,3f55d3ea1fc...|  true|[31481269,ricardo...|PushEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896667|[https://avatars....|[null,15bf6c90255...|  true|[24902852,actorap...|PushEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896668| 

In [13]:
# Variant 2: express the same PushEvent condition as a SQL expression string.
df_push_type2 = df_git_log.filter("type = 'PushEvent'")

In [19]:
# Number of rows that passed the SQL-string filter; should match the Column-expression variant.
df_push_type2.count()

8793

In [17]:
# Preview up to 5 rows produced by the SQL-string filter.
df_push_type2.show(5)

+--------------------+--------------------+----------+--------------------+--------------------+------+--------------------+---------+
|               actor|          created_at|        id|                 org|             payload|public|                repo|     type|
+--------------------+--------------------+----------+--------------------+--------------------+------+--------------------+---------+
|[https://avatars....|2015-03-01T00:00:00Z|2614896653|                null|[null,6dda286a3a1...|  true|[31481156,bezerra...|PushEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896654|                null|[null,6089ce1d78d...|  true|[31475673,demianb...|PushEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896663|                null|[null,3f55d3ea1fc...|  true|[31481269,ricardo...|PushEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896667|[https://avatars....|[null,15bf6c90255...|  true|[24902852,actorap...|PushEvent|
|[https://avatars....|2015-03-01T00:00:00Z|2614896668| 

## 3.2.4 GroupBy user and count

In [6]:
# Count push events per user. 'actor.login' addresses the login field nested in the
# actor struct; the resulting columns are `login` and `count`.
df_grouped = df_push_type1.groupBy('actor.login').count()

In [24]:
# Preview up to 5 (login, count) rows; no ordering has been applied yet.
df_grouped.show(5)

+------------+-----+
|       login|count|
+------------+-----+
|john-griffin|    1|
|   digitized|    3|
| theCodeBear|    1|
|      WillHK|    1|
|  sakuya3834|    1|
+------------+-----+
only showing top 5 rows



### Sort/OrderBy

In [29]:
# Show the 5 users with the most pushes by sorting on `count` in descending order.
df_grouped.sort('count', ascending=False).show(5)

+------------------+-----+
|             login|count|
+------------------+-----+
|      greatfirebot|  192|
|diversify-exp-user|  146|
|     KenanSulayman|   72|
|        manuelrp07|   45|
|    mirror-updates|   42|
+------------------+-----+
only showing top 5 rows



In [31]:
# Same query written with orderBy; it produces the same top 5 as the sort call.
df_grouped.orderBy('count', ascending=False).show(5)

+------------------+-----+
|             login|count|
+------------------+-----+
|      greatfirebot|  192|
|diversify-exp-user|  146|
|     KenanSulayman|   72|
|        manuelrp07|   45|
|    mirror-updates|   42|
+------------------+-----+
only showing top 5 rows



In [15]:
# Keep the descending-by-count DataFrame; the employee filter later in the notebook reads it.
df_ordered = df_grouped.orderBy('count', ascending=False)

## 3.2.5 Excluding non-employees
## 3.2.6 Broadcast variables

In [1]:
# Load the employee logins (one per line) into a dict used purely as a membership
# lookup (login -> 1). The `with` block closes the file handle as soon as the dict
# is built instead of leaving it open for the lifetime of the kernel, and blank
# lines are skipped so an empty-string key never ends up in the lookup.
with open('dataIn/ghEmployees.txt') as employee_file:
    d_employee = {
        login: 1
        for login in (line.strip() for line in employee_file)
        if login
    }

In [2]:
# Inspect the loaded lookup dict (login -> 1).
d_employee

{'AiMadobe': 1,
 'Akkyie': 1,
 'BatMiles': 1,
 'Battleroid': 1,
 'BhawanVirk': 1,
 'BitKiwi': 1,
 'ChroniXEcho': 1,
 'DomT4': 1,
 'EmanueleMinotto': 1,
 'Gix075': 1,
 'GroovyCarrot': 1,
 'Halexsson': 1,
 'IrinaDmt': 1,
 'IsaacAU': 1,
 'JustScience': 1,
 'Juxnist': 1,
 'KenanSulayman': 1,
 'LeendersR': 1,
 'Lithium64': 1,
 'MichaelCTH': 1,
 'NathanNg': 1,
 'Pykee': 1,
 'Ramzawulf': 1,
 'Reddraft': 1,
 'RuiqingQiu': 1,
 'Somasis': 1,
 'StuntsPT': 1,
 'TheRingMaster': 1,
 'ToluOlayinka': 1,
 'Tookmund': 1,
 'Valicek1': 1,
 'WhiteHalmos': 1,
 'ZombieHippie': 1,
 'aclindsa': 1,
 'adamschwartz': 1,
 'ahsojar': 1,
 'albertn198': 1,
 'alexanderdidenko': 1,
 'allelos': 1,
 'andy-armstrong': 1,
 'aprilx2222': 1,
 'aquira246': 1,
 'aried3r': 1,
 'avilunin': 1,
 'barnardn': 1,
 'battlesnake': 1,
 'bcherny': 1,
 'bdiegel': 1,
 'bitemyapp': 1,
 'bjaiyen': 1,
 'bmcfluff': 1,
 'chapuni': 1,
 'cptmashek': 1,
 'craigem': 1,
 'danielrasmuson': 1,
 'danieltcv': 1,
 'dayanyrec': 1,
 'dcsan': 1,
 'dhirajbod

In [21]:
# Get the SparkContext behind the session; the broadcast variable is created through it.
spark_context = spark.sparkContext
# Wrap the employee dict in a broadcast variable; its contents are read back via `.value`.
bc_d_employee = spark_context.broadcast(d_employee)

In [22]:
# The broadcast wrapper exposes the wrapped employee dict through `.value`.
bc_d_employee.value

{'AiMadobe': 1,
 'Akkyie': 1,
 'BatMiles': 1,
 'Battleroid': 1,
 'BhawanVirk': 1,
 'BitKiwi': 1,
 'ChroniXEcho': 1,
 'DomT4': 1,
 'EmanueleMinotto': 1,
 'Gix075': 1,
 'GroovyCarrot': 1,
 'Halexsson': 1,
 'IrinaDmt': 1,
 'IsaacAU': 1,
 'JustScience': 1,
 'Juxnist': 1,
 'KenanSulayman': 1,
 'LeendersR': 1,
 'Lithium64': 1,
 'MichaelCTH': 1,
 'NathanNg': 1,
 'Pykee': 1,
 'Ramzawulf': 1,
 'Reddraft': 1,
 'RuiqingQiu': 1,
 'Somasis': 1,
 'StuntsPT': 1,
 'TheRingMaster': 1,
 'ToluOlayinka': 1,
 'Tookmund': 1,
 'Valicek1': 1,
 'WhiteHalmos': 1,
 'ZombieHippie': 1,
 'aclindsa': 1,
 'adamschwartz': 1,
 'ahsojar': 1,
 'albertn198': 1,
 'alexanderdidenko': 1,
 'allelos': 1,
 'andy-armstrong': 1,
 'aprilx2222': 1,
 'aquira246': 1,
 'aried3r': 1,
 'avilunin': 1,
 'barnardn': 1,
 'battlesnake': 1,
 'bcherny': 1,
 'bdiegel': 1,
 'bitemyapp': 1,
 'bjaiyen': 1,
 'bmcfluff': 1,
 'chapuni': 1,
 'cptmashek': 1,
 'craigem': 1,
 'danielrasmuson': 1,
 'danieltcv': 1,
 'dayanyrec': 1,
 'dcsan': 1,
 'dhirajbod

In [23]:
from pyspark.sql.types import BooleanType

# Register a boolean UDF callable from SQL expression strings. It reports whether a
# login is a key of the broadcast employee dict.
# NOTE: the registered name is spelled "isEmpolyeeUDF"; SQL strings that call it
# must use exactly this spelling.
spark.udf.register(
    "isEmpolyeeUDF",
    lambda login: login in bc_d_employee.value,
    returnType=BooleanType(),
)

In [24]:
# Keep only the rows whose login the registered UDF reports as an employee.
# The UDF is called by name from inside the SQL filter string.
df_filtered_employee = df_ordered.filter("isEmpolyeeUDF(login)")

In [25]:
# Preview up to 5 rows of the employee-only result.
df_filtered_employee.show(5)

+---------------+-----+
|          login|count|
+---------------+-----+
|  KenanSulayman|   72|
|     manuelrp07|   45|
|        Somasis|   26|
|direwolf-github|   24|
|EmanueleMinotto|   22|
+---------------+-----+
only showing top 5 rows



In [26]:
# Number of distinct employee logins that have at least one push event.
df_filtered_employee.count()

207

In [29]:
# Persist the employee push counts as JSON under dataOut/filtered_employee.
# The writer's default save mode raises an error when the target path already exists,
# which makes this cell fail on every re-run of the notebook. 'overwrite' replaces the
# previous output so Restart & Run All stays repeatable.
df_filtered_employee.write.mode('overwrite').json('dataOut/filtered_employee')