## Reading Git Final Project

In [1]:
import os
import subprocess
import datetime
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql import functions as F
from pyspark.sql.types import *

# Relax pandas display limits: show up to 100 rows, and never truncate
# the number of columns or the text inside a cell.
pd.options.display.max_rows = 100
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [2]:
# Turn on Spark's REPL eager evaluation setting on the existing `spark` session
# (the session object is provided by the environment, not created in this notebook).
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [3]:
# Root GCS location of the project data; each dataset read below is a
# sub-folder of this path (languages, licenses, commits, contents, files).
gcs_folder = 'gs://msca-bdp-data-open/final_project_git'

#### Check data size in GCS

In [4]:
# Report the total size of the project folder in GCS using `gsutil du -s -h`.
#
# The command is passed as an argument list (no shell), so the folder path is
# never re-parsed by a shell. stderr is captured separately from stdout so that
# an error message is not printed as if it were a directory size.
cmd = ['gsutil', 'du', '-s', '-h', gcs_folder]

try:
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
except OSError as exc:
    # Raised when the executable cannot be started (e.g. gsutil is not on PATH).
    retval = None
    print(f'Could not run gsutil: {exc}')
else:
    retval = completed.returncode  # exit status of the finished child process
    if retval == 0:
        # splitlines() drops the trailing newline, so no blank line follows each print.
        for line in completed.stdout.splitlines():
            print(f'Total directory size: {line}')
    else:
        print(f'gsutil failed (exit code {retval}): {completed.stderr.strip()}')

Total directory size: 1.36 TiB     gs://msca-bdp-data-open/final_project_git



### Read Git data from GCS

#### Languages
Programming languages by repository as reported by GitHub's https://developer.github.com/v3/repos/#list-languages API

In [5]:
%%time   
    
df_languages = spark.read.parquet(os.path.join(gcs_folder, 'languages'))
print(f'Records read from dataframe *languages*: {df_languages.count():,.0f}')



Records read from dataframe *languages*: 3,325,634
CPU times: user 13.7 ms, sys: 556 µs, total: 14.3 ms
Wall time: 11.4 s


                                                                                

In [6]:
# Show column names, types and nullability; `language` is an array of
# {name, bytes} structs, i.e. one entry per language used in the repository.
df_languages.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- language: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- bytes: long (nullable = true)



In [18]:
# Preview the first 5 rows of the languages table.
df_languages.show(5)

+-------------------+------------+
|          repo_name|    language|
+-------------------+------------+
|  lemi136/puntovent|   [{C, 80}]|
|     taxigps/nctool| [{C, 4461}]|
|        ahy1/strbuf| [{C, 5573}]|
|nleiten/mod_rpaf-ng|[{C, 30330}]|
|kmcallister/alameda|[{C, 17077}]|
+-------------------+------------+
only showing top 5 rows



                                                                                

#### Licenses
Open source license SPDX code for each repository as detected by https://developer.github.com/v3/licenses/

In [7]:
%%time   
    
df_licenses = spark.read.parquet(os.path.join(gcs_folder, 'licenses'))
print(f'Records read from dataframe *licenses*: {df_licenses.count():,.0f}')



Records read from dataframe *licenses*: 3,325,634
CPU times: user 0 ns, sys: 6.39 ms, total: 6.39 ms
Wall time: 1.84 s


                                                                                

In [8]:
# Show column names, types and nullability of the licenses table.
df_licenses.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- license: string (nullable = true)



In [19]:
# Preview the first 5 rows of the licenses table.
df_licenses.show(5)



+--------------------+------------+
|           repo_name|     license|
+--------------------+------------+
|autarch/Dist-Zill...|artistic-2.0|
|thundergnat/Prime...|artistic-2.0|
|kusha-b-k/Turabia...|artistic-2.0|
|onlinepremiumoutl...|artistic-2.0|
|huangyuanlove/Lia...|artistic-2.0|
+--------------------+------------+
only showing top 5 rows



                                                                                

#### Commits
Unique Git commits from open source repositories on GitHub, pre-grouped by repositories they appear in.

In [9]:
%%time   
    
df_commits = spark.read.parquet(os.path.join(gcs_folder, 'commits'))
print(f'Records read from dataframe *commits*: {df_commits.count():,.0f}')



Records read from dataframe *commits*: 265,419,190
CPU times: user 350 ms, sys: 73.5 ms, total: 423 ms
Wall time: 1min 56s


                                                                                

In [10]:
# Show column names, types and nullability; `author` and `committer` are
# nested structs (name, email, time_sec, tz_offset, date).
df_commits.printSchema()

root
 |-- commit: string (nullable = true)
 |-- tree: string (nullable = true)
 |-- parent: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- author: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- time_sec: long (nullable = true)
 |    |-- tz_offset: long (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: long (nullable = true)
 |    |    |-- nanos: long (nullable = true)
 |-- committer: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- time_sec: long (nullable = true)
 |    |-- tz_offset: long (nullable = true)
 |    |-- date: struct (nullable = true)
 |    |    |-- seconds: long (nullable = true)
 |    |    |-- nanos: long (nullable = true)
 |-- subject: string (nullable = true)
 |-- message: string (nullable = true)
 |-- trailer: array (nullable = true)
 |    |-- element: struct (contains

In [20]:
# Preview the first 5 rows of the commits table.
# NOTE(review): the author/committer structs carry names and e-mail addresses,
# so this output may expose personal data — consider clearing it before sharing.
df_commits.show(5)

[Stage 28:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------+
|              commit|                tree|              parent|              author|           committer|             subject|             message|             trailer|difference|difference_truncated|           repo_name|encoding|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------+
|aa358905a1b12c6fa...|df3f8bf61bf1cb0df...|[ea230a45a0e97e4d...|{conda-forge-coor...|{conda-forge-coor...|Updated the qceng...|Updated the qceng...|                  []|        []|                true|[conda-forge/feed...|    NULL|
|5a6b6d6d29489f858...|ff89accb7e283ca88...|[4ee369feb64ee97d...|{Rob All

                                                                                

#### Contents
Unique file contents of text files under 1 MiB on the HEAD branch.  
Can be joined to `files` dataset using the id columns to identify the repository and file path.

In [11]:
%%time   
    
df_contents = spark.read.parquet(os.path.join(gcs_folder, 'contents'))
print(f'Records read from dataframe *commits*: {df_contents.count():,.0f}')



Records read from dataframe *commits*: 281,191,977
CPU times: user 461 ms, sys: 134 ms, total: 595 ms
Wall time: 2min 39s


                                                                                

In [12]:
# Show column names, types and nullability of the contents table.
df_contents.printSchema()

root
 |-- id: string (nullable = true)
 |-- size: long (nullable = true)
 |-- content: string (nullable = true)
 |-- binary: boolean (nullable = true)
 |-- copies: long (nullable = true)



In [21]:
# Preview the first 5 rows of the contents table.
df_contents.show(5)

[Stage 29:>                                                         (0 + 1) / 1]

+--------------------+-----+--------------------+------+------+
|                  id| size|             content|binary|copies|
+--------------------+-----+--------------------+------+------+
|d5b1049fdaa182fa5...| 1570|{"version":3,"sou...| false|   256|
|896830f9ea31efd6b...|18616|                NULL|  true|     1|
|bf1e2a8490344601c...|15580|                NULL|  true|     1|
|e5976431eba91aa73...| 3328|                NULL|  true|     1|
|311532e41682cab22...| 8970|                NULL|  true|     1|
+--------------------+-----+--------------------+------+------+
only showing top 5 rows



                                                                                

#### Files
File metadata for all files at HEAD.  
Join with `contents` dataset on id columns to search text.

In [13]:
%%time   
    
df_files = spark.read.parquet(os.path.join(gcs_folder, 'files'))
print(f'Records read from dataframe *files*: {df_files.count():,.0f}')



Records read from dataframe *files*: 2,309,424,945
CPU times: user 88.1 ms, sys: 26 ms, total: 114 ms
Wall time: 31.4 s


                                                                                

In [14]:
# Show column names, types and nullability of the files table.
df_files.printSchema()

root
 |-- repo_name: string (nullable = true)
 |-- ref: string (nullable = true)
 |-- path: string (nullable = true)
 |-- mode: long (nullable = true)
 |-- id: string (nullable = true)
 |-- symlink_target: string (nullable = true)



In [22]:
# Preview the first 5 rows of the files table.
df_files.show(5)

[Stage 30:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+-----+--------------------+--------------+
|           repo_name|              ref|                path| mode|                  id|symlink_target|
+--------------------+-----------------+--------------------+-----+--------------------+--------------+
|    enzbang/diouzhtu|refs/heads/master|gwiad_wiki_servic...|33261|49365044eed287691...|          NULL|
|TheMrNomis/Latex-...|refs/heads/master|             LFM.php|33261|ef8cb78feed7f2111...|          NULL|
|TheMrNomis/Latex-...|refs/heads/master|PHP/LatexFlavored...|33261|d989ce59652f57efa...|          NULL|
|    xurigan/uexJPush|refs/heads/master|EUExJPush/EUExJPu...|33261|85268b90caa19efa2...|          NULL|
|    xurigan/uexJPush|refs/heads/master|EUExJPush/uexJPus...|33261|e1623bb9d8dc7db60...|          NULL|
+--------------------+-----------------+--------------------+-----+--------------------+--------------+
only showing top 5 rows



                                                                                

## Data Cleaning

In [24]:
from pyspark.sql.functions import col, isnan, count

# Count the NON-null values in each column of df_commits. count(col) skips
# nulls, so a column whose figure is below the table's total row count
# contains nulls; the number of nulls is the difference between the two.
df_commits.select([count(col(c)).alias(c) for c in df_commits.columns]).show()



+---------+---------+---------+---------+---------+---------+---------+---------+----------+--------------------+---------+--------+
|   commit|     tree|   parent|   author|committer|  subject|  message|  trailer|difference|difference_truncated|repo_name|encoding|
+---------+---------+---------+---------+---------+---------+---------+---------+----------+--------------------+---------+--------+
|265419190|265419190|265419190|265419190|265419190|265419190|265419190|265419190| 265419190|              220943|265419190|  126182|
+---------+---------+---------+---------+---------+---------+---------+---------+----------+--------------------+---------+--------+



                                                                                

In [25]:
# For every loaded table, print one row giving the number of NULLs per column.
tables = {
    'Languages': df_languages,
    'Commits': df_commits,
    'Contents': df_contents,
    'Files': df_files,
    'Licenses': df_licenses,
}

for name, df in tables.items():
    print(f'--- Missing values in {name} ---')
    # isNull() -> boolean, cast to 0/1, then summed to give the null count per column.
    null_counts = [F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns]
    df.select(null_counts).show()

--- Missing values in Languages ---


                                                                                

+---------+--------+
|repo_name|language|
+---------+--------+
|        0|       0|
+---------+--------+

--- Missing values in Commits ---


                                                                                

+---+----+--------+------+------+
| id|size| content|binary|copies|
+---+----+--------+------+------+
|  0|   0|53037932|     0|     0|
+---+----+--------+------+------+

--- Missing values in Files ---


                                                                                

+---------+---+----+----+---+--------------+
|repo_name|ref|path|mode| id|symlink_target|
+---------+---+----+----+---+--------------+
|        0|  0|   0|   0|  0|    2304200645|
+---------+---+----+----+---+--------------+

--- Missing values in Licenses ---




+---------+-------+
|repo_name|license|
+---------+-------+
|        0|      0|
+---------+-------+



                                                                                

In [15]:
import datetime
import pytz

# Current wall-clock time in the US/Central timezone, formatted as
# "<weekday>, <day> <month name> <year> HH:MM:SS"; shown as the cell's output.
datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Tue, 11 March 2025 03:48:12'