In [1]:
import os
import subprocess
import datetime
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, count, from_unixtime, floor, date_format, hour
from pyspark.sql.types import *
from pyspark.sql.functions import sum as spark_sum

# Pandas display settings: print at most 100 rows, and place no limit on the
# number of columns or on the width of a column's text.
pd.options.display.max_rows = 100
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [2]:
# Enable Spark's REPL eager evaluation setting (presumably so a DataFrame left as
# the last expression of a cell is rendered without an explicit .show() - confirm).
# NOTE(review): `spark` is not created in this notebook; assumes the kernel
# provides a ready SparkSession under that name.
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [3]:
# Root GCS location of the project data; per-table folders (e.g. 'files') are
# appended to this prefix when the parquet datasets are read.
gcs_folder = 'gs://msca-bdp-data-open/final_project_git'

## Data Cleaning Functions

In [4]:
def null_count(df):
    """Print the number of NULL values in every top-level column of ``df``.

    One ``sum(isNull cast to int)`` aggregate is built per entry of
    ``df.columns`` and the single resulting row is rendered with
    ``show(truncate=False)``.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        None. The counts are printed by ``DataFrame.show``; nothing is handed
        back to the caller.
    """
    null_totals = []
    for name in df.columns:
        # Back-tick quote the name so a column literally called "a.b" is read
        # as one column instead of field "b" of a struct "a"; a back-tick
        # inside the name is escaped by doubling it.
        quoted = "`" + name.replace("`", "``") + "`"
        null_totals.append(
            F.sum(F.col(quoted).isNull().cast("int")).alias(name)
        )
    return df.select(null_totals).show(truncate=False)

In [5]:
def show_duplicates(df):
    """Print each distinct row of ``df`` that occurs more than once.

    Rows are grouped on every column listed in ``df.columns``; groups whose
    size is greater than one are displayed together with an extra
    ``duplicate_count`` column holding the group size.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        None. The result is printed via ``DataFrame.show(truncate=False)``.
    """
    # Back-tick quote every grouping key so that names containing dots are
    # treated as plain column names rather than struct-field paths; embedded
    # back-ticks are escaped by doubling them.
    group_keys = ["`" + name.replace("`", "``") + "`" for name in df.columns]
    occurrences = df.groupBy(*group_keys).agg(count("*").alias("duplicate_count"))
    occurrences.filter(col("duplicate_count") > 1).show(truncate=False)

# Files Data

In [7]:
# Load the `files` table from the project bucket. The location is a gs:// URI,
# which always uses "/" as its separator, so the path is assembled with an
# f-string instead of the OS-dependent os.path.join.
df_files = spark.read.parquet(f'{gcs_folder}/files')
# The label names the dataframe actually loaded here (it previously said
# *licenses*). count() is a Spark action, so this line launches a job.
print(f'Records read from dataframe *files*: {df_files.count():,}')
df_files.printSchema()



Records read from dataframe *licenses*: 2,309,424,945
root
 |-- repo_name: string (nullable = true)
 |-- ref: string (nullable = true)
 |-- path: string (nullable = true)
 |-- mode: long (nullable = true)
 |-- id: string (nullable = true)
 |-- symlink_target: string (nullable = true)



                                                                                

In [8]:
# Draw a seeded sample (fraction 0.01, without replacement) of the files data
# and preview its first five rows.
sample_df_files = df_files.sample(withReplacement=False, fraction=0.01, seed=42)
sample_df_files.show(n=5)

                                                                                

+--------------------+-----------------+--------------------+-----+--------------------+--------------+
|           repo_name|              ref|                path| mode|                  id|symlink_target|
+--------------------+-----------------+--------------------+-----+--------------------+--------------+
|         thx922/japi|refs/heads/master| html/img/yellow.png|33188|f2c08520ed9e9ac6a...|          NULL|
|         thx922/japi|refs/heads/master|   html/img/next.png|33188|3530eda75a1ed196e...|          NULL|
| tuxbox/matemonkey4j|refs/heads/master|matemonkey4j-api/...|33188|1eadeef6110f54782...|          NULL|
|walterpalladino/c...|refs/heads/master|ComicStripLiveWal...|33188|c37372acb9ce85afc...|          NULL|
|      wxylon/jeesite|refs/heads/master|src/main/webapp/s...|33188|513de77da7b5cb5a5...|          NULL|
+--------------------+-----------------+--------------------+-----+--------------------+--------------+
only showing top 5 rows



In [9]:
# Missing values: print the per-column NULL counts for the sampled files data.
null_count(sample_df_files)

[Stage 8:>                                                          (0 + 1) / 1]

+---------+---+----+----+---+--------------+
|repo_name|ref|path|mode|id |symlink_target|
+---------+---+----+----+---+--------------+
|0        |0  |0   |0   |0  |23037032      |
+---------+---+----+----+---+--------------+



                                                                                

In [13]:
# Drop symlink_target: in the NULL check on the sample it was the only column
# reporting missing values (about 23.0M NULLs); all other columns reported zero.
# The sampled frame is left untouched; the result gets its own name.
df_cleaned_files = sample_df_files.drop("symlink_target")

In [16]:
# Re-run the NULL check on the cleaned frame to confirm that no column with
# missing values remains after dropping symlink_target.
null_count(df_cleaned_files)



+---------+---+----+----+---+
|repo_name|ref|path|mode|id |
+---------+---+----+----+---+
|0        |0  |0   |0   |0  |
+---------+---+----+----+---+



                                                                                

In [14]:
# Duplicate values: list any fully identical rows (same value in every column)
# in the cleaned frame together with how many times each occurs.
show_duplicates(df_cleaned_files)



+---------+---+----+----+---+---------------+
|repo_name|ref|path|mode|id |duplicate_count|
+---------+---+----+----+---+---------------+
+---------+---+----+----+---+---------------+



                                                                                