In [1]:
import os
import subprocess
import datetime
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, count, from_unixtime, floor, date_format, hour
from pyspark.sql.types import *
from pyspark.sql.functions import sum as spark_sum

# Notebook-wide pandas display settings: show up to 100 rows, and never
# elide columns or truncate cell text when a pandas frame is rendered.
pd.set_option(
    'display.max_rows', 100,
    'display.max_columns', None,
    'display.max_colwidth', None,
)

In [2]:
# Enable Spark's REPL eager evaluation so that a DataFrame left as the last
# expression of a cell is rendered as a table without an explicit .show().
# NOTE(review): `spark` is not created in any visible cell -- presumably the
# session is injected by the notebook environment; confirm before running
# this notebook elsewhere.
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [3]:
# Base Google Cloud Storage location of the project data; individual datasets
# (e.g. `contents`) are read from sub-folders of this path.
gcs_folder = "gs://msca-bdp-data-open/final_project_git"

## Data Cleaning Functions

In [4]:
def null_count(df):
    """Print a one-row table holding the number of NULL values per column.

    For every column of ``df`` a ``sum(cast(isNull(column) as int))``
    aggregate is built and aliased back to the column's own name; the
    resulting single-row frame is displayed with ``show(truncate=False)``.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        The return value of ``show`` (``None`` in PySpark); the table itself
        is printed as a side effect.
    """
    per_column_nulls = []
    for name in df.columns:
        # 1 where the value is NULL, 0 otherwise -- summing yields the count.
        is_missing = F.col(name).isNull().cast("int")
        per_column_nulls.append(F.sum(is_missing).alias(name))

    summary = df.select(per_column_nulls)
    return summary.show(truncate=False)

In [5]:
def show_duplicates(df):
    """Print every row of ``df`` that occurs more than once.

    Rows are grouped on all columns of ``df``; each group is counted into a
    ``duplicate_count`` column and only groups with a count above 1 are
    displayed (untruncated). An empty table therefore means no row is
    duplicated across all columns.

    Args:
        df: Spark DataFrame to check.

    Returns:
        None. The result is only printed.
    """
    grouped_rows = df.groupBy(*df.columns)
    with_counts = grouped_rows.agg(count("*").alias("duplicate_count"))
    repeated_rows = with_counts.filter(col("duplicate_count") > 1)
    repeated_rows.show(truncate=False)

# Contents Data

In [6]:
# Load the `contents` dataset (parquet) from the project bucket.
# The location is a gs:// URI, not a local path, so it is joined with "/"
# explicitly: os.path.join uses the host OS separator (a backslash on
# Windows) and would produce an invalid object path there.
contents_path = f"{gcs_folder.rstrip('/')}/contents"
df_contents = spark.read.parquet(contents_path)

# count() is a Spark action, so this line launches a job over the dataset.
# The result is an integer; "," alone adds the thousands separators without
# the float conversion that ".0f" would force.
print(f'Records read from dataframe *contents*: {df_contents.count():,}')
df_contents.printSchema()

[Stage 3:>                                                          (0 + 1) / 1]

Records read from dataframe *contents*: 281,191,977
root
 |-- id: string (nullable = true)
 |-- size: long (nullable = true)
 |-- content: string (nullable = true)
 |-- binary: boolean (nullable = true)
 |-- copies: long (nullable = true)



                                                                                

In [7]:
# Explore a random sample of roughly 1% of `contents` instead of all rows;
# the seed is fixed so the same draw is requested on every run.
sample_df_contents = df_contents.sample(fraction=0.01, seed=42)

# Preview the first five sampled rows.
sample_df_contents.show(n=5)

                                                                                

+--------------------+------+-------+------+------+
|                  id|  size|content|binary|copies|
+--------------------+------+-------+------+------+
|aa596e6c20a3fff6c...|   462|   NULL|  true|     1|
|2340397bdc5e80fcf...|  5187|   NULL|  true|     1|
|79a32dd0782247f2a...| 58350|   NULL|  true|     1|
|a29adf479806a92b8...|502609|   NULL|  true|     1|
|431a6f4969e89dc26...| 25615|   NULL|  true|     1|
+--------------------+------+-------+------+------+
only showing top 5 rows



In [8]:
# Missing values: print the number of NULLs in each column of the sample.
null_count(sample_df_contents)

[Stage 7:>                                                          (0 + 1) / 1]

+---+----+-------+------+------+
|id |size|content|binary|copies|
+---+----+-------+------+------+
|0  |0   |530556 |0     |0     |
+---+----+-------+------+------+



                                                                                

In [12]:
# Keep only fully populated rows (a row is dropped if ANY column is NULL).
# In the null counts printed for the sample, `content` is the only column
# with missing values, so this removes the rows that have no stored content.
df_cleaned_contents = sample_df_contents.dropna(how="any")

In [13]:
# Check missing values again: after dropna() every column of the cleaned
# sample should report 0 NULLs.
null_count(df_cleaned_contents)



+---+----+-------+------+------+
|id |size|content|binary|copies|
+---+----+-------+------+------+
|0  |0   |0      |0     |0     |
+---+----+-------+------+------+



                                                                                

In [14]:
# Duplicate values: list any row of the cleaned sample that occurs more than
# once across all columns (an empty table means there are none).
show_duplicates(df_cleaned_contents)



+---+----+-------+------+------+---------------+
|id |size|content|binary|copies|duplicate_count|
+---+----+-------+------+------+---------------+
+---+----+-------+------+------+---------------+



                                                                                