In [1]:
from pyspark.sql.functions import udf,col,split,regexp_extract_all,regexp_extract,explode,size,countDistinct, when, max, min, count
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType


### Load dataset

In [2]:
# Create (or reuse) a local Spark session that uses every available core.
# The executor memory key was previously misspelled ("meomory"), so that value
# never reached the setting it was meant for; the key is now spelled correctly.
# NOTE(review): with master("local[*]") the executors presumably run inside the
# driver JVM, so spark.driver.memory is likely the setting that matters — confirm.
spark = (
    SparkSession.builder
    .appName("SurvivalDataAnalysis")
    .master("local[*]")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "40g")
    .getOrCreate()
)

# Alternative settings kept for reference (add inside the builder chain above):
#     .config("spark.executor.memory", "24g")
#     .config("spark.driver.maxResultSize", "16g")


24/01/28 21:46:07 WARN Utils: Your hostname, cssh-alpha resolves to a loopback address: 127.0.1.1; using 134.130.186.164 instead (on interface bond0)
24/01/28 21:46:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/28 21:46:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# spark.stop()

In [4]:
# Path of the CSV file that holds one snapshot record per line
input_folder_path = "/home/cxu/snapshot-revisions/snap_revisions.csv"

# Optional explicit schema, currently unused:
# custom_schema = StructType([
#     StructField("index", IntegerType(), True),
#     StructField("value", StringType(), True)
# ])

# Read the CSV with ":" as the field separator and the first line as the header.
# NOTE(review): the revision identifiers extracted later contain ":" themselves,
# so this presumably relies on the field being quoted in the file — confirm.
csv_reader = spark.read.options(sep=":", header=True)
snp_revisions = csv_reader.csv(input_folder_path)

# Inspect the resulting structure and the first 5 rows
snp_revisions.printSchema()
snp_revisions.show(5)

# Count the loaded rows and report the total
dataset_length = snp_revisions.count()
print(f"Length of the dataset: {dataset_length} rows")

root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
|e782789e19560aed3...|
|4d2dac5c607a2cacb...|
|bc1a0babc0c8485a5...|
|0fbaa115e7fe163ce...|
|6b8f724a4fbd9a2c7...|
+--------------------+
only showing top 5 rows





Length of the dataset: 19537 rows


                                                                                

In [5]:
# snp_revisions.take(3)

### New columns: snapshot, revisions, num_revisions

Rows with `num_revisions == 0` (snapshots without any revision) are then filtered out.


In [6]:
# Take the first 40 characters of each record as the snapshot identifier.
# Spark string positions are 1-based; a start of 0 is only tolerated and treated
# like 1, so the start is written as 1 to make the selected span explicit.
df = snp_revisions.withColumn('snapshot', col('value').substr(1, 40))

In [7]:
# Collect every revision referenced in the record: each match of
# "swh:1:rev:" followed by 40 alphanumeric characters contributes its
# 40-character capture group (group 1) to the resulting array column.
# The SQL form is used because regexp_extract (single match) is not enough here.
revisions_expr = F.expr(r"regexp_extract_all(value, 'swh:1:rev:([a-zA-Z0-9]{40})', 1)")
df = df.withColumn("revisions", revisions_expr)


In [8]:
# Record how many revisions were extracted for each snapshot
revision_count = F.size(F.col("revisions"))
df = df.withColumn("num_revisions", revision_count)

In [9]:
# df.take(3)

In [10]:
# Preview the frame with the derived snapshot, revisions and num_revisions columns
df.show()

+--------------------+--------------------+--------------------+-------------+
|               value|            snapshot|           revisions|num_revisions|
+--------------------+--------------------+--------------------+-------------+
|e782789e19560aed3...|e782789e19560aed3...|                  []|            0|
|4d2dac5c607a2cacb...|4d2dac5c607a2cacb...|[5b98a95ef18f8048...|            4|
|bc1a0babc0c8485a5...|bc1a0babc0c8485a5...|[fae76cf418675b17...|           18|
|0fbaa115e7fe163ce...|0fbaa115e7fe163ce...|[1a1f62c50a72b5e4...|            5|
|6b8f724a4fbd9a2c7...|6b8f724a4fbd9a2c7...|[9daca5d1037e3b95...|           32|
|da84a8b50ea49efd9...|da84a8b50ea49efd9...|[cfa46b2c34e024e8...|           25|
|a15e803497a7da06f...|a15e803497a7da06f...|[396890f255dd7840...|          121|
|20270dbf7bd07a326...|20270dbf7bd07a326...|[d409b8e161995423...|           21|
|74d2a6faf8aaa9b6a...|74d2a6faf8aaa9b6a...|[851df60ef12ab09b...|            6|
|826cd41fdd676d334...|826cd41fdd676d334...|         

In [11]:
# Keep only snapshots that reference at least one revision.
# "> 0" is used instead of "!= 0" because, with default settings, size() of a
# null array is -1: a record whose revisions could not be extracted would
# otherwise pass the filter despite having no revisions.
filtered_df = df.filter(col("num_revisions") > 0)

In [12]:
# Preview the rows that remain after the num_revisions filter
filtered_df.show()

+--------------------+--------------------+--------------------+-------------+
|               value|            snapshot|           revisions|num_revisions|
+--------------------+--------------------+--------------------+-------------+
|4d2dac5c607a2cacb...|4d2dac5c607a2cacb...|[5b98a95ef18f8048...|            4|
|bc1a0babc0c8485a5...|bc1a0babc0c8485a5...|[fae76cf418675b17...|           18|
|0fbaa115e7fe163ce...|0fbaa115e7fe163ce...|[1a1f62c50a72b5e4...|            5|
|6b8f724a4fbd9a2c7...|6b8f724a4fbd9a2c7...|[9daca5d1037e3b95...|           32|
|da84a8b50ea49efd9...|da84a8b50ea49efd9...|[cfa46b2c34e024e8...|           25|
|a15e803497a7da06f...|a15e803497a7da06f...|[396890f255dd7840...|          121|
|20270dbf7bd07a326...|20270dbf7bd07a326...|[d409b8e161995423...|           21|
|74d2a6faf8aaa9b6a...|74d2a6faf8aaa9b6a...|[851df60ef12ab09b...|            6|
|3612d51f656842079...|3612d51f656842079...|[c06ed62eef81c9d2...|            3|
|67503f4e72c5c3bf4...|67503f4e72c5c3bf4...|[3cd657c9

In [13]:
# Check the column names and types of the filtered frame
filtered_df.printSchema()

root
 |-- value: string (nullable = true)
 |-- snapshot: string (nullable = true)
 |-- revisions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- num_revisions: integer (nullable = false)



In [14]:
# Drop the raw record text and keep only the derived columns
kept_columns = ['snapshot', 'revisions', 'num_revisions']
final_df = filtered_df.select(*kept_columns)

In [15]:
# final_df.count()

In [16]:
# final_df.write.json('./snapshot_revisions/')

### Explode the revisions

In [17]:
# Turn each element of the revisions array into its own row, paired with its snapshot
revision_per_row = F.explode(F.col("revisions")).alias("revision")
exploded_df = final_df.select(F.col("snapshot"), revision_per_row)

In [18]:
# Preview the (snapshot, revision) pairs produced by the explode step
exploded_df.show()

+--------------------+--------------------+
|            snapshot|            revision|
+--------------------+--------------------+
|4d2dac5c607a2cacb...|5b98a95ef18f8048a...|
|4d2dac5c607a2cacb...|449231747dff1fdb7...|
|4d2dac5c607a2cacb...|201b68d571c4427bf...|
|4d2dac5c607a2cacb...|4ad4585798a150547...|
|bc1a0babc0c8485a5...|fae76cf418675b179...|
|bc1a0babc0c8485a5...|7d6752d0e35b8a01d...|
|bc1a0babc0c8485a5...|b5f2a4a7b7ab55cf1...|
|bc1a0babc0c8485a5...|ee845b8287f864d49...|
|bc1a0babc0c8485a5...|2ca137e860fff9b74...|
|bc1a0babc0c8485a5...|7b9d12e76d5c3d3f3...|
|bc1a0babc0c8485a5...|2be928b62fac9cbdb...|
|bc1a0babc0c8485a5...|4334fd1aa89f6ca17...|
|bc1a0babc0c8485a5...|2c2e363acc0cab4bc...|
|bc1a0babc0c8485a5...|d8420690b538ada42...|
|bc1a0babc0c8485a5...|72d3752d2086c455f...|
|bc1a0babc0c8485a5...|0caf0b9e05ee016a8...|
|bc1a0babc0c8485a5...|b1b35fdb894239395...|
|bc1a0babc0c8485a5...|e8a81a2af0c9667ed...|
|bc1a0babc0c8485a5...|d7d6d75a6c375465e...|
|bc1a0babc0c8485a5...|54c297b921

In [19]:
# Check the column names and types of the exploded frame
exploded_df.printSchema()

root
 |-- snapshot: string (nullable = true)
 |-- revision: string (nullable = true)



In [20]:
# df.count()

In [21]:
# Write the (snapshot, revision) pairs as JSON under ./snapshot_revisions_explored/.
# The default save mode raises an error when the target path already exists,
# which breaks a full re-run of the notebook; overwrite makes this cell
# repeatable. Any previous output at that path is replaced.
exploded_df.write.mode("overwrite").json('./snapshot_revisions_explored/')

                                                                                

In [22]:
# Shut down the Spark session once the output has been written
spark.stop()