In [None]:
# Install PySpark into the running kernel's environment.
# TODO(review): pin an explicit version (e.g. `pyspark==<version>`) so a fresh
# run resolves the same release the saved outputs were produced with.
%pip install pyspark



In [14]:
# Initializing Spark
# Both branches leave a SparkSession bound to `spark`, the handle that every
# later cell relies on (`spark.read.json`, `spark.conf.set`).
from pyspark.sql import SparkSession

old_way = False
if old_way:
  # Legacy entry point: configure and obtain the SparkContext explicitly.
  from pyspark import SparkContext, SparkConf
  conf = SparkConf().setAppName("Archive_PySpark").setMaster("local[*]")
  # getOrCreate() reuses a live context, so re-running this cell does not
  # fail with "Cannot run multiple SparkContexts at once".
  sc = SparkContext.getOrCreate(conf=conf)
  # Wrap the context in a session; without this `spark` would be undefined
  # and the load cell below would raise NameError.
  spark = SparkSession.builder.config(conf=conf).getOrCreate()
else:
  # Start (or reuse) a Spark session
  spark = (
    SparkSession.builder
    .appName("ArXiv Analysis")
    .getOrCreate()
  )

In [15]:
# Read and Load Data to Spark
# Data source: https://www.kaggle.com/Cornell-University/arxiv/version/1
# https://www.kaggle.com/datasets/Cornell-University/arxiv/data

In [16]:
from pyspark.sql import DataFrame

In [17]:
# Environment switch: True when the notebook is executed on Google Colab.
using_GoogleColab = True
if not using_GoogleColab:
  # Outside Colab no Drive mount is attempted.
  pass
else:
  # Mount Google Drive so that its files become visible under /content/drive;
  # in the Colab file browser, right-click a file and copy its path to use it.
  from google.colab import drive
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Load the JSON file
# The path is absolute and rooted at the Drive mount point used above
# (/content/drive), so the read does not depend on the kernel's current
# working directory. Adjust it if the snapshot lives elsewhere.
data: DataFrame = spark.read.json(
    "/content/drive/MyDrive/Colab Notebooks/arxiv-metadata-oai-snapshot.json"
)

In [45]:
from pyspark.sql.functions import explode, col

.withColumn() - Returns a new DataFrame by adding a column or replacing the existing column that has the same name.

explode() - Returns a new row for each element in the given array or map. Uses the default column name `col` for elements in the array, and `key` and `value` for elements in the map, unless specified otherwise.

col() - Returns a pyspark.sql.Column based on the given column name

In [120]:
# Produce one row per entry of the `versions` array; the exploded entry is
# stored in a new `version` column so version 1 can be selected afterwards.
data_with_versions = data.withColumn(
    "version",
    explode("versions"),
)

In [121]:
# Preview the first five exploded rows.
data_with_versions.show(n=5)

+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+---------+--------------------+--------------------+----------------+--------------+--------------------+-----------+--------------------+--------------------+
|            abstract|             authors|      authors_parsed|    categories|            comments|                 doi|       id|         journal-ref|             license|       report-no|     submitter|               title|update_date|            versions|             version|
+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+---------+--------------------+--------------------+----------------+--------------+--------------------+-----------+--------------------+--------------------+
|  A fully differe...|C. Bal\'azs, E. L...|[[Balázs, C., ], ...|        hep-ph|37 pages, 15 figu...|10.1103/PhysRevD....|0704.0001|Phys.Rev.D76:0130...|     

In [132]:
# Keep only the rows whose exploded entry is labelled "v1".
is_first_version = col("version.version") == "v1"
version_1_data = data_with_versions.where(is_first_version)

In [133]:
# Preview the filtered rows (20 is the default row limit of show()).
version_1_data.show(n=20)

+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------------------+--------------------+
|            abstract|             authors|      authors_parsed|       categories|            comments|                 doi|       id|         journal-ref|             license|           report-no|         submitter|               title|update_date|            versions|             version|
+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------------------+--------------------+
|  A fully differe...|C. Bal\'azs, E. L...|[[Balázs, C., ], ...|           hep-ph|37 pages, 15 figu...|10.1103/PhysRevD....|

In [134]:
from pyspark.sql.functions import to_date, to_timestamp

to_date() - Converts a pyspark.sql.Column into pyspark.sql.types.DateType using the optionally specified format.

If the column has dates like Mon, 01 Jan 2023 12:34:56 +0000, the format string should be:

EEE, dd MMM yyyy HH:mm:ss Z

In [135]:
# Inspect the raw `created` strings untruncated to work out their date format.
version_1_data.select(col("version.created")).show(n=5, truncate=False)

+-----------------------------+
|created                      |
+-----------------------------+
|Mon, 2 Apr 2007 19:18:42 GMT |
|Sat, 31 Mar 2007 02:26:18 GMT|
|Sun, 1 Apr 2007 20:46:54 GMT |
|Sat, 31 Mar 2007 03:16:14 GMT|
|Mon, 2 Apr 2007 18:09:58 GMT |
+-----------------------------+
only showing top 5 rows



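The `created` values actually follow the pattern `EEE, d MMM yyyy HH:mm:ss z`:

- `d` is the day of the month without a leading zero (e.g. `2`, not `02`).
- `z` is a time-zone name such as `GMT`, whereas `Z` expects a numeric offset such as `+0000`.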

In [137]:
# Switch this session's datetime parsing to the LEGACY policy before the
# `to_timestamp` call that uses the pattern `EEE, d MMM yyyy HH:mm:ss z`.
# NOTE(review): per the Spark datetime-pattern docs, the day-of-week symbol `E`
# is formatting-only under the default parser — confirm that is why this is needed.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

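Not ideal, but it works: Spark 3's default datetime parser does not accept the day-of-week symbol `E` when parsing, so the legacy parser is enabled as a workaround.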

In [138]:
# Extract the submission date and convert it to a date format

# The sampled `created` strings are expressed in GMT. Pin the session time
# zone so the calendar date derived from the timestamp is the GMT date no
# matter which local zone the driver machine uses.
spark.conf.set("spark.sql.session.timeZone", "UTC")

# Parse the raw string with the pattern that matches the data
# (day of month without leading zero, zone given by name).
version_1_data = version_1_data.withColumn(
    "submission_timestamp",
    to_timestamp(col("version.created"), "EEE, d MMM yyyy HH:mm:ss z")
)

# Keep only the calendar date of the parsed timestamp
version_1_data = version_1_data.withColumn(
    "submission_date",
    to_date(col("submission_timestamp"))
)

# Verify the result
version_1_data.select("version.created", "submission_date").show(5, truncate=False)

+-----------------------------+---------------+
|created                      |submission_date|
+-----------------------------+---------------+
|Mon, 2 Apr 2007 19:18:42 GMT |2007-04-02     |
|Sat, 31 Mar 2007 02:26:18 GMT|2007-03-31     |
|Sun, 1 Apr 2007 20:46:54 GMT |2007-04-01     |
|Sat, 31 Mar 2007 03:16:14 GMT|2007-03-31     |
|Mon, 2 Apr 2007 18:09:58 GMT |2007-04-02     |
+-----------------------------+---------------+
only showing top 5 rows



In [139]:
# Preview five rows including the new submission_timestamp / submission_date columns.
version_1_data.show(n=5)

+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+---------+--------------------+--------------------+----------------+------------------+--------------------+-----------+--------------------+--------------------+--------------------+---------------+
|            abstract|             authors|      authors_parsed|     categories|            comments|                 doi|       id|         journal-ref|             license|       report-no|         submitter|               title|update_date|            versions|             version|submission_timestamp|submission_date|
+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+---------+--------------------+--------------------+----------------+------------------+--------------------+-----------+--------------------+--------------------+--------------------+---------------+
|  A fully differe...|C. Bal\'a

In [140]:
# Print the DataFrame's columns with their types and nullability as a tree.
version_1_data.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- version: string (nullable = true)
 |-- version: struct (nullable = true)
 |    |-- created: string (nullable = true)
 |    |-- version: string (nullable = true)
 |-- submission_timestamp: timestamp (nullable = true)
 |-- subm

In [141]:
from pyspark.sql.functions import regexp_extract

regexp_extract() - Extract a specific group matched by the Java regex regexp, from the specified string column. If the regex did not match, or the specified group did not match, an empty string is returned.

In [142]:
# Extract the page count from the comments (assuming comments contain "X pages")
from pyspark.sql.functions import when

# regexp_extract returns "" when the pattern does not match.
# NOTE(review): the pattern is case-sensitive and needs exactly one space and
# the plural form, so "1 page" or "12 Pages" are not picked up.
pages_text = regexp_extract(col("comments"), r"(\d+) pages", 1)

version_1_data = version_1_data.withColumn(
    "page_count",
    # Cast only non-empty matches. `when` without `otherwise` yields NULL for
    # the rest, which matches the non-ANSI result of casting "" and avoids the
    # cast error raised for "" when ANSI mode is enabled.
    when(pages_text != "", pages_text.cast("int")),
)

In [143]:
# Preview five rows including the new page_count column.
version_1_data.show(n=5)

+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+---------+--------------------+--------------------+----------------+------------------+--------------------+-----------+--------------------+--------------------+--------------------+---------------+----------+
|            abstract|             authors|      authors_parsed|     categories|            comments|                 doi|       id|         journal-ref|             license|       report-no|         submitter|               title|update_date|            versions|             version|submission_timestamp|submission_date|page_count|
+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+---------+--------------------+--------------------+----------------+------------------+--------------------+-----------+--------------------+--------------------+--------------------+---------------+----------

In [71]:
from pyspark.sql.functions import sum as _sum

In [144]:
# Total page count per submission date (one output row per distinct date).
total_pages = _sum(col("page_count")).alias("total_pages")
pages_per_day = version_1_data.groupBy("submission_date").agg(total_pages)

In [147]:
from pyspark.sql.functions import avg

avg() - Aggregate function: returns the average of the values in a group

In [145]:
# Average of the per-day totals, returned as a single-row DataFrame.
average_pages_per_day = pages_per_day.agg(
    avg("total_pages").alias("avg_pages_per_day")
)

In [146]:
# Display the computed average (20 is the default row limit of show()).
average_pages_per_day.show(n=20)

+------------------+
| avg_pages_per_day|
+------------------+
|1912.9052574427653|
+------------------+

