In [1]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
# Route every pandas `.plot` call in this notebook through plotly.
pd.set_option("plotting.backend", "plotly")

In [2]:
# Obtain the Spark entry point for this notebook: getOrCreate() returns the
# already-active session if there is one, otherwise it starts a new session
# registered under the application name "YDN EDA".
spark = (
    SparkSession.builder
    .appName("YDN EDA")
    .getOrCreate()
)

In [3]:
# Load every parquet file matching the glob into a single Spark DataFrame.
df = spark.read.format("parquet").load("data/raw_data_*.parquet")

# Preview the first rows (defaults spelled out: 20 rows, long cells truncated).
df.show(n=20, truncate=True)

+--------------------+-------------------+--------------------+--------+--------------------+------------------------------+
|                 url|               date|               title|subtitle|             content|estimated_reading_time_minutes|
+--------------------+-------------------+--------------------+--------+--------------------+------------------------------+
|https://yaledaily...|2000-10-09 00:00:00|Women's soccer fa...|    NULL|This game feature...|                           4.0|
|https://yaledaily...|2000-10-09 00:00:00|Fellowship season...|    NULL|A few days after ...|                           2.0|
|https://yaledaily...|2000-10-09 00:00:00|Dartmouth corners...|    NULL|Heading into Hano...|                           4.0|
|https://yaledaily...|2000-10-09 00:00:00|Yale panel debate...|    NULL|What is the defin...|                           2.0|
|https://yaledaily...|2000-10-09 00:00:00|Who's the real cr...|    NULL|To the Editor:\nI...|                           1.0|


In [4]:
# Shape of the dataset as (rows, columns).
# count() runs a Spark job over the data; the column list comes from the schema.
n_rows = df.count()
n_cols = len(df.columns)
n_rows, n_cols

(79746, 6)

In [5]:
# Dataset-wide mean and maximum of the estimated reading time column.
reading_time = F.col("estimated_reading_time_minutes")
df.agg(
    F.avg(reading_time).alias("Average Estimated Reading Time"),
    F.max(reading_time).alias("Max Estimated Reading Time"),
).show()

+------------------------------+--------------------------+
|Average Estimated Reading Time|Max Estimated Reading Time|
+------------------------------+--------------------------+
|            3.6233225358630263|                      59.0|
+------------------------------+--------------------------+



In [6]:
# Largest value of the estimated reading time column, pulled to the driver
# as a plain Python scalar (a global aggregate always yields exactly one row).
max_reading_time = (
    df.agg(F.max("estimated_reading_time_minutes"))
    .first()[0]
)

In [7]:
# URL of an article whose estimated reading time equals the dataset maximum.
#
# Filter first, then project: the predicate column is not part of the
# "url"-only projection, so filtering after the select only works because
# Spark resolves the missing column behind the scenes.
#
# first() fetches a single matching row and returns None when nothing matches
# (e.g. max_reading_time is NULL), instead of collecting every tied row to the
# driver and raising IndexError on an empty result.
longest_read = (
    df.where(F.col("estimated_reading_time_minutes") == max_reading_time)
    .select("url")
    .first()
)
longest_read["url"] if longest_read is not None else None

'https://yaledailynews.com/blog/2024/04/22/live-police-begin-arresting-pro-divestment-protesters-on-beinecke-plaza/'

In [16]:
# Number of articles published per calendar year.
# The grouping and counting run in Spark; only the per-year summary
# (one row per year) is converted to a pandas DataFrame for plotting.
yearly_publications = (
    df.withColumn("year", F.year("date"))
    .groupBy("year")
    .count()
    .sort("year")
    .toPandas()
)
# With the plotly plotting backend, extra keyword arguments are forwarded to
# plotly.express.bar; `title` and `labels` let the figure stand on its own.
yearly_publications.plot.bar(
    x="year",
    y="count",
    title="Articles published per year",
    labels={"year": "Year", "count": "Number of articles"},
)

In [17]:
# Average estimated reading time per calendar year.
# The aggregation runs in Spark; only the per-year summary
# (one row per year) is converted to a pandas DataFrame for plotting.
yearly_avg_minutes = (
    df.withColumn("year", F.year("date"))
    .groupBy("year")
    .agg(F.avg("estimated_reading_time_minutes").alias("avg_minutes"))
    .sort("year")
    .toPandas()
)
# With the plotly plotting backend, extra keyword arguments are forwarded to
# plotly.express.bar; `title` and `labels` let the figure stand on its own.
yearly_avg_minutes.plot.bar(
    x="year",
    y="avg_minutes",
    title="Average estimated reading time per year",
    labels={"year": "Year", "avg_minutes": "Average reading time (minutes)"},
)

In [10]:
# Session teardown is currently disabled; uncomment the line below to stop
# the Spark session once the analysis is finished.
# spark.stop()