In [1]:
from pyspark.sql import SparkSession

# Build a Spark session in local mode ("local[*]") with the driver memory
# set to 24g; getOrCreate() hands back an already-running session if one
# exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Local PySpark Session")
    .config("spark.driver.memory", "24g")
    .master("local[*]")
    .getOrCreate()
)

# Printing the version doubles as a check that the session actually started.
print("Spark version:", spark.version)

Spark version: 3.5.3


In [None]:
# Print the configured driver and executor memory, one value per line.
# A key that was never set is reported as None.
spark_conf = spark.sparkContext.getConf()
for memory_key in ('spark.driver.memory', 'spark.executor.memory'):
    print(spark_conf.get(memory_key))

24g
None


In [None]:
# Dump every (key, value) pair held in the SparkContext configuration,
# one "key: value" line per setting.
conf_entries = spark.sparkContext.getConf().getAll()
for key, value in conf_entries:
    print(f"{key}: {value}")

spark.driver.memory: 24g
spark.driver.extraJavaOptions: -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false
spark.app.name: Local PySpark Session
spark.executor.id: driver
spark.driver.port: 51659

In [None]:
import os

# Load the entire video metadata dataset (a directory of parquet part files)
# into a single PySpark DataFrame.

# Single source of truth for the dataset location.
directory_str = 'YouNiverse/df_video_meta_parquet.gz'
# Bytes form of the same path, kept only so that any later cell that still
# refers to `directory` keeps working.
directory = os.fsencode(directory_str)

# os.listdir() returns names in arbitrary order; sorting makes the order in
# which the parts are unioned (and therefore the displayed rows) reproducible
# across runs and machines.
parquet_filenames = sorted(
    filename
    for filename in os.listdir(directory_str)
    if filename.endswith(".parquet")
)

# Fail with an explicit message rather than an IndexError further down when
# the directory holds no parquet files (wrong path, dataset not downloaded).
if not parquet_filenames:
    raise FileNotFoundError(f"No .parquet files found in {directory_str!r}")

# One DataFrame per part file.
dfs_spark_list = [
    spark.read.parquet(f'{directory_str}/{filename}')
    for filename in parquet_filenames
]

# Stack all parts into one DataFrame. union() matches columns by position
# and keeps duplicate rows; it is the same operation as the unionAll() alias.
df_spark_video_meta = dfs_spark_list[0]
for df in dfs_spark_list[1:]:
    df_spark_video_meta = df_spark_video_meta.union(df)

df_spark_video_meta.show()

+----------------+--------------------+--------------------+--------------------+-------------+-----------+--------+----------+--------------------+--------------------+-------------------+----------+-------------------+
|      categories|          channel_id|          crawl_date|         description|dislike_count| display_id|duration|like_count|                tags|               title|        upload_date|view_count|__null_dask_index__|
+----------------+--------------------+--------------------+--------------------+-------------+-----------+--------+----------+--------------------+--------------------+-------------------+----------+-------------------+
|Film & Animation|UCzWrhkg9eK5I8Bm3...|2019-10-31 20:19:...|Lego City Police ...|          1.0|SBqSc91Hn9g|    1159|       8.0|lego city,lego po...|Lego City Police ...|2016-09-28 00:00:00|    1057.0|                  0|
|Film & Animation|UCzWrhkg9eK5I8Bm3...|2019-10-31 20:19:...|Lego Marvel Super...|          1.0|UuugEl86ESY|    2681|

In [None]:
# Keep only the videos whose description contains both 'The Lion King' and
# 'Trailer' (substring match, case-sensitive as written).
description_col = df_spark_video_meta["description"]
description_has_movie = description_col.contains('The Lion King')
description_has_trailer = description_col.contains('Trailer')

df_spark_lion_king = df_spark_video_meta.where(description_has_movie & description_has_trailer)
df_spark_lion_king.show(truncate=True)

+----------------+--------------------+--------------------+--------------------+-------------+-----------+--------+----------+--------------------+--------------------+-------------------+----------+-------------------+
|      categories|          channel_id|          crawl_date|         description|dislike_count| display_id|duration|like_count|                tags|               title|        upload_date|view_count|__null_dask_index__|
+----------------+--------------------+--------------------+--------------------+-------------+-----------+--------+----------+--------------------+--------------------+-------------------+----------+-------------------+
|   Entertainment|UCzOrpRAPhzLet9lC...|2019-11-01 06:58:...|Jagapathi Babu EM...|          0.0|z8deNFzK8uU|     125|       8.0|Socialpost,social...|Jagapathi Babu EM...|2019-07-09 00:00:00|     462.0|             141951|
|Film & Animation|UCzNWVDZQ55bjq8uI...|2019-11-18 02:11:...|Bring the Magic o...|        142.0|UPSPj3kx3Cc|      63|

In [None]:
# Keep only the videos whose title contains both 'The Lion King' and
# 'Trailer' (substring match, case-sensitive as written).
title_col = df_spark_video_meta["title"]
title_has_movie = title_col.contains('The Lion King')
title_has_trailer = title_col.contains('Trailer')

df_spark_lion_king_title = df_spark_video_meta.where(title_has_movie & title_has_trailer)
df_spark_lion_king_title.show(truncate=True)

+----------------+--------------------+--------------------+--------------------+-------------+-----------+--------+----------+--------------------+--------------------+-------------------+----------+-------------------+
|      categories|          channel_id|          crawl_date|         description|dislike_count| display_id|duration|like_count|                tags|               title|        upload_date|view_count|__null_dask_index__|
+----------------+--------------------+--------------------+--------------------+-------------+-----------+--------+----------+--------------------+--------------------+-------------------+----------+-------------------+
|           Music|UCzWLloEwMWZYnUEr...|2019-11-19 14:22:...|DOWNLOAD FOR FREE...|          7.0|tqWrUc2ITcQ|      42|      83.0|piano,pianist,cov...|The Lion King Off...|2018-11-23 00:00:00|    4845.0|               8133|
| Travel & Events|UCzMPu2bszENwQQch...|2019-11-04 06:17:...|The Lion King 201...|          7.0|7lloquq84tQ|     160|

In [None]:
# Display up to 500 of the title matches; used here as a rough check that
# the session copes with materialising a larger result.
df_spark_lion_king_title.show(n=500)

+----------------+--------------------+--------------------+--------------------+-------------+-----------+--------+----------+--------------------+--------------------+-------------------+-----------+-------------------+
|      categories|          channel_id|          crawl_date|         description|dislike_count| display_id|duration|like_count|                tags|               title|        upload_date| view_count|__null_dask_index__|
+----------------+--------------------+--------------------+--------------------+-------------+-----------+--------+----------+--------------------+--------------------+-------------------+-----------+-------------------+
|           Music|UCzWLloEwMWZYnUEr...|2019-11-19 14:22:...|DOWNLOAD FOR FREE...|          7.0|tqWrUc2ITcQ|      42|      83.0|piano,pianist,cov...|The Lion King Off...|2018-11-23 00:00:00|     4845.0|               8133|
| Travel & Events|UCzMPu2bszENwQQch...|2019-11-04 06:17:...|The Lion King 201...|          7.0|7lloquq84tQ|     

In [None]:
# Display up to 500 of the description matches; used here as a rough check
# that the session copes with materialising a larger result.
df_spark_lion_king.show(n=500)

+----------------+--------------------+--------------------+--------------------+-------------+-----------+--------+----------+--------------------+-------------------------------+-------------------+-----------+-------------------+
|      categories|          channel_id|          crawl_date|         description|dislike_count| display_id|duration|like_count|                tags|                          title|        upload_date| view_count|__null_dask_index__|
+----------------+--------------------+--------------------+--------------------+-------------+-----------+--------+----------+--------------------+-------------------------------+-------------------+-----------+-------------------+
|   Entertainment|UCzOrpRAPhzLet9lC...|2019-11-01 06:58:...|Jagapathi Babu EM...|          0.0|z8deNFzK8uU|     125|       8.0|Socialpost,social...|           Jagapathi Babu EM...|2019-07-09 00:00:00|      462.0|             141951|
|Film & Animation|UCzNWVDZQ55bjq8uI...|2019-11-18 02:11:...|Bring th