In [14]:
from pyspark.sql import SparkSession, functions as F
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import col, trim, lower, regexp_replace
import os

In [3]:
# Filesystem locations of the Delta Lake zones referenced by this notebook.
# Landing zone: a relative path, so it resolves against the notebook's working directory.
path_landing = "../../../delta_lake/csv"
# NOTE(review): the two paths below are absolute (rooted at "/") whereas
# path_landing is relative — confirm this is intended and not a missing
# "../../.." prefix before anything is written to them.
path_creation = "/delta_lake/creation"
path_exploitation = "/delta_lake/exploitation"

In [5]:
# Session builder with the Delta Lake SQL extension and the Delta catalog
# registered as the default `spark_catalog`.
builder = (
    SparkSession.builder
    .appName("Delta Lake")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta helper finish configuring the builder (the startup log shows it
# adding the delta-spark package as a dependency), then start or reuse the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/22 22:02:00 WARN Utils: Your hostname, provira-ERAZER-P6705-MD61203, resolves to a loopback address: 127.0.1.1; using 192.168.1.55 instead (on interface wlo1)
25/06/22 22:02:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/provira/anaconda3/envs/py_tfm_env/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/provira/.ivy2.5.2/cache
The jars for the packages stored in: /home/provira/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6a109570-b15b-4526-a034-d40d154f02a3;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 230ms :: art

In [19]:
# Load the landing-zone table through the Delta reader and preview a few rows.
df_csv = (
    spark.read
    .format("delta")
    .load(path_landing)
)
df_csv.head(5)

[Row(_c0='519522', text='i do feel badly about is being so rude mean and over the top in doing so', Emotion='neutral'),
 Row(_c0='519523', text='i feel its a part of my passionate nature that makes me a valuable human being', Emotion='neutral'),
 Row(_c0='519524', text='i feel a little jaded a little grey a little less than i was', Emotion='neutral'),
 Row(_c0='519525', text='i feel like i m rich even though i probably only square up on the middle class rung of the ladder but the place is not that nice', Emotion='neutral'),
 Row(_c0='519526', text='i feel very appreciative thankful and grateful', Emotion='neutral')]

In [None]:
# Inspect the label column: list a sample of its distinct values, then compare
# the number of distinct values with the total number of rows.
emotion_column = df_csv.select("Emotion")

distinct_emotions = emotion_column.distinct()
distinct_emotions.show()

# Reuse the frames built above rather than re-deriving the same expressions.
count_distinct = distinct_emotions.count()
total = emotion_column.count()
print(f"Number of distinct Emotion labels: {count_distinct} / {total}")


+--------------------+
|             Emotion|
+--------------------+
|             boredom|
|                love|
|omg!!! loving thi...|
|why the nazis stu...|
| @user as forecas...|
|so simple, but so...|
|   #friday  xoxos...|
|#tgif   #ff to my...|
|@user don't forge...|
|loved that season...|
|@user pay of #ric...|
|#gameshow   bull ...|
|lack of access sp...|
|family 5k @user #...|
|@user fg introduc...|
|@user the library...|
|men's footjoy bla...|
| @user a great #m...|
|happy friday.   #...|
|badminton bareng ...|
+--------------------+
only showing top 20 rows
Number of distinct Emotion labels: 78800 / 1100992


In [29]:
# How many of the most frequent labels to display. The previous version of this
# cell was named/commented "top 10" but actually limited to 20 rows; the value
# 20 is kept so the displayed table is unchanged.
TOP_N_EMOTIONS = 20

# Number of rows per distinct Emotion value (NULL forms its own group).
emotion_counts = df_csv.groupBy("Emotion").count()

# Most frequent labels first, truncated to the configured number of rows.
top_emotions = emotion_counts.orderBy(col("count").desc()).limit(TOP_N_EMOTIONS)

# Backward-compatible alias for any later cell that still uses the old name.
top_10_emotions = top_emotions

top_emotions.show()

+--------------------+------+
|             Emotion| count|
+--------------------+------+
|             neutral|674538|
|                love| 39553|
|           happiness| 27175|
|                NULL| 18177|
|             sadness| 17491|
|              relief| 16729|
|                hate| 15267|
|               anger| 12356|
|                 fun| 10075|
|          enthusiasm|  9304|
|            surprise|  6954|
|               empty|  5542|
|               worry|  4475|
|           [deleted]|  3861|
|#model   i love u...|   319|
|             boredom|   126|
|              #NAME?|   123|
|        CakeDay--Bot|    96|
|i finally found a...|    82|
|aww yeah it's all...|    75|
+--------------------+------+



In [None]:
# Build the cleaned dataset from the raw landing table:
#   1. drop rows containing a null in any column,
#   2. rename `_c0` -> `id` and `Emotion` -> `emotion`,
#   3. normalise `text`: lowercase, repair the split contraction "i m" -> "i'm",
#      strip every character that is not a letter, digit, whitespace or apostrophe,
#   4. trim LAST, so spaces exposed by removing leading/trailing punctuation
#      are also removed (trimming first left them in place),
#   5. drop rows whose text is empty after cleaning (e.g. punctuation-only input).
#
# NOTE(review): the label-exploration outputs in this notebook show many values
# in `Emotion` that are not labels (tweet text, "[deleted]", "#NAME?", ...).
# This step does not filter them — TODO confirm that a later step restricts
# `emotion` to the valid label set.
df_csv_clean = (
    df_csv.dropna()
    .withColumnRenamed("_c0", "id")
    .withColumnRenamed("Emotion", "emotion")
    .withColumn("text", lower(col("text")))
    .withColumn("text", regexp_replace(col("text"), r"\bi m\b", "i'm"))
    .withColumn("text", regexp_replace(col("text"), r"[^a-zA-Z0-9\s']", ""))  # keep letters, digits, spaces, apostrophes
    .withColumn("text", trim(col("text")))
    .filter(col("text") != "")
)

df_csv_clean.head(5)

[Row(id='0', text='i seriously hate one subject to death but now i feel reluctant to drop it', Emotion='hate'),
 Row(id='1', text='im so full of life i feel appalled', Emotion='neutral'),
 Row(id='2', text='i sit here to write i start to dig out my feelings and i think that i am afraid to accept the possibility that he might not make it', Emotion='neutral'),
 Row(id='3', text='ive been really angry with r and i feel like an idiot for trusting him in the first place', Emotion='anger'),
 Row(id='4', text='i feel suspicious if there is no one outside like the rapture has happened or something', Emotion='neutral')]