**SENTIMENT ANALYSIS WITH INCREMENTAL LOADING**

In [10]:
# Read every column of the latest-news table from the lakehouse and show it.
source_query = "SELECT * FROM Bing_lake_house.us_2024_election_latest_news"
df = spark.sql(source_query)
display(df)

StatementMeta(, 532b9431-0e02-47f4-bdf8-85fd5f1c1482, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 8dfffa43-ccf9-4ea6-a5dd-09fa4df82520)

In [11]:
import synapse.ml.core
from synapse.ml.services import AnalyzeText

StatementMeta(, 532b9431-0e02-47f4-bdf8-85fd5f1c1482, 13, Finished, Available, Finished)

In [12]:
# Configure the AnalyzeText transformer: which column it reads, which kind of
# analysis it runs, and which columns receive the result and any error.
model = (
    AnalyzeText()
    # Column whose text is analysed.
    .setTextCol("description")
    # Kind of text analysis to run.
    .setKind("SentimentAnalysis")
    # Column that receives the analysis output.
    .setOutputCol("response")
    # Column that receives error details.
    .setErrorCol("error")
)



StatementMeta(, 532b9431-0e02-47f4-bdf8-85fd5f1c1482, 14, Finished, Available, Finished)

In [13]:
# Run the configured model over the news dataframe, then preview ten rows.
result = model.transform(df)

result_preview = result.limit(10)
display(result_preview)



StatementMeta(, 532b9431-0e02-47f4-bdf8-85fd5f1c1482, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 17ee12e5-3cd2-4f0e-a49b-585d038bf094)

In [14]:
# Copy the nested `response.documents.sentiment` field into a top-level
# "sentiment" column and preview seven rows.
from pyspark.sql.functions import col

sentiment_field = col("response.documents.sentiment")
sentiment_df = result.withColumn("sentiment", sentiment_field)

display(sentiment_df.limit(7))



StatementMeta(, 532b9431-0e02-47f4-bdf8-85fd5f1c1482, 16, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, de23ff71-30ac-4559-a520-96d083d29039)

In [22]:
# Remove the "error" and "response" columns, keeping the extracted
# "sentiment" column alongside the original article columns.
service_columns = ["error", "response"]
sentiment_df_final = sentiment_df.drop(*service_columns)

display(sentiment_df_final.limit(10))

StatementMeta(, 532b9431-0e02-47f4-bdf8-85fd5f1c1482, 24, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 4ee35471-8050-4e58-be5f-aedf52c8a379)

In [24]:
# Type 1 SCD incremental load.
#
# In a Type 1 SCD the incoming data overwrites the existing row in place, so
# no duplicate rows are created and no history is kept. Use it when the
# previous values never need to be recovered.
#
# First run : the Delta table does not exist yet, so it is created from
#             sentiment_df_final.
# Later runs: rows are merged on `link`; a matched row is updated only when
#             one of the tracked columns changed, and unmatched rows are
#             inserted.

table_name = 'Bing_lake_house.us_2024_election_sentiment_analysis'

# Columns compared to decide whether a matched row needs updating.
# `link` is left out on purpose: it is the join key, so it is always equal
# on a matched row. `sentiment` is included so that a changed sentiment is
# written even when the article fields themselves are unchanged.
tracked_columns = [
    "title",
    "description",
    "image",
    "datePublished",
    "provider",
    "published_date",
    "published_time",
    "sentiment",
]

# `<=>` is Spark SQL's null-safe equality. A plain `<>` evaluates to NULL when
# either side is NULL, which would silently skip rows whose value changed to
# or from NULL.
change_condition = " OR ".join(
    f"NOT (source_view.{column} <=> target_table.{column})"
    for column in tracked_columns
)

# Explicit existence check instead of catching AnalysisException, so that an
# unrelated analysis error on the first write is not mistaken for "table
# already exists".
# NOTE(review): passing a database-qualified name to tableExists relies on
# Spark 3.4+ behaviour -- confirm the runtime version.
if not spark.catalog.tableExists(table_name):

    sentiment_df_final.write.format("delta").saveAsTable(table_name)

else:

    print ("Table Already Exist")

    # NOTE(review): Delta rejects a MERGE in which several source rows match
    # the same target row -- confirm `link` is unique in sentiment_df_final.
    sentiment_df_final.createOrReplaceTempView("vw_sentiment_df_final")

    spark.sql(f"""  MERGE INTO {table_name} target_table
                    USING vw_sentiment_df_final source_view

                    ON source_view.link = target_table.link

                    WHEN MATCHED AND ({change_condition})

                    THEN UPDATE SET *

                    WHEN NOT MATCHED THEN INSERT *

                """)

StatementMeta(, 532b9431-0e02-47f4-bdf8-85fd5f1c1482, 26, Finished, Available, Finished)

Table Already Exist
