# Sentiment Analysis on News Articles of Vendors

## Dealing with Semi-Structured Data stored in HDFS of SQL 2019

- In this notebook we show how to process, transform and prepare JSON file data so that it can be used for tasks such as ML model training. Because the underlying ML model was already developed in an external environment, we only consume it from within SQL 2019 BDC to obtain predictions.
- The model endpoint is hosted in an Azure Kubernetes Cluster (ACS) and serves both batch and live model inference.

In [None]:
from pyspark.sql import SparkSession
# Build (or reuse) the Hive-enabled Spark session used by this notebook.
# The settings are applied in order: executor sizing, YARN master in client
# deploy mode, then driver memory.
session_settings = [
    ("spark.executor.memory", "20g"),
    ("spark.executor.instances", "3"),
    ("spark.master", "yarn"),
    ("spark.submit.deployMode", "client"),
    ("spark.driver.memory", "30g"),
]
session_builder = SparkSession.builder.appName("Spark_Ingestion_Job")
for setting_name, setting_value in session_settings:
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.enableHiveSupport().getOrCreate()

###  Loading Data

To load local files into HDFS, use the "import files" option on the HDFS directory where the data should be stored; there is no programmatic way to do this from the notebook.

In [None]:
# Load the raw news file as an RDD of parsed JSON records.
# Each line of the text file is passed to json.loads on its own, so every
# line must hold one complete JSON document.
import json
# Take the SparkContext from the `spark` session instead of relying on an
# implicit `sc` global, which only exists when the kernel pre-creates one.
news_data_rdd = spark.sparkContext.textFile('/COE/news_data/contify_insights_new.json').map(json.loads)
# Peek at the first parsed record.
news_data_rdd.take(1)

In [None]:
# Number of parsed records in the RDD.
news_data_rdd.count()

In [None]:
from pyspark.sql.types import Row
import pyspark.sql.functions as sf 
import requests

def spliter(lines):
    """Flatten one parsed news record into a single flat dict.

    Args:
        lines: Mapping for one input record. ``results`` is expected to be a
            list of article dicts with ``id``, ``title`` and ``summary``
            entries; ``search_company`` is copied through unchanged.

    Returns:
        dict with the keys ``id``, ``title``, ``summary`` and
        ``search_company``. The three article fields are empty strings when
        the record has no results (missing key, ``None`` or empty list).

    Raises:
        KeyError: if ``search_company`` is missing, or the selected article
            lacks one of ``id``, ``title`` or ``summary``.
    """
    data = {'id': '', 'title': '', 'summary': ''}
    # A record without a 'results' key is handled like one with an empty list
    # instead of failing the whole Spark job with a KeyError.
    results = lines.get('results')
    if results:
        # One output row per record: when several articles are present only
        # the last one is kept, so read it directly.
        article = results[-1]
        for field in ('id', 'title', 'summary'):
            data[field] = article[field]
    data['search_company'] = lines['search_company']
    return data

def _record_to_row(record):
    """Turn one parsed record into a Row whose fields are the keys returned by spliter."""
    return Row(**spliter(record))


# One Row per input record; collect() brings every Row back to the driver for inspection.
rdd_df = news_data_rdd.map(_record_to_row)
rdd_df.collect()

In [None]:
# Convert the RDD of Rows into a Spark DataFrame.
df = rdd_df.toDF()
# printSchema is a method: without the call parentheses the cell only shows
# the bound-method repr and never prints the schema.
df.printSchema()

In [None]:
# Show the class of `df` (the object returned by rdd_df.toDF()).
type(df)

In [None]:
# Convert the PySpark DataFrame to a pandas DataFrame so the rows can be
# scored one by one against the model endpoint later in the notebook.
pd_df = df.toPandas()
# Optional clean-up, currently disabled: drop rows with an empty summary and
# rebuild a contiguous index.
# pd_df = pd_df[pd_df['summary'] != '']
# pd_df.reset_index(inplace = True)
# pd_df.drop('index', axis = 1, inplace = True)

In [None]:
# model scoring
def sentiment_scores(text_input, timeout=30):
    """Score one document with the hosted sentiment model.

    Args:
        text_input: Request body, sent unchanged with a JSON content type.
        timeout: Seconds to wait for the service before giving up, so one
            stalled request cannot hang the whole scoring loop.

    Returns:
        Tuple ``(polarity, scores)`` read from the ``_doc_polarity`` and
        ``scores`` keys of the JSON response.

    Raises:
        requests.HTTPError: if the service answers with an error status.
        requests.Timeout: if no answer arrives within ``timeout`` seconds.
        KeyError: if the response lacks ``_doc_polarity`` or ``scores``.
    """
    import os

    # SECURITY: a bearer token is hard-coded in this notebook and is sent over
    # plain HTTP. Rotate it and supply the new one through the ABSA_API_TOKEN
    # environment variable; the literal stays only as a fallback so existing
    # runs keep working.
    token = os.environ.get('ABSA_API_TOKEN', '1Q7d5p2SqViNlQbhe6gtHBAiZ5MB58rU')
    response = requests.post(
        "http://52.187.124.32:80/api/v1/service/absa-sentiment-predictor-v2/score",
        data=text_input,
        headers={'Content-Type': 'application/json', 'Authorization': 'Bearer ' + token},
        timeout=timeout,
    )
    # Fail with the HTTP status instead of an unrelated JSON/KeyError later on.
    response.raise_for_status()
    payload = response.json()
    return (payload['_doc_polarity'], payload['scores'])

# attach model results to dataframe
def model_scores(dataframe):
    """Score every row of ``dataframe`` and store the results on it in place.

    For each row the ``scoring_text`` value is UTF-8 encoded and passed to
    ``sentiment_scores``. The returned polarity goes into ``polarity``; the
    ``positive``, ``neutral`` and ``negative`` columns each receive the string
    form of a list that holds the matching score when it is truthy and is
    empty otherwise.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    def _score_as_text(all_scores, wanted_label):
        # Keep only the truthy value stored under the requested label.
        kept = [value for key, value in all_scores.items() if key == wanted_label and value]
        return str(kept)

    label_by_column = (
        ('positive', 'Positive'),
        ('neutral', 'Neutral'),
        ('negative', 'Negative'),
    )
    for row_label, record in dataframe.iterrows():
        request_body = record['scoring_text'].encode('utf-8')
        polarity, all_scores = sentiment_scores(request_body)
        dataframe.loc[row_label, 'polarity'] = polarity
        for column, wanted_label in label_by_column:
            dataframe.loc[row_label, column] = _score_as_text(all_scores, wanted_label)
    return dataframe

In [None]:
# text pre-processing
#
# Build each request body with json.dumps so that quotes, backslashes and
# newlines inside a summary or company name are escaped; plain string
# concatenation produced invalid JSON for such rows. The compact separators
# keep the exact {"news":"...","name":"..."} layout for ordinary text.
import json

pd_df['scoring_text'] = [
    json.dumps({"news": news, "name": name}, ensure_ascii=False, separators=(',', ':'))
    for news, name in zip(pd_df['summary'], pd_df['search_company'])
]

# application of model scoring
#
model_score_df = model_scores(pd_df)
model_score_df[['polarity', 'positive', 'negative', 'neutral']].head() # print top 5 results

In [None]:
# convert pandas dataframe to Pyspark dataframe
#
model_scores_spark_df = spark.createDataFrame(model_score_df)
print(type(model_scores_spark_df))
# printSchema() and show() write their output themselves and return None, so
# wrapping them in print() only added a stray "None" line after each.
model_scores_spark_df.printSchema()
model_scores_spark_df.show(5)

In [None]:
# Persist the scored Spark DataFrame to HDFS as CSV with a header row,
# replacing any output already present at the target path.
sentiment_output_path = '/COE/news_data/news_rdd/sentiment_scores.csv'
csv_writer = model_scores_spark_df.write.format('csv')
csv_writer = csv_writer.mode('overwrite').option('header', True)
csv_writer.save(sentiment_output_path)