In [0]:
# 부정적인 리뷰에서 가장 자주 등장하는 단어는 무엇인가?
# What are the most frequently occurring words in negative reviews?

In [0]:
%pip install nltk

In [0]:
import nltk

In [0]:
# Download the VADER lexicon resource before the sentiment analyzer imported
# below is instantiated later in the notebook.
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, lit, when, udf, explode, lower
from pyspark.sql.types import IntegerType

In [0]:
# Get the active Spark session, or create one registered under this app name.
spark = (
    SparkSession.builder
    .appName("analysis_painpoint")
    .getOrCreate()
)

In [0]:
# Each source is restricted to reviews dated 2023 or later and projected onto
# shared review_* column names so the three frames can be unioned by name.

# App Store reviews (the only source selecting a `country` column).
appstore_df = (
    spark.table("workspace.growth_poc.silver_appstore_reviews")
    .where(year(col("updated")) >= lit(2023))
    .select(
        col("updated").alias("review_date"),
        col("rating").alias("review_rate"),
        col("content_translated").alias("review_content"),
        col("sentences").alias("review_sentences"),
        col("words").alias("review_words"),
        col("thumbsUpCount").alias("review_thumbsUpCount"),
        col("appName"),
        col("country"),
        col("language"),
    )
)

# Play Store reviews (`score` is the star rating here).
playstore_df = (
    spark.table("workspace.growth_poc.silver_playstore_reviews")
    .where(year(col("at")) >= lit(2023))
    .select(
        col("at").alias("review_date"),
        col("score").alias("review_rate"),
        col("content_translated").alias("review_content"),
        col("sentences").alias("review_sentences"),
        col("words").alias("review_words"),
        col("thumbsUpCount").alias("review_thumbsUpCount"),
        col("appName"),
        col("language"),
    )
)

# Reddit posts: no rating is selected, and `score` is mapped to the
# thumbs-up count column instead.
reddit_df = (
    spark.table("workspace.growth_poc.silver_reddit_reviews")
    .where(year(col("created_datetime")) >= lit(2023))
    .select(
        col("created_datetime").alias("review_date"),
        col("content_translated").alias("review_content"),
        col("sentences").alias("review_sentences"),
        col("words").alias("review_words"),
        col("score").alias("review_thumbsUpCount"),
        col("language"),
    )
)


In [0]:
def detect_negative_review(sentence):
    """Classify a review text as negative (1) or not negative (0).

    Uses the notebook-level `analyzer` object, which must be defined before
    this function is actually invoked.

    Args:
        sentence: Review text to score. Falsy values (None, empty string)
            are treated as not negative without calling the analyzer.

    Returns:
        1 when the analyzer's "compound" polarity score is below zero,
        otherwise 0.
    """
    if sentence:
        compound = analyzer.polarity_scores(sentence)["compound"]
        # int() keeps the result a plain integer rather than a bool.
        return int(compound < 0)
    # Nothing to score: None or empty string.
    return 0

In [0]:
# Stack the three review sources by column name; a column absent from a source
# (e.g. review_rate for Reddit) is tolerated via allowMissingColumns.
review_contents_df = appstore_df.unionByName(
    playstore_df, allowMissingColumns=True
).unionByName(reddit_df, allowMissingColumns=True)

# VADER sentiment analyzer used inside detect_negative_review
analyzer = SentimentIntensityAnalyzer()

# Flag negative reviews:
#   - no rating available -> let VADER decide from the review text
#   - rating of 3 or less -> negative
#   - any other rating    -> not negative
detect_negative_review_udf = udf(detect_negative_review, IntegerType())
is_negative_expr = (
    when(col("review_rate").isNull(), detect_negative_review_udf(col("review_content")))
    .when(col("review_rate") <= 3, 1)
    .otherwise(0)
)
mark_negative_reviews_df = review_contents_df.withColumn("is_negative", is_negative_expr)

# Keep only the rows flagged as negative
negative_df = mark_negative_reviews_df.where(col("is_negative") == 1)

# One output row per word of each negative review
words_exploded = negative_df.select(explode("review_words").alias("word"))

# Lowercase the words, count occurrences, and keep the 100 most frequent
lowercased_words = words_exploded.withColumn("word", lower(col("word")))
word_counts = (
    lowercased_words.groupBy("word")
    .count()
    .orderBy(col("count").desc())
    .limit(100)
)

display(word_counts)

In [0]:
# Number of reviews flagged as negative (rows where is_negative == 1).
print(negative_df.count())

I extracted the 100 most frequently occurring words from the 554 negative reviews and grouped them into three key problem areas, plus one observation that deserves special attention.<br/>
<ol>
<li><b>Foreigner-Specific Issues</b></li>
korean(138), english(33), foreigner(31), foreign(30), call(29)<br/>
This suggests that foreign users are likely to face language-related or systemic difficulties when using the apps.

<li><b>Delivery Service Quality</b></li>
time(111), service(51), restaurant(40), door(35), item(34), driver(28)<br/>
This points to problems with the user experience of the delivery service itself, such as delivery time, restaurant service, delivery problems, and communication with the driver.

<li><b>Payment & Verification</b></li>
number(76), card(75), phone(73), pay(54), money(43), account(41), id(38), cards(33), bank(28), payment(26)<br/>
This shows that users have difficulty completing orders because of payment or verification issues. I assume these problems are related to things like "foreign card" or "phone verification".

<li><b>Special Attention</b></li>
It is interesting that the name of a specific app, <b>coupang</b>, is mentioned frequently in the reviews. This may indicate that users consider Coupang an alternative to switch to when their current app fails to satisfy them.


</ol>

