<h1>Pain Point Analysis</h1>

In [0]:
%pip install nltk

In [0]:
import nltk

In [0]:
# Fetch the VADER lexicon resource that NLTK's SentimentIntensityAnalyzer relies on.
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, lit, when, udf, explode, lower, count, size, isnan, sum
from pyspark.sql.types import IntegerType, ArrayType, StringType

In [0]:
# Obtain the Spark session for this notebook (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("analysis_painpoint")
    .getOrCreate()
)

In [0]:
# Reddit threads dropped from the Reddit review set by the `url` filter below.
# NOTE(review): presumably these are off-topic threads — confirm the exclusion criteria.
reddit_url_to_exclude = [
    "https://www.reddit.com/r/korea/comments/1it9gty/exclusive_being_taken_prisoner_is_treason_in/",
    "https://www.reddit.com/r/korea/comments/11a53p7/colonial_police_warned_residents_about_police/",
    "https://www.reddit.com/r/korea/comments/13wkuv7/this_is_how_my_ukrainian_neighbor_responded_to/",
    "https://www.reddit.com/r/korea/comments/w6phlt/colonial_authorities_discussed_how_to_reduce/",
    "https://www.reddit.com/r/korea/comments/15xk99n/autopsy_identifies_strangulation_as_preliminary/",
    "https://www.reddit.com/r/korea/comments/6afzls/my_complicated_visa_issue_with_seemingly_no/",
]

In [0]:
# App Store reviews updated in 2023 or later, renamed to the review_* column
# names shared by all three sources so they can be unioned by name.
appstore_df = (
    spark.table("workspace.growth_poc.silver_appstore_reviews")
    .where(year(col("updated")) >= 2023)
    .select(
        col("updated").alias("review_date"),
        col("rating").alias("review_rate"),
        col("content_translated").alias("review_content"),
        col("sentences").alias("review_sentences"),
        col("words").alias("review_words"),
        col("thumbsUpCount").alias("review_thumbsUpCount"),
        col("appName"),
        col("country"),
        col("language"),
    )
)
# Play Store reviews posted in 2023 or later, renamed to the shared review_*
# column names. This source selects no `country` column.
playstore_df = (
    spark.table("workspace.growth_poc.silver_playstore_reviews")
    .where(year(col("at")) >= 2023)
    .select(
        col("at").alias("review_date"),
        col("score").alias("review_rate"),
        col("content_translated").alias("review_content"),
        col("sentences").alias("review_sentences"),
        col("words").alias("review_words"),
        col("thumbsUpCount").alias("review_thumbsUpCount"),
        col("appName"),
        col("language"),
    )
)
# Reddit posts created in 2023 or later, minus the excluded threads.
# The post `score` stands in for the thumbs-up count; there is no rating,
# app name, or country column for this source.
reddit_df = (
    spark.table("workspace.growth_poc.silver_reddit_reviews")
    .where(year(col("created_datetime")) >= 2023)
    .where(~(col("url").isin(reddit_url_to_exclude)))
    .select(
        col("created_datetime").alias("review_date"),
        col("content_translated").alias("review_content"),
        col("sentences").alias("review_sentences"),
        col("words").alias("review_words"),
        col("score").alias("review_thumbsUpCount"),
        col("language"),
    )
)
                    
# Stack the three sources by column name; columns a source did not select
# are filled with nulls (allowMissingColumns=True).
review_contents_df = (
    appstore_df
    .unionByName(playstore_df, allowMissingColumns=True)
    .unionByName(reddit_df, allowMissingColumns=True)
)


In [0]:
# Register the combined reviews as a temp view and re-read them through Spark SQL.
# NOTE(review): a temp view only gives the query plan a name; it does not cache or
# materialize rows, so on its own it is unlikely to speed up later steps — confirm intent.
review_contents_df.createOrReplaceTempView("review_contents_temp_view")
review_contents_df = spark.sql("SELECT * FROM review_contents_temp_view")

In [0]:
# Row counts per source, one per line: App Store, Play Store, Reddit.
print(
    appstore_df.count(),
    playstore_df.count(),
    reddit_df.count(),
    sep="\n",
)

<h2>Check Data</h2>

In [0]:
# Quick sanity checks on the combined review DataFrame.

# 1. check schema
review_contents_df.printSchema()
print()
# 2. check column names
print(review_contents_df.columns)
print()
# 3. check data types
print(review_contents_df.dtypes)
print()
# 4. check number of rows
# A bare expression in the middle of a cell is not displayed, so the count
# has to be printed explicitly to show up in the output.
print(review_contents_df.count())
print()
# 5. check summary statistics
review_contents_df.describe().show()

# 6. check missing values
# Only columns whose dtype is double, float, or bigint are inspected.
# NOTE(review): 'int' and decimal columns are not covered by this list — confirm that is intended.
numeric_cols = [name for name, dtype in review_contents_df.dtypes if dtype in ('double', 'float', 'bigint')]
# For each numeric column, count rows that are NULL or NaN.
for c in numeric_cols:
    null_count = review_contents_df.select(count(when(col(c).isNull() | isnan(c), c))).collect()[0][0]
    print(f"{c}: {null_count} nulls")



<h2>1. What are the most frequently occurring words in negative reviews?</h2>

<h3>1-1. Simple Frequency Analysis</h3>

<h4>1-1-1. Single Keyword Frequency Analysis </h4>

In [0]:
# Keep only the columns the keyword analysis needs.
keyword_analysis_df = review_contents_df.select(
    "review_rate",
    "review_content",
    "review_words",
    "review_thumbsUpCount",
)

In [0]:
def detect_negative_review(sentence):
    """Flag a piece of text as negative using the VADER compound score.

    Relies on a module-level ``analyzer`` (a SentimentIntensityAnalyzer)
    being defined before the function is called.

    Args:
        sentence: Review text; may be None or an empty string.

    Returns:
        1 if the compound polarity score is below 0, otherwise 0.
        None and empty text are treated as not negative (0).
    """
    if sentence:
        compound = analyzer.polarity_scores(sentence)["compound"]
        return int(compound < 0)
    # None or empty string: nothing to score.
    return 0

In [0]:
# VADER sentiment scorer; detect_negative_review reads this module-level name.
analyzer = SentimentIntensityAnalyzer()

# Wrap the Python scorer so Spark can apply it to a string column.
detect_negative_review_udf = udf(detect_negative_review, IntegerType())

# Labelling rule for is_negative:
#   - rating missing -> VADER verdict on the review text
#   - rating <= 2    -> negative (1)
#   - anything else  -> not negative (0)
mark_negative_reviews_df = keyword_analysis_df.withColumn(
    "is_negative",
    when(col("review_rate").isNull(), detect_negative_review_udf(col("review_content")))
    .when(col("review_rate") <= 2, 1)
    .otherwise(0),
)

# Negative reviews only.
negative_df = mark_negative_reviews_df.where(col("is_negative") == 1)

# One row per word occurrence, carrying the review's thumbs-up count along.
words_exploded = negative_df.select(
    explode(col("review_words")).alias("word"),
    col("review_thumbsUpCount"),
)

# Case-insensitive word frequency, top 100.
word_counts = (
    words_exploded
    .withColumn("word", lower(col("word")))
    .groupBy("word")
    .count()
    .orderBy(col("count").desc())
    .limit(100)
)

word_counts.show(100, truncate=False)


| word      | count |
|-----------|-------|
| delivery  | 161   |
| order     | 153   |
| food      | 147   |
| korea     | 139   |
| korean    | 134   |
| get       | 132   |
| app       | 130   |
| time      | 128   |
| card      | 101   |
| one       | 91    |
| even      | 90    |
| like      | 87    |
| use       | 80    |
| number    | 80    |
| thing     | 72    |
| phone     | 71    |
| go        | 68    |
| restaurant| 67    |
| work      | 61    |
| foreigner | 60    |
| service   | 60    |
| also      | 59    |
| back      | 57    |
| make      | 57    |
| need      | 56    |
| know      | 56    |
| people    | 54    |
| place     | 54    |
| problem   | 53    |
| pay       | 52    |
| country   | 52    |
| review    | 51    |
| item      | 51    |
| driver    | 49    |
| want      | 49    |
| really    | 48    |
| way       | 47    |
| day       | 47    |
| never     | 44    |
| every     | 44    |
| take      | 44    |
| account   | 44    |
| still     | 41    |
| something | 40    |
| got       | 40    |
| think     | 39    |
| come      | 38    |
| door      | 37    |
| money     | 37    |
| going     | 37    |
| bad       | 36    |
| lot       | 35    |
| used      | 35    |
| without   | 34    |
| good      | 34    |
| id        | 33    |
| coupang   | 33    |
| try       | 33    |
| english   | 32    |
| apps      | 32    |
| much      | 32    |
| option    | 31    |
| year      | 31    |
| call      | 31    |
| ordered   | 31    |
| live      | 31    |
| customer  | 30    |
| see       | 30    |
| bank      | 30    |
| issue     | 30    |
| many      | 29    |
| say       | 29    |
| u         | 29    |
| worst     | 28    |
| ever      | 28    |
| payment   | 28    |
| said      | 28    |
| using     | 27    |
| since     | 27    |
| tourist   | 27    |
| foreign   | 27    |
| sure      | 27    |
| hour      | 26    |
| first     | 26    |
| may       | 26    |
| delivered | 25    |
| able      | 25    |
| always    | 25    |
| could     | 25    |
| someone   | 24    |
| leave     | 24    |
| new       | 24    |
| tip       | 24    |
| tried     | 23    |
| wont      | 23    |
| living    | 23    |
| find      | 23    |
| wrong     | 23    |
| minute    | 23    |
| hard      | 23    |


In [0]:
# Total number of reviews labelled negative (rows where is_negative == 1).
print(negative_df.count())

I extracted the 100 most frequently occurring words from the 520 negative reviews and grouped them into three key problem areas, plus one term that needs special attention.<br/>
<ol>
<li><b>Foreigner-Specific Issues</b></li>
korean(134), english(32), foreigner(60), foreign(27), call(31), tourist(27)<br/>
This suggests that foreigners are likely to face language-related or systemic difficulties when using the apps.

<li><b>Delivery Service Quality</b></li>
time(128), service(60), restaurant(67), door(37), item(51), driver(49), option(31), customer(30)<br/>
This points to user-experience problems when using the apps, such as delivery time, restaurant service, delivery issues, and communication with drivers.

<li><b>Payment & Verification</b></li>
card(101), use(80), number(80), phone(71), pay(52), account(44), money(37), id(33), bank(30), payment(28)<br/>
This shows that users have difficulty completing orders because of payment or verification issues. I assume these problems relate to things like "foreign card" or "phone verification".

<li><b>Special Attention</b></li>
The mention of the specific name "coupang (33)" is a unique point. Through N-gram analysis, it can be determined whether this refers to the Coupang company itself or to the Coupang Eats app.
</ol>


<h4>1-1-2. N-gram Analysis</h4>

In [0]:
def create_bigrams_from_list(words):
    """Build space-joined bigrams from consecutive words.

    Example: ['I', 'EAT', 'BANANA'] -> ['I EAT', 'EAT BANANA']

    Args:
        words: Sequence of word strings; may be None or empty.

    Returns:
        A list of strings, one per adjacent word pair. Empty when there are
        fewer than two words.
    """
    if not words or len(words) < 2:
        return []
    # Slide a window of width 2 over the words; the last start index is len - 2.
    return [" ".join(words[start:start + 2]) for start in range(len(words) - 1)]

def create_trigrams_from_list(words):
    """Build space-joined trigrams from consecutive words.

    Example: ['a', 'b', 'c', 'd'] -> ['a b c', 'b c d']

    Args:
        words: Sequence of word strings; may be None or empty.

    Returns:
        A list of strings, one per run of three adjacent words. Empty when
        there are fewer than three words.
    """
    if not words or len(words) < 3:
        return []
    # Slide a window of width 3 over the words; the last start index is len - 3.
    return [" ".join(words[start:start + 3]) for start in range(len(words) - 2)]

 # register the function as udf 
# Both UDFs are declared to return array<string>: one space-joined n-gram per element.
create_bigrams_udf = udf(create_bigrams_from_list, ArrayType(StringType()))
create_trigrams_udf = udf(create_trigrams_from_list, ArrayType(StringType()))

In [0]:
# Bigram list for each negative review, kept next to the review's thumbs-up count.
bigrams_df = (
    negative_df
    .withColumn("keywords_paired", create_bigrams_udf(col("review_words")))
    .select("keywords_paired", "review_thumbsUpCount")
)

In [0]:
# One row per bigram occurrence, then count occurrences and sort by frequency.
bigrams_flat_df = (
    bigrams_df
    .select(explode(col("keywords_paired")).alias("keywords"))
    .groupBy("keywords")
    .count()
    .orderBy(col("count").desc())
)

bigrams_flat_df.show(50, truncate=False)

**Bigram Analysis Results**
| keywords        | count |
|-----------------|-------|
| phone number    | 43    |
| food delivery   | 24    |
| customer service| 19    |
| credit card     | 18    |
| korean phone    | 18    |
| delivery driver | 13    |
| delivery service| 11    |
| order delivery  | 11    |
| food delivered  | 11    |
| coupang eats    | 11    |
| use app         | 10    |
| delivery apps   | 9     |
| cancel order    | 9     |
| foreign card    | 9     |
| first time      | 9     |
| bank account    | 9     |
| apple pay       | 8     |
| delivery app    | 8     |
| korean bank     | 8     |
| feel like       | 8     |
| gon na          | 8     |
| app ever        | 7     |
| even though     | 7     |
| every time      | 7     |
| bank card       | 7     |
| every country   | 7     |
| money back      | 6     |
| uber eats       | 6     |
| order food      | 6     |
| worst app       | 6     |
| go back         | 6     |
| front door      | 6     |
| delivery guy    | 6     |
| delivery time   | 6     |
| need korean     | 6     |
| place live      | 6     |
| thing like      | 5     |
| make sure       | 5     |
| get money       | 5     |
| negative review | 5     |
| app even        | 5     |
| new one         | 5     |
| payment card    | 5     |
| order something | 5     |
| without korean  | 5     |
| thing korea     | 5     |
| waste time      | 5     |
| hard time       | 5     |
| able use        | 5     |
| tmoney card     | 5     |




In [0]:
# Trigram list for each negative review, kept next to the review's thumbs-up count.
trigrams_df = (
    negative_df
    .withColumn("keywords_paired", create_trigrams_udf(col("review_words")))
    .select("keywords_paired", "review_thumbsUpCount")
)

In [0]:
# One row per trigram occurrence, then count occurrences and sort by frequency.
trigrams_flat_df = (
    trigrams_df
    .select(explode(col("keywords_paired")).alias("keywords"))
    .groupBy("keywords")
    .count()
    .orderBy(col("count").desc())
)

trigrams_flat_df.show(20, truncate=False)

**Trigram Analysis Result**
| keywords                | count |
|--------------------------|-------|
| korean phone number      | 17    |
| worst app ever           | 5     |
| need phone number        | 5     |
| food delivery service    | 4     |
| foreign credit card      | 4     |
| need korean phone        | 4     |
| get money back           | 3     |
| korean bank account      | 3     |
| alien registration card  | 3     |
| arc alien registration   | 3     |
| english eye english      | 2     |
| phone number set         | 2     |
| use non korean           | 2     |
| without korea phone      | 2     |
| foreign card work        | 2     |
| support apple pay        | 2     |
| credit card accepted     | 2     |
| contact customer service | 2     |
| eye english eye          | 2     |
| food discarded even      | 2     |




By applying N-gram analysis, I was able to better capture the context of word usage, which single keyword analysis alone could not fully reveal.

<ol>
<li><b>Dominant Issue: Foreigner Verification & Payment</b></li>
Key Words:
<ul>
<li>Bigram: phone number (43), credit card (18), korean phone (18), bank account (9), korean bank (8), apple pay (8), foreign card (9), payment card (5), without korean (5), tmoney card (5)</li>
<li>Trigram: korean phone number (17), need phone number (5), foreign credit card (4), need korean phone (4), korean bank account (3), alien registration card (3), arc alien registration (3), phone number set (2), without korea phone (2), foreign card work (2), support apple pay (2), credit card accepted (2)</li>
</ul>
Quantitative evidence shows that the biggest difficulty for users is the verification process requiring a Korean phone number. Payment failures caused by the lack of support for foreign credit cards, or by not having a Korean bank account, also emerge as a clear issue.

<li><b>Service Quality Issues</b></li>
Key Words:
<ul>
<li>Bigram: customer service (19), delivery driver (13), delivery service (11), food delivery (24), order delivery (11), food delivered (11), delivery apps (9), cancel order (9), delivery app (8), delivery guy (6), delivery time (6)</li>
<li>Trigram: worst app ever (5), food delivery service (4), get money back (3), contact customer service (2), food discarded even (2)</li>
</ul>
Even after completing verification and payment, users frequently express dissatisfaction with service quality, including customer service, delivery drivers, and delivery times.

<li><b>Foreigner-Specific Issues</b></li>
Key Words:
<ul>
<li>Bigram: need korean (6), without korean (5), able use (5)</li>
<li>Trigram: use non korean (2), without korea phone (2)</li>
</ul>
Foreign users experience inconvenience not only from language barriers but also from structural requirements such as needing a Korean phone number or bank account. N-gram analysis shows that the term "Korean" is more often associated with these requirements than with language support.

<li><b>Mentions of Specific Apps</b></li>
Key Words:
<ul>
<li>Bigram: coupang eats (11), uber eats (6)</li>
<li>Trigram: worst app ever (5)</li>
</ul>
Through bigram analysis, the word “Coupang,” which appeared in single keyword analysis, is revealed to specifically refer to the food delivery app <b>Coupang Eats</b>. Additionally, <b>Uber Eats</b>, which is widely used internationally, also appears. The frequent mentions of specific apps indicate their high market visibility and their role as benchmarks for user expectations.
</ol>


<h3>1-2. Weighted Frequency Analysis</h3>
<h4>1-2-1. Single Keyword Frequency Analysis</h4>

In [0]:
def _weighted_top_terms(exploded_df, term_col, top_n=100):
    """Rank terms by thumbs-up-weighted frequency.

    Each occurrence of a term contributes ``review_thumbsUpCount + 1``, so an
    occurrence in a review with zero thumbs-up still counts once.

    Args:
        exploded_df: DataFrame with one row per term occurrence, containing
            ``term_col`` and ``review_thumbsUpCount``.
        term_col: Name of the column holding the term to group by.
        top_n: Number of highest-weighted terms to keep.

    Returns:
        DataFrame with ``term_col`` and ``weighted_count``, sorted by
        ``weighted_count`` descending and limited to ``top_n`` rows.
    """
    return (
        exploded_df
        # NULL + 1 is NULL and Spark's sum() skips NULLs, which would give such
        # an occurrence weight 0 instead of 1; treat a missing count as 0.
        .fillna(0, subset=["review_thumbsUpCount"])
        .groupBy(term_col)
        # `sum` here is pyspark.sql.functions.sum (imported at the top), not the builtin.
        .agg(sum(col("review_thumbsUpCount") + 1).alias("weighted_count"))
        .orderBy(col("weighted_count").desc())
        .limit(top_n)
    )


# Single words (lower-cased), weighted by thumbs-up count.
weighted_word_counts = _weighted_top_terms(
    words_exploded.withColumn("word", lower(col("word"))),
    "word",
)

# Bigrams, weighted by thumbs-up count.
weighted_bigrams_flat_df = _weighted_top_terms(
    bigrams_df.withColumn("keywords", explode(col("keywords_paired"))),
    "keywords",
)

# Trigrams, weighted by thumbs-up count.
weighted_trigrams_flat_df = _weighted_top_terms(
    trigrams_df.withColumn("keywords", explode(col("keywords_paired"))),
    "keywords",
)

In [0]:
# Display up to 100 weighted single-word rows without truncating cell values.
weighted_word_counts.show(100, truncate = False)

| word        | weighted_count |
|-------------|----------------|
| delivery    | 2201           |
| food        | 1797           |
| order       | 1506           |
| korea       | 1484           |
| korean      | 1474           |
| arc         | 1338           |
| phone       | 1277           |
| get         | 1240           |
| card        | 1200           |
| number      | 1198           |
| even        | 1118           |
| like        | 1062           |
| apps        | 1036           |
| app         | 1017           |
| time        | 929            |
| account     | 928            |
| restaurant  | 925            |
| one         | 854            |
| foreigner   | 813            |
| people      | 808            |
| thing       | 802            |
| review      | 786            |
| need        | 782            |
| back        | 763            |
| country     | 695            |
| go          | 668            |
| place       | 614            |
| use         | 602            |
| item        | 589            |
| way         | 588            |
| got         | 572            |
| know        | 562            |
| pay         | 556            |
| bank        | 555            |
| make        | 538            |
| money       | 510            |
| also        | 509            |
| find        | 504            |
| bad         | 499            |
| really      | 495            |
| service     | 474            |
| every       | 470            |
| work        | 466            |
| want        | 465            |
| connected   | 442            |
| still       | 436            |
| deal        | 429            |
| price       | 425            |
| shuttle     | 412            |
| sub         | 403            |
| registration| 400            |
| business    | 396            |
| register    | 389            |
| driver      | 389            |
| wanted      | 366            |
| tried       | 363            |
| visa        | 363            |
| city        | 362            |
| good        | 361            |
| visit       | 360            |
| without     | 357            |
| problem     | 354            |
| alien       | 350            |
| everything  | 348            |
| think       | 347            |
| day         | 346            |
| identity    | 343            |
| verify      | 339            |
| longterm    | 336            |
| wont        | 335            |
| week        | 335            |
| call        | 330            |
| thought     | 328            |
| come        | 317            |
| let         | 315            |
| many        | 314            |
| used        | 313            |
| isnt        | 310            |
| much        | 310            |
| plastic     | 307            |
| usually     | 306            |
| tip         | 305            |
| able        | 303            |
| great       | 300            |
| another     | 295            |
| spend       | 294            |
| see         | 290            |
| system      | 290            |
| tourist     | 284            |
| feel        | 284            |
| though      | 282            |
| might       | 282            |
| never       | 279            |
| may         | 278            |
| going       | 277            |
| two         | 275            |
| person      | 274            |
| first       | 274            |
| living      | 274            |
| big         | 274            |



단순 frequency 분석은 단어가 얼마나 자주 등장했는지를 보여주지만, 사용자 공감을 반영하지 못한다는 한계가 있습니다. 따라서 이번에는 리뷰의 thumbs up count를 반영하여 가중치를 부여함으로써, 사용자들에게 더 의미 있게 다가간 문제와 맥락을 확인할 수 있었습니다.

인증 및 결제 문제 (가장 높은 비중)

핵심 단어: korea (1484), korean (1474), arc (1338), phone (1277), card (1200), number (1198), account (928), bank (555), registration (400), alien (350), identity (343), verify (339)

해석: 단순 빈도 분석에서도 드러났던 외국인 인증 문제가, 가중치 분석에서는 더 명확하게 최상위 이슈로 부각되었습니다. 특히 ARC(외국인 등록증), 신원 인증, 한국 은행 계좌 및 카드 사용 문제는 사용자들이 실제로 ‘좋아요’를 눌러 공감한 불편 사항으로 확인됩니다.

서비스 이용 및 주문 경험

핵심 단어: delivery (2201), food (1797), order (1506), restaurant (925), driver (389), service (474), shuttle (412), sub (403)

해석: 배달 서비스와 관련된 경험은 여전히 주요 키워드로 등장했습니다. 단순히 ‘많이 언급된 것’에 그치지 않고, thumbs up이 높은 리뷰에 집중적으로 등장한 것을 보면, 음식 배달 품질과 서비스 경험(기사, 배달 지연, 서브 서비스 등)이 사용자 만족에 직결된 요소임을 알 수 있습니다.

외국인 사용자 불편함

핵심 단어: foreigner (813), country (695), without (357), tourist (284), living (274)

해석: 외국인이라는 사용자 정체성과 관련된 키워드도 높은 가중치를 기록했습니다. 단순히 “외국인이라 불편하다”는 표현을 넘어, 거주·관광 맥락 속에서 경험하는 불편함이 공감을 얻은 것으로 보입니다. 이는 서비스가 ‘외국인 맞춤 경험’을 충분히 제공하지 못하고 있음을 반영합니다.

In [0]:
# Display the top 20 weighted bigrams without truncating cell values.
weighted_bigrams_flat_df.show(20, truncate = False)

| keywords          | weighted_count |
|------------------|----------------|
| phone number      | 906           |
| food delivery     | 509           |
| need phone        | 348           |
| visit sub         | 344           |
| alien registration| 343           |
| arc alien         | 343           |
| registration card | 343           |
| number connected  | 340           |
| connected arc     | 340           |
| visa arc          | 338           |
| longterm visa     | 336           |
| apps longterm     | 336           |
| identity apps     | 336           |
| arc arc           | 336           |
| register verify   | 336           |
| arc food          | 336           |
| order register    | 336           |
| verify identity   | 336           |
| deal need         | 336           |
| delivery account  | 336           |



In [0]:
# Display the top 20 weighted trigrams without truncating cell values.
weighted_trigrams_flat_df.show(20, truncate = False)

| keywords                | weighted_count |
|-------------------------|----------------|
| need phone number       | 348           |
| alien registration card | 343           |
| arc alien registration  | 343           |
| connected arc alien     | 340           |
| number connected arc    | 340           |
| phone number connected  | 340           |
| arc arc food            | 336           |
| identity apps longterm  | 336           |
| deal need phone         | 336           |
| order register verify   | 336           |
| longterm visa arc       | 336           |
| verify identity apps    | 336           |
| registration card order | 336           |
| food delivery account   | 336           |
| visa arc arc            | 336           |
| card order register     | 336           |
| arc food delivery       | 336           |
| register verify identity| 336           |
| apps longterm visa      | 336           |
| korean phone number     | 299           |

