# Explore User Group Data - Arrivals

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, col, isnan, max, sum, explode, split, trim, avg, lag, round
import matplotlib.pyplot as plt
import pandas as pd
import builtins
from dateutil.relativedelta import relativedelta
import numpy as np

## 1. Overall
To examine foreigners entering Korea data, I used the monthly datasets provided by the 관광지식정보시스템, which provides the number of foreign arrivals by nationality and purpose, and by nationality and age group.

In [0]:
spark = SparkSession.builder.appName("analysis_user_group_arrivlas").getOrCreate()

In [0]:
query = """
select *
from workspace.growth_poc.silver_visitors
"""

columns_to_rename= {"Year" : "visit_year", "Month" : "visit_month", "Date" : "visit_date"}
arr_np_df = spark.sql(query).withColumnsRenamed(columns_to_rename)

In [0]:
query = """
select *
from workspace.growth_poc.silver_visitors_agegroup_nationality
"""
arr_na_df = spark.sql(query)

In [0]:
# 1. check schema
arr_np_df.printSchema()
print()
# 2. check column names
print(arr_np_df.columns)
print()
# 3. check data typue
print(arr_np_df.dtypes)
print()
# 4. check number of rows
print(f"Total number of rows: {arr_np_df.count()}")
print()
# 5. check statistical status
arr_np_df.describe().show()

# 6. check missing values
# get numeric columns
numeric_cols = [name for name, dtype in arr_np_df.dtypes if dtype in ('double', 'float', 'bigint')]
# numeric columns: count null values
for c in numeric_cols:
    null_count = arr_np_df.select(count(when(col(c).isNull() | isnan(c), c))).collect()[0][0]
    print(f"{c}: {null_count} nulls")



In [0]:
# 1. check schema
arr_na_df.printSchema()
print()
# 2. check column names
print(arr_na_df.columns)
print()
# 3. check data typue
print(arr_na_df.dtypes)
print()
# 4. check number of rows
print(f"Total number of rows: {arr_na_df.count()}")
print()
# 5. check statistical status
arr_na_df.describe().show()

# 6. check missing values
# get numeric columns
numeric_cols = [name for name, dtype in arr_na_df.dtypes if dtype in ('double', 'float', 'bigint')]
# numeric columns: count null values
for c in numeric_cols:
    null_count = arr_na_df.select(count(when(col(c).isNull() | isnan(c), c))).collect()[0][0]
    print(f"{c}: {null_count} nulls")



In [0]:

arr_np_df = arr_np_df.withColumn(
    "purpose_english",
    when(arr_np_df["Purpose"] == "유학연수", "Student")
    .when(arr_np_df["Purpose"] == "기타", "Other")
    .when(arr_np_df["Purpose"] == "상용", "Business")
    .when(arr_np_df["Purpose"] == "공용", "Official Business")
    .when(arr_np_df["Purpose"] == "관광", "Tourist")
    .otherwise("Other")
).withColumn(
    "visit_category",
    when(arr_np_df["Purpose"] == "유학연수", "long")
    .when(arr_np_df["Purpose"] == "기타", "short")
    .when(arr_np_df["Purpose"] == "상용", "short")
    .when(arr_np_df["Purpose"] == "공용", "long")
    .when(arr_np_df["Purpose"] == "관광", "short")
    .otherwise("Other")
)

In [0]:
max_year_np = arr_np_df.select(col("visit_year").cast("int")).agg(max("visit_year")).collect()[0][0]
max_year_na = arr_na_df.select(col("visit_year").cast("int")).agg(max("visit_year")).collect()[0][0]
max_year = max_year_np

max_month_np = arr_np_df.select(col("visit_date")).agg(max("visit_date")).collect()[0][0]
max_month_na = arr_na_df.select(col("visit_date")).agg(max("visit_date")).collect()[0][0]
max_month = max_month_np

if max_year_na != max_year_np:
    print(f"max_year_np: {max_year_np}")
    print(f"max_year_na: {max_year_na}")
    max_year = builtins.min(max_year_na, max_year_np)

if max_month_na != max_month_np:
    print(f"max_month_np: {max_month_np}")
    print(f"max_month_na: {max_month_na}")
    max_month = builtins.min(max_month_na, max_month_np)

print(f"max_year: {max_year}")
print(f"max_month: {max_month}")

In [0]:
# get only the last 3 years data
three_year_arr_np_df = arr_np_df.filter(col("visit_year") >= max_year-2)
three_year_arr_na_df = arr_na_df.filter(col("visit_year") >= max_year-2)

# get the recent 12 months data 
date_11_months_ago = max_month - relativedelta(months=11)

one_year_arr_np_df = arr_np_df.filter(col("visit_date") >= date_11_months_ago)
one_year_arr_na_df = arr_na_df.filter(col("visit_date") >= date_11_months_ago)

# get the most recent data 
current_arr_np_df = arr_np_df.filter(col("visit_date") == max_month)
current_arr_na_df = arr_na_df.filter(col("visit_date") == max_month)

In [0]:
print(f"--Number of Foreign Arrivals in the past 12 months--")
print(f"Total: {current_arr_np_df.select("Amount").agg(sum("Amount")).collect()[0][0]:,}")

### Number of Foreign Arrivals in the past 12 months: 1,735,147

In the past 12 months, there were approximately 1.7 million foreign arrivals entering Korea.


In [0]:
pd_df = three_year_arr_np_df.groupBy("visit_date").agg(sum("Amount").alias("Amount")).withColumnRenamed("visit_date", "Date").orderBy("Date").toPandas()

plt.figure(figsize=(9, 4))
plt.plot(pd_df["Date"], pd_df["Amount"], marker = 'o')

plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Foreign Arrivals in the Past 3 Years')
plt.grid(True)
plt.tight_layout()
plt.show()


![](../analysis/images/EDA_user_group_analysis_arrivals/overall_line.png)

**Findings** <br/>
The above graph shows changes in foreign arrivals over the past three years. While the overall trend is upward, consistent drops are observed between October and January, followed by a recovery in March.


## 2. Nationality

As of July 2025, foreigners from 60 different countries enter Korea. To identify the countries with the largest numbers of arrivals, I visualized the data using bar charts.

### 2-1. Top 5 Countries


In [0]:
np_curr_top_5 = one_year_arr_np_df.groupBy("English_Nationality") \
    .agg(sum("Amount").alias("Count")) \
    .orderBy(col("Count").desc())\
    .limit(5) \
    .withColumnRenamed("English_Nationality", "Nationality").toPandas()

labels = np_curr_top_5["Nationality"]
sizes = np_curr_top_5["Count"]

# reverse the order(desc)
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,}', ha='left', va='center')
plt.title("Nationality Distribution in the last 12 months")
plt.xlabel("Count")
plt.ylabel("Nationality")
plt.tight_layout()
plt.show()


In [0]:
np_avg_top_5 = three_year_arr_np_df.groupBy("visit_year", "English_Nationality") \
    .agg(sum("Amount").alias("yearly_total"))\
    .groupBy("English_Nationality")\
    .agg(avg("yearly_total").alias("Count"))\
    .orderBy(col("Count").desc())\
    .limit(5)\
    .withColumnRenamed("English_Nationality", "Nationality")\
    .toPandas()

labels = np_avg_top_5["Nationality"]
sizes = np_curr_top_5["Count"]

# reverse the order(desc)
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')
plt.title("Monthly Average Nationality Distribution in the last 3 years")
plt.xlabel("Count")
plt.ylabel("Nationality")
plt.tight_layout()
plt.show()


![](../analysis/images/EDA_user_group_analysis_arrivals/nat_top_5.png)
![](../analysis/images/EDA_user_group_analysis_arrivals/nat_avg_5.png)

**Findings** <br/>
The data on foreign arrivals in Korea, looking at both the last 12 months and the three-year average, shows a consistent trend. China is the clear leader with roughly 5 million recent arrivals (3.2 million on average), followed by Japan (3.5 million; 2.5 million avg.), Taiwan, the US, and Hong Kong.

However, the key insight comes from comparing this "visitor" data with "resident" data. It's interesting to see that Japan, the second-largest source of arrivals, barely made the top five for short-term residents. Similarly, Taiwan(ranking third for arrivals) and Hong Kong (ranking fifth for arrivals) didn't appear in the top resident rankings at all.

This contrast strongly suggests that even within the broad "foreigner" category, there are distinct customer segments. These are primarily short-term visitors (like tourists from Japan, Taiwan, and Hong Kong) and long-term residents. Therefore, the unique needs of each persona must be analyzed separately, and strategies should be tailored accordingly.
 

## 3. Language
Building on the previous analysis, the data suggests that customer groups can be divided into different segments with different needs.  For instance, short-term visitors are likely to have a greater need for language support compared to long-term residents. With this hypothesis, I analyzed the top languages spoken by the foriegn arrivals in Korea. In cases where a country uses multiple languages, the resident count was duplicated across those languages to make the weight of each langauge equal, which may result in some multi-counting in the figures.

### 3-1. Top 5 Langauges


In [0]:
one_year_lang_np_df = one_year_arr_np_df.withColumn("languages_arr", split(col("Primary_Language"), ",")) \
    .withColumn("language", explode(col("languages_arr")))
one_year_lang_np_df = one_year_lang_np_df.withColumn("language", trim(col("language")))


In [0]:
np_lang_curr_top_5 = one_year_lang_np_df.groupBy("language") \
    .agg(sum("Amount").alias("Count")) \
    .orderBy(col("Count").desc())\
    .limit(5).toPandas()

labels = np_lang_curr_top_5["language"]
sizes = np_lang_curr_top_5["Count"]

# reverse the order(desc)
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,}', ha='left', va='center')
plt.title("Language Distribution in the last 12 months")
plt.xlabel("Count")
plt.ylabel("Language")
plt.tight_layout()
plt.show()


In [0]:
three_year_lang_np_df = three_year_arr_np_df.withColumn("languages_arr", split(col("Primary_Language"), ",")) \
    .withColumn("language", explode(col("languages_arr")))
three_year_lang_np_df = three_year_lang_np_df.withColumn("language", trim(col("language")))

lang_avg_top_5 = three_year_lang_np_df.groupBy("visit_year", "language") \
    .agg(sum("Amount").alias("yearly_total"))\
    .groupBy("language")\
    .agg(avg("yearly_total").alias("Count"))\
    .orderBy(col("Count").desc())\
    .limit(5)\
    .toPandas()

labels = lang_avg_top_5["language"]
sizes = lang_avg_top_5["Count"]

# reverse the order(desc)
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')
plt.title("Average Language Distribution over the past 3 years")
plt.xlabel("Count")
plt.ylabel("Language")
plt.tight_layout()
plt.show()


![](../analysis/images/EDA_user_group_analysis_arrivals/lang_count.png)
![](../analysis/images/EDA_user_group_analysis_arrivals/lang_average.png)

**Findings** <br/>
An analysis of the top 5 languages used by foreign arrivals shows that Mandarin is the overwhelming number one. In the last 12 months, the number of Mandarin speakers was 1.8 times higher than English (the second-ranked language), and 1.6 times higher based on the three-year average. This appears to be because both China and Taiwan, the first and third largest sources of arrivals, use Mandarin.

English followed in second place, likely due to arrivals from the US (4th) and Hong Kong (5th), as well as from various other countries where English is a common language.

What's particularly noteworthy is Japanese, which ranked third. Despite being used almost exclusively by arrivals from Japan, it has consistently maintained a top position in both the last 12 months and the three-year average. This suggests a stable and significant number of Japanese visitors to Korea. Therefore, it could be an effective strategy to not only support the Japanese language but also to develop marketing strategies specifically tailored for Japanese customers.

## 4. Purpose

### 4-1. Rank Purpose

In [0]:
purp_curr_top_5 = one_year_arr_np_df.groupBy("purpose_english") \
    .agg(sum("Amount").alias("Count")) \
    .orderBy(col("Count").desc())\
    .limit(5).toPandas()

labels = purp_curr_top_5["purpose_english"]
sizes = purp_curr_top_5["Count"]

# reverse the order(desc)
# labels = labels[::-1]
# sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.bar(labels, sizes)

# add margin to the right
plt.ylim(0, sizes.max() * 1.25)

for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, height,
             f'{height:,}', ha='center', va='bottom')
plt.title("Entry Purpose in the last 12 months")
plt.xlabel("Count")
plt.ylabel("Purpose")
plt.tight_layout()
plt.show()


In [0]:
tour = 14741307
other = 2414704
remaining = 479436+139720+74137
print(tour/(tour+other+remaining))
print(other/(tour+other+remaining))

![](./images/EDA_user_group_analysis_arrivals/purpose_count.png)

**Findings**<br/>
An analysis of foreign visitor data from the last 12 months, categorized by purpose of entry, reveals that approximately 83% visited Korea for "Tourism." This suggests that tourists constitute a significant portion of the delivery app's potential customer base.

Given that most tourists are presumed to be short-term visitors, it is highly unlikely they would learn Korean to use a delivery app during their brief stay. For travelers seeking diverse experiences within a limited time, a language barrier can be a critical factor that leads them to abandon the app.

Therefore, we conclude that an intuitive multilingual support feature is essential to attract foreign tourists as a core customer segment.
 




### 4-2. Purpose in Top Nationalities

In [0]:
top_nationality = np_curr_top_5["Nationality"].drop_duplicates().tolist()
top_language = np_lang_curr_top_5["language"].drop_duplicates().tolist()
print(top_nationality)
print(top_language)

In [0]:
from pyspark.sql.window import Window
# get data only for top nationality
purp_nat_df = one_year_arr_np_df.filter(col("English_Nationality").isin(top_nationality)) \
    .groupBy("English_Nationality", "purpose_english") \
    .agg(sum("Amount").alias("Amount"))

nat_window_spec_numerator = Window.partitionBy("English_Nationality", "purpose_english")
nat_window_spec_denominator = Window.partitionBy("English_Nationality")

purp_nat_df = purp_nat_df.withColumn("ratio", round(sum("Amount").over(nat_window_spec_numerator)/sum("Amount").over(nat_window_spec_denominator),2))



In [0]:
# get data only for top language
purp_lang_df = one_year_lang_np_df.filter(col("language").isin(top_language))\
    .groupBy("language", "purpose_english") \
    .agg(sum("Amount").alias("Amount"))
lang_window_spec_numerator = Window.partitionBy("language", "purpose_english")
lang_window_spec_denominator = Window.partitionBy("language")

purp_lang_df = purp_lang_df.withColumn("ratio", round(sum("Amount").over(lang_window_spec_numerator)/sum("Amount").over(lang_window_spec_denominator),2))

In [0]:
# pyspark df -> pandas 변환
pdf = purp_nat_df.toPandas()

# pivot: language 기준으로 purpose_english 별 ratio 분리
pivot_df = pdf.pivot_table(
    index="English_Nationality",
    columns="purpose_english",
    values="ratio",
    aggfunc="sum",  # ratio는 이미 비율이라 sum 사용 (중복 없으면 그대로)
    fill_value=0
)

# pivot 후 컬럼 순서 재정렬
pivot_df = pivot_df.reindex(top_nationality)

# stacked bar chart
ax = pivot_df.plot(
    kind="bar",
    stacked=True,
    figsize=(12,6),
    rot=0
)

for p in ax.patches:
    height = p.get_height()
    if height > 0:
        ax.annotate(f'{height*100:.1f}%',  # 0~1 -> 0~100%
                    (p.get_x() + p.get_width()/2, p.get_y() + height/2),
                    ha='center', va='center', fontsize=8)

plt.ylabel("Ratio")
plt.xlabel("Country")
plt.title("Purpose Distribution by Country (Stacked Ratios)")
plt.legend(title="Purpose", bbox_to_anchor=(1.05, 1), loc="upper left")  # 범례 오른쪽 밖으로
plt.tight_layout()
plt.show()

### 4-3. Purpose by Top Languages

In [0]:
# pyspark df -> pandas 변환
pdf = purp_lang_df.toPandas()

# pivot: language 기준으로 purpose_english 별 ratio 분리
pivot_df = pdf.pivot_table(
    index="language",
    columns="purpose_english",
    values="ratio",
    aggfunc="sum",  # ratio는 이미 비율이라 sum 사용 (중복 없으면 그대로)
    fill_value=0
)

pivot_df = pivot_df.reindex(top_language)

# stacked bar chart
ax = pivot_df.plot(
    kind="bar",
    stacked=True,
    figsize=(12,6),
    rot=0
)

for p in ax.patches:
    height = p.get_height()
    if height > 0:
        ax.annotate(f'{height*100:.1f}%',  # 0~1 -> 0~100%
                    (p.get_x() + p.get_width()/2, p.get_y() + height/2),
                    ha='center', va='center', fontsize=8)

plt.ylabel("Ratio")
plt.xlabel("Language")
plt.title("Purpose Distribution by Language (Stacked Ratios)")
plt.legend(title="Purpose", bbox_to_anchor=(1.05, 1), loc="upper left")  # 범례 오른쪽 밖으로
plt.tight_layout()
plt.show()

![](./images/EDA_user_group_analysis_arrivals/purpose_nat.png)
![](./images/EDA_user_group_analysis_arrivals/purpose_lang.png)

**Findings** <br/>
Following the previous analysis, I examined whether it is valid to target short-term tourists as the main audience for the delivery app. I analyzed the purpose of visit for the top 5 countries by arrivals and the top 5 languages spoken by visitors.

The analysis revealed that the proportion of tourism-related visits in these key groups (e.g., 98% for Japan, 99% for Taiwan) was significantly higher than the overall average for all foreigners.

In other words, the largest groups of foreigners visiting Korea are overwhelmingly tourists, making it a very reasonable strategy to define them as the core potential customers for the delivery app.

## 5. Age Group

### 5-1. Top 5 Age Group

### 5-2. Trend