# Explore User Group Data - Resident

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, col, isnan, max, sum, explode, split, trim, avg, lag
import matplotlib.pyplot as plt
import pandas as pd

## 1. Overall
To examine the nationality data of foreigners residing in Korea, I used the yearly datasets provided by the 공공데이터포털 (Korea's Public Data Portal), which cover the status of registered (long-term) foreigners by nationality and the status of short-term foreign residents by nationality.

In [0]:
# Attach to the active Spark session, or create one named for this analysis.
spark = (
    SparkSession.builder
    .appName("analysis_user_group")
    .getOrCreate()
)

In [0]:
# Load the silver residents table, restricted to its five most recent years:
# the subquery finds the latest year present and the filter keeps that year
# plus the four before it.
query = """
select *
from workspace.growth_poc.silver_residents
where year >= (select max(year)-4 from workspace.growth_poc.silver_residents)
"""
# residents_df is the base DataFrame used by the cells that follow.
residents_df = spark.sql(query)

In [0]:
# 1. check schema
residents_df.printSchema()
print()
# 2. check column names
print(residents_df.columns)
print()
# 3. check data type
print(residents_df.dtypes)
print()
# 4. check number of rows
print(f"Total number of rows: {residents_df.count()}")
print()
# 5. check statistical status
residents_df.describe().show()

# 6. check missing values
# Every column is checked for nulls, not just the floating-point/bigint ones:
# integer and string columns can be null too, and a null Primary_Language row
# would silently vanish once the column is split and exploded.
# isnan() is only applied to floating-point columns, where NaN can occur.
float_dtypes = ("double", "float")
missing_exprs = []
for name, dtype in residents_df.dtypes:
    is_missing = col(name).isNull()
    if dtype in float_dtypes:
        is_missing = is_missing | isnan(col(name))
    # when() yields null for rows that are not missing, so count() only counts the missing ones
    missing_exprs.append(count(when(is_missing, 1)).alias(name))

# A single aggregation over the DataFrame instead of one Spark job per column.
missing_counts = residents_df.select(*missing_exprs).collect()[0]
for (name, _), null_count in zip(residents_df.dtypes, missing_counts):
    print(f"{name}: {null_count} nulls")



In [0]:
# NOTE: max and sum here are the pyspark.sql.functions aggregates imported at
# the top of the notebook, not the Python builtins.
max_year = residents_df.agg(max("Year")).collect()[0][0]
current_residents_df = residents_df.filter(col("Year") == max_year)

# Total residents per category for the most recent year, computed in one
# grouped aggregation rather than one Spark job per category.
category_totals = {
    row["Resident_Category"]: row["total"]
    for row in current_residents_df
    .groupBy("Resident_Category")
    .agg(sum("Amount").alias("total"))
    .collect()
}

# Fall back to 0 when a category has no rows (or only null amounts), so the
# grand total below never tries to add None.
total_shortterm_residents = category_totals.get("short-term") or 0
total_longterm_residents = category_totals.get("long-term") or 0

print(f"--Number of Foreign Residents in {max_year}--")
print(f"Short term: {total_shortterm_residents}")
print(f"Long term: {total_longterm_residents}")
print(f"Total: {total_shortterm_residents + total_longterm_residents}")

### Number of Foreign Residents in 2024  

| Category    | Count    |
|-------------|----------|
| Short term  | 608,602  |
| Long term   | 1,488,083 |
| **Total**   | **2,096,685** |

In the most recent year, 2024, there were approximately 600,000 short-term foreign residents and about 1.5 million long-term foreign residents, totaling roughly 2.1 million foreigners residing in Korea.


In [0]:
# Yearly resident totals per category, brought to pandas for plotting.
pd_df = (
    residents_df
    .groupBy("Year", "Resident_Category")
    .agg(sum("Amount").alias("count"))
    .toPandas()
)

# One row per year, one column per resident category.
residents_pivot_df = pd_df.pivot_table(
    index="Year", columns="Resident_Category", values="count", aggfunc="sum"
).fillna(0)

# Row-wise sum across the categories gives the yearly grand total.
residents_pivot_df["Total"] = residents_pivot_df.sum(axis=1)

plt.figure(figsize=(8, 4))
for category in residents_pivot_df.columns:
    line_kwargs = {"marker": 'o', "label": category}
    # The grand total gets a fixed colour; the categories use the default cycle.
    if category == "Total":
        line_kwargs["color"] = "firebrick"
    plt.plot(residents_pivot_df.index, residents_pivot_df[category], **line_kwargs)

# Show each year as a whole-number string on the x axis.
year_labels = [str(int(year)) for year in residents_pivot_df.index]
plt.xticks(residents_pivot_df.index, year_labels)

plt.title('Foreign Resident Count by Duration')
plt.xlabel('Year')
plt.ylabel('Count')
plt.grid(True)
plt.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
plt.tight_layout()
plt.show()

![](../analysis/images/EDA_user_group_analysis_resident/overall_line.png)

**Findings** <br/>
The above graph shows the changes over the past five years for short-term foreign residents (orange), long-term foreign residents (blue), and the total (red). While the numbers of both short- and long-term residents have steadily increased since 2021, the number of short-term foreign residents slightly declined in 2024. In contrast, long-term residents and the overall total continued their upward trend.

Based on this pattern, it was considered that short-term and long-term foreign residents may exhibit different characteristics, so the analysis was conducted by examining these two groups separately.

## 2. Short-Term Residents
### 2-1. Nationality
As of 2024, short-term foreign residents come from a total of 190 countries. To identify the countries with the largest numbers of residents, I visualized the data using bar charts.

#### 2-1-1. Current Year


In [0]:
# Top five short-term nationalities in the most recent year, largest first.
top_5_shortterm_nationality_df = (
    current_residents_df
    .filter(col("Resident_Category") == "short-term")
    .groupBy("Nationality_English")
    .agg(sum("Amount").alias("Count"))
    .orderBy(col("Count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_shortterm_nationality_df['Nationality_English']
sizes = top_5_shortterm_nationality_df['Count']

# Reverse the descending order so the largest bar is drawn at the top
# (barh places the first entry at the bottom).
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize=(6, 4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    # .0f keeps the label a whole number even if the counts arrive as floats,
    # matching the format used by the other bar charts in this notebook.
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')
plt.title("Nationality Distribution in Short-Term Residents")
plt.xlabel("Count")
plt.ylabel("Nationality")
plt.tight_layout()
plt.show()
 


In [0]:
# Top five short-term nationalities by average yearly count over the loaded years.
# Amounts are first summed per (Year, nationality) and only then averaged, so
# the figure is an average of yearly totals even if a nationality has more
# than one row in a given year.
top_5_average_shortterm_nationality_df = (
    residents_df
    .filter(col("Resident_Category") == "short-term")
    .groupBy("Year", "Nationality_English")
    .agg(sum("Amount").alias("yearly_count"))
    .groupBy("Nationality_English")
    .agg(avg("yearly_count").alias("average_count"))
    .orderBy(col("average_count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_average_shortterm_nationality_df['Nationality_English']
sizes = top_5_average_shortterm_nationality_df['average_count']

# Reverse the descending order so the largest bar is drawn at the top.
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize=(6, 4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')
plt.title("Average Nationality Distribution in Short-Term Residents")
plt.xlabel("Count")
plt.ylabel("Nationality")
plt.tight_layout()
plt.show()
 


![](../analysis/images/EDA_user_group_analysis_resident/short_count.png)
![](../analysis/images/EDA_user_group_analysis_resident/short_average.png)

**Findings** <br/>
The first graph shows the top five nationalities of short-term foreign residents in 2024, and the second graph displays the average number of residents by nationality over the past five years. The results are largely similar, with Thailand having the highest number—around 140,000 in 2024 (an average of 150,000). This is followed by China (100,000; average 80,000), the United States, and Vietnam. One difference is that in 2024, Japan ranked fifth (about 30,000), whereas over the five-year average, Mongolia held the fifth position. This suggests that the number of short-term residents from Mongolia may have recently declined, or that Japan has shown a significant recent increase.

Based on these results, I visualized the trends of short-term foreign residents from China, Japan, Mongolia, Thailand, the United States, and Vietnam over the past five years using a line chart.

#### 2-1-2. Trend (5 Year)

In [0]:
# Stack the five-year-average top five on top of the current-year top five.
short_top_nationalities = pd.concat(
    [
        top_5_df["Nationality_English"]
        for top_5_df in (
            top_5_average_shortterm_nationality_df,
            top_5_shortterm_nationality_df,
        )
    ]
)

# Keep the first occurrence of each nationality to get the focus list.
unique_short_nationalities = short_top_nationalities.drop_duplicates()
short_countries_to_focus = unique_short_nationalities.tolist()

In [0]:
# Yearly short-term counts for the focus nationalities only.
short_residents_pd = (
    residents_df
    .filter(
        col("Nationality_English").isin(short_countries_to_focus)
        & (col("Resident_Category") == "short-term")
    )
    .groupBy("Year", "Resident_Category", "Nationality_English")
    .agg(sum("Amount").alias("count"))
    .withColumnRenamed("Nationality_English", "Nationality")
    .toPandas()
)

# One row per year, one column per nationality; missing combinations become 0.
shortterm_pivot_df = short_residents_pd.pivot_table(
    index="Year", columns="Nationality", values="count", aggfunc="sum"
).fillna(0)

plt.figure(figsize=(8, 4))
for nationality, yearly_counts in shortterm_pivot_df.items():
    plt.plot(shortterm_pivot_df.index, yearly_counts, marker='o', label=nationality)

# Show each year as a whole-number string on the x axis.
year_labels = [str(int(year)) for year in shortterm_pivot_df.index]
plt.xticks(shortterm_pivot_df.index, year_labels)

plt.title('Short-Term Resident Count by Nationality')
plt.xlabel('Year')
plt.ylabel('Count')
plt.grid(True)
plt.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
plt.tight_layout()
plt.show()
 

![](../analysis/images/EDA_user_group_analysis_resident/short_nat_line.png)

**Findings**<br/>
Thailand consistently holds the top position, but it has shown a declining trend since 2022. China, ranked second, experienced a significant increase in 2023 but saw a slight decrease in 2024. The United States has maintained relatively stable numbers, consistently holding the third position. An interesting point is Japan’s upward trajectory; although it remains fifth in 2024, it has shown continuous growth since 2021. If this trend continues, Japan could potentially develop into a significant market.

Therefore, it is considered reasonable to focus the analysis on the top three countries by current size (Thailand, China, and the United States) along with Japan, which shows high growth potential.

### 2-2. Primary Language
Since the analysis targets foreign users, the languages supported by the delivery app are expected to play an important role in the user experience. Following the nationality-based analysis, I mapped the primary languages of each nationality to examine the languages most commonly used by short-term foreign residents.

For cases where multiple primary languages exist, the data was exploded so that each language carries equal weight, with the resident count copied to every listed language. Please note that this process may result in some residents being counted more than once.

In [0]:
# Turn the comma-separated Primary_Language value into an array column.
language_split_df = residents_df.withColumn(
    "language_list", split(col("Primary_Language"), ',')
)

# One output row per listed language, with surrounding whitespace removed.
exploded_language_df = (
    language_split_df
    .withColumn("language", explode(col("language_list")))
    .withColumn("language", trim(col("language")))
)

# Total amount per language, year and resident category.
language_aggregated_df = (
    exploded_language_df
    .groupBy(col("language"), col("Year"), col("Resident_Category"))
    .agg(sum("Amount").alias("count"))
)

#### 2-2-1. Current Year

In [0]:
# Top five primary languages among short-term residents in the most recent year.
top_5_shortterm_language_df = (
    language_aggregated_df
    .filter((col("Resident_Category") == "short-term") & (col("Year") == max_year))
    .orderBy(col("count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_shortterm_language_df["language"]
sizes = top_5_shortterm_language_df["count"]

# Reverse the descending order so the largest bar is drawn at the top.
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize=(6, 4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    # .0f keeps the label a whole number even if the counts arrive as floats,
    # matching the format used by the average charts.
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')

plt.title("Language Distribution in Short-Term Residents")
plt.xlabel("Count")
plt.ylabel("Language")
plt.xticks(rotation=30)
# tight_layout runs last so it accounts for the rotated tick labels.
plt.tight_layout()
plt.show()


In [0]:
# Top five primary languages among short-term residents by average yearly count.
# language_aggregated_df already holds one total per language/year/category,
# so avg("count") here is an average of yearly totals.
top_5_average_shortterm_language_df = (
    language_aggregated_df
    .filter(col("Resident_Category") == "short-term")
    .groupBy("language")
    .agg(avg("count").alias("average_count"))
    .orderBy(col("average_count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_average_shortterm_language_df['language']
sizes = top_5_average_shortterm_language_df['average_count']

# Reverse the descending order so the largest bar is drawn at the top.
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize=(6, 4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')

plt.title("Average Language Distribution in Short-Term Residents")
plt.xlabel("Count")
plt.ylabel("Language")
plt.xticks(rotation=30)
# tight_layout runs after the title and tick rotation are set so that both
# are included when the margins are computed.
plt.tight_layout()
plt.show()


![](../analysis/images/EDA_user_group_analysis_resident/short_lang_count.png)
![](../analysis/images/EDA_user_group_analysis_resident/short_lang_average.png)

**Findings**<br/>
Unlike the nationality data, where Thailand ranked first in both 2024 and the five-year average, 
the most-used language among short-term foreign residents in 2024 was English. It may reflect either an increase in short-term residents from English-speaking nationalities in 2024 or, as noted earlier, the declining trend of short-term residents from Thailand. However, the difference in the number of residents using the top three languages is relatively small, around 7–8%.

An interesting point is that Japanese-speaking short-term residents ranked fifth in 2024. As previously observed, Japan is an emerging group with continuous growth since 2021, and the Japanese-speaking population has also been steadily increasing, highlighting its relevance for language support strategies.

Additionally, Russian ranked fifth on the five-year average among short-term residents, even though no Russian-speaking country appeared in the nationality-based top five. This provides direct evidence that “nationalities with many short-term residents” do not necessarily coincide with “languages that should be prioritized for support.”

The trends of short-term residents using English, Japanese, Mandarin, Russian, Thai, and Vietnamese over the past five years, as indicated by the two bar charts, were visualized in a line chart below.

#### 2-2-2. Trend (5 Year)

In [0]:
# Stack the five-year-average top five on top of the current-year top five.
short_top_languages = pd.concat(
    [
        top_5_df["language"]
        for top_5_df in (
            top_5_average_shortterm_language_df,
            top_5_shortterm_language_df,
        )
    ]
)

# Keep the first occurrence of each language to get the focus list.
short_languages_to_focus = short_top_languages.drop_duplicates().tolist()

# Yearly short-term counts for the focus languages only.
short_residents_languages_pd = (
    language_aggregated_df
    .filter(
        col("language").isin(short_languages_to_focus)
        & (col("Resident_Category") == "short-term")
    )
    .groupBy("Year", "Resident_Category", "language")
    .agg(sum("count").alias("count"))
    .toPandas()
)

# One row per year, one column per language; missing combinations become 0.
shortterm_language_pivot_df = short_residents_languages_pd.pivot_table(
    index="Year", columns="language", values="count", aggfunc="sum"
).fillna(0)

plt.figure(figsize=(8, 4))
for language, yearly_counts in shortterm_language_pivot_df.items():
    plt.plot(shortterm_language_pivot_df.index, yearly_counts, marker='o', label=language)

# Show each year as a whole-number string on the x axis.
year_labels = [str(int(year)) for year in shortterm_language_pivot_df.index]
plt.xticks(shortterm_language_pivot_df.index, year_labels)

plt.title('Short-Term Resident Count by Primary Language')
plt.xlabel('Year')
plt.ylabel('Count')
plt.grid(True)
plt.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
plt.tight_layout()
plt.show()

![](../analysis/images/EDA_user_group_analysis_resident/short_lang_line.png)

**Findings**<br/>
By comparing the nationality-based analysis with the language-based analysis, the following key insights were identified:
<ol>
<li>The influence of English: While Thailand ranked first by nationality, English was the top language in 2024. This indicates that rather than focusing solely on a single nationality, English—widely used across multiple nationalities—may have greater market potential.</li>

<li>Reaffirming Japan’s growth: Japanese ranked fifth in 2024, consistent with the nationality-based analysis, once again confirming its potential as an emerging group.</li>

<li>Discovery of a new market: Russian, which did not appear in the nationality rankings, ranked fifth in the five-year language average. This highlights the important fact that “countries with many residents” may differ from “languages actually used,” a point that must be considered when establishing language support policies.</li>
</ol>

Taken together, the results suggest that the most effective strategy would be to prioritize English, Thai, and Mandarin—languages with stable user bases—along with Japanese, which shows clear growth potential, as the key supported languages.

## 3. Long-Term Residents

### 3-1. Nationality
As of 2024, long-term foreign residents come from a total of 188 countries. To identify the countries with the largest resident populations, I visualized the data using bar charts.

#### 3-1-1. Current

In [0]:
# Top five long-term nationalities in the most recent year, largest first.
top_5_longterm_nationality_df = (
    current_residents_df
    .filter(col("Resident_Category") == "long-term")
    .groupBy("Nationality_English")
    .agg(sum("Amount").alias("Count"))
    .orderBy(col("Count").desc())
    .limit(5)
    .toPandas()
)

# Reverse the descending order so the largest bar is drawn at the top.
labels = top_5_longterm_nationality_df['Nationality_English'][::-1]
sizes = top_5_longterm_nationality_df['Count'][::-1]

plt.figure(figsize=(6, 4))
bars = plt.barh(labels, sizes)

# Leave room on the right for the value labels.
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    y_center = bar.get_y() + bar.get_height()/2
    plt.text(width, y_center, f'{width:,.0f}', ha='left', va='center')

plt.title("Nationality Distribution in Long-Term Residents")
plt.xlabel("Count")
plt.ylabel("Nationality")
plt.tight_layout()
plt.show()
 


In [0]:
# Top five long-term nationalities by average yearly count over the loaded years.
# Amounts are first summed per (Year, nationality) and only then averaged, so
# the figure is an average of yearly totals even if a nationality has more
# than one row in a given year.
top_5_average_longterm_nationality_df = (
    residents_df
    .filter(col("Resident_Category") == "long-term")
    .groupBy("Year", "Nationality_English")
    .agg(sum("Amount").alias("yearly_count"))
    .groupBy("Nationality_English")
    .agg(avg("yearly_count").alias("average_count"))
    .orderBy(col("average_count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_average_longterm_nationality_df['Nationality_English']
sizes = top_5_average_longterm_nationality_df['average_count']

# Reverse the descending order so the largest bar is drawn at the top.
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize=(6, 4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')
plt.title("Average Nationality Distribution in Long-Term Residents")
plt.xlabel("Count")
plt.ylabel("Nationality")
plt.tight_layout()
plt.show()


![](../analysis/images/EDA_user_group_analysis_resident/long_nat_count.png)
![](../analysis/images/EDA_user_group_analysis_resident/long_nat_average.png)

**Findings**<br/>
As with the analysis of short-term foreign residents, the first graph shows the top five nationalities of long-term foreign residents in 2024, while the second graph presents the average number of residents by nationality over the past five years. The two results are largely consistent, indicating that the trends of the past five years have continued steadily to the present.

China (460,000; five-year average 440,000) and Vietnam (260,000; five-year average 200,000) together account for about 80% of the top five countries, forming the dominant share of long-term foreign residents. In contrast, Uzbekistan, Nepal, and Cambodia show relatively similar numbers without large differences.

Based on this, I visualized the five-year trends of long-term foreign residents from China, Vietnam, Uzbekistan, Nepal, and Cambodia using a line chart.


#### 3-1-2. Trend (5 Year)

In [0]:
# Stack the five-year-average top five on top of the current-year top five.
long_top_nationalities = pd.concat(
    [
        top_5_df["Nationality_English"]
        for top_5_df in (
            top_5_average_longterm_nationality_df,
            top_5_longterm_nationality_df,
        )
    ]
)

# Keep the first occurrence of each nationality to get the focus list.
unique_long_nationalities = long_top_nationalities.drop_duplicates()
long_countries_to_focus = unique_long_nationalities.tolist()

In [0]:
# Yearly long-term counts for the focus nationalities only.
long_residents_pd = (
    residents_df
    .filter(
        col("Nationality_English").isin(long_countries_to_focus)
        & (col("Resident_Category") == "long-term")
    )
    .groupBy("Year", "Resident_Category", "Nationality_English")
    .agg(sum("Amount").alias("count"))
    .withColumnRenamed("Nationality_English", "Nationality")
    .toPandas()
)

# One row per year, one column per nationality; missing combinations become 0.
longterm_pivot_df = long_residents_pd.pivot_table(
    index="Year", columns="Nationality", values="count", aggfunc="sum"
).fillna(0)

plt.figure(figsize=(8, 4))
for nationality, yearly_counts in longterm_pivot_df.items():
    plt.plot(longterm_pivot_df.index, yearly_counts, marker='o', label=nationality)

# Show each year as a whole-number string on the x axis.
year_labels = [str(int(year)) for year in longterm_pivot_df.index]
plt.xticks(longterm_pivot_df.index, year_labels)

plt.title('Long-Term Resident Count by Nationality')
plt.xlabel('Year')
plt.ylabel('Count')
plt.grid(True)
plt.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
plt.tight_layout()
plt.show()

![](../analysis/images/EDA_user_group_analysis_resident/long_nat_line.png)

**Findings**<br/>
The top two countries account for about 80% of long-term residents among the top five, indicating that the long-term foreign resident population is heavily concentrated in a small number of nationalities. Considering this imbalance, it is reasonable at this stage to focus the core analysis on China and Vietnam, which represent the largest groups.

### 3-2. Language
I mapped the primary languages of each nationality to examine the languages most commonly used by long-term foreign residents. In cases where a country uses multiple languages, the resident count was duplicated across those languages to give each language equal weight, which may result in some overestimation in the figures.


#### 3-2-1. Current & Trend (5 Year)

In [0]:
# Top five primary languages among long-term residents in the most recent year.
top_5_longterm_language_df = (
    language_aggregated_df
    .filter((col("Resident_Category") == "long-term") & (col("Year") == max_year))
    .orderBy(col("count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_longterm_language_df["language"]
sizes = top_5_longterm_language_df["count"]

# Reverse the descending order so the largest bar is drawn at the top.
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize=(6, 4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    # .0f keeps the label a whole number even if the counts arrive as floats,
    # matching the format used by the average charts.
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')

plt.title("Language Distribution in Long-Term Residents")
plt.xlabel("Count")
plt.ylabel("Language")
plt.tight_layout()
plt.show()


In [0]:
# Top five primary languages among long-term residents by average yearly count.
# language_aggregated_df already holds one total per language/year/category,
# so avg("count") here is an average of yearly totals.
top_5_average_longterm_language_df = (
    language_aggregated_df
    .filter(col("Resident_Category") == "long-term")
    .groupBy("language")
    .agg(avg("count").alias("average_count"))
    .orderBy(col("average_count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_average_longterm_language_df['language']
sizes = top_5_average_longterm_language_df['average_count']

# Reverse the descending order so the largest bar is drawn at the top.
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize=(6, 4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')

plt.title("Average Language Distribution in Long-Term Residents")
plt.xlabel("Count")
plt.ylabel("Language")
# tight_layout runs after the title is set so the title is included when the
# margins are computed.
plt.tight_layout()
plt.show()


In [0]:
# Stack the five-year-average top five on top of the current-year top five.
long_top_languages = pd.concat(
    [
        top_5_df["language"]
        for top_5_df in (
            top_5_average_longterm_language_df,
            top_5_longterm_language_df,
        )
    ]
)

# Keep the first occurrence of each language to get the focus list.
unique_long_languages = long_top_languages.drop_duplicates()
long_languages_to_focus = unique_long_languages.tolist()

In [0]:
# Yearly long-term counts for the focus languages only.
long_residents_languages_pd = (
    language_aggregated_df
    .filter(
        col("language").isin(long_languages_to_focus)
        & (col("Resident_Category") == "long-term")
    )
    .toPandas()
)

# One row per year, one column per language; missing combinations become 0.
longterm_language_pivot_df = long_residents_languages_pd.pivot_table(
    index="Year", columns="language", values="count", aggfunc="sum"
).fillna(0)

plt.figure(figsize=(8, 4))
for language, yearly_counts in longterm_language_pivot_df.items():
    plt.plot(longterm_language_pivot_df.index, yearly_counts, marker='o', label=language)

# Show each year as a whole-number string on the x axis.
year_labels = [str(int(year)) for year in longterm_language_pivot_df.index]
plt.xticks(longterm_language_pivot_df.index, year_labels)

plt.title('Long-Term Resident Count by Primary Language')
plt.xlabel('Year')
plt.ylabel('Count')
plt.grid(True)
plt.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
plt.tight_layout()
plt.show()

![](../analysis/images/EDA_user_group_analysis_resident/long_lang_count.png)


![](../analysis/images/EDA_user_group_analysis_resident/long_lang_average.png)


![](../analysis/images/EDA_user_group_analysis_resident/long_lang_line.png)

**Findings** <br/>
When comparing nationality-based and language-based data, the overall trends appear similar. Mandarin not only dominates as the top language but has also shown an upward trend since 2022. 

Vietnamese and English have likewise demonstrated steady growth.

Interestingly, English—which did not appear in the top nationality rankings—ranked third in both 2024 and the five-year average. This, as with the short-term resident data, highlights that the top nationalities and the actual distribution of languages used may differ.

That said, it can be hypothesized that long-term residents, given their extended stay in Korea, face fewer language barriers compared to short-term residents. This could serve as an important criterion for prioritizing supported languages when resources are limited. 
However, a limitation is that the current dataset does not contain the data needed to measure the actual Korean proficiency of long-term residents.

Based on this hypothesis, a phased strategy appears most effective: initially focusing on Mandarin, which has an overwhelmingly large user base, and subsequently validating the need for additional language support through user feedback or supplementary data.