# User Group Analysis
To identify the core target groups, I analyzed data on foreign residents and on foreign visitors.

## 1. Foreign residents


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, col, isnan, max, sum, explode, split, trim, avg, lag
import matplotlib.pyplot as plt
import pandas as pd

In [0]:
# Attach to the active Spark session if there is one; otherwise start a new
# session named for this analysis.
spark = (
    SparkSession.builder
    .appName("analysis_user_group")
    .getOrCreate()
)

In [0]:
# Load resident records for the latest year in the table and the four years
# before it (year >= max(year) - 4), excluding rows whose Nationality equals
# 'undefined' in any letter case.
query = """
select *
from workspace.growth_poc.silver_residents
where year >= (select max(year)-4 from workspace.growth_poc.silver_residents)
    AND lower(Nationality) <> 'undefined'
"""
# Lazily-evaluated Spark DataFrame; every later residents cell derives from it.
residents_df = spark.sql(query)

In [0]:
# Quick exploratory checks on residents_df: schema, columns, dtypes, row count,
# summary statistics, and missing values in numeric columns.

# 1. check schema
residents_df.printSchema()
print()
# 2. check column names
print(residents_df.columns)
print()
# 3. check data types
print(residents_df.dtypes)
print()
# 4. check number of rows
print(f"Total number of rows: {residents_df.count()}")
print()
# 5. check summary statistics
residents_df.describe().show()

# 6. check missing values
# Spark reports integral columns as tinyint/smallint/int/bigint and decimals as
# "decimal(p,s)"; matching only double/float/bigint would silently skip the rest.
float_types = ('double', 'float')
integral_types = ('tinyint', 'smallint', 'int', 'bigint')
numeric_cols = [
    name for name, dtype in residents_df.dtypes
    if dtype in float_types + integral_types or dtype.startswith('decimal')
]
# NaN can only occur in float/double columns, so isnan() is applied only there.
float_cols = {name for name, dtype in residents_df.dtypes if dtype in float_types}

# numeric columns: count null (and NaN) values for all columns in one Spark job
if numeric_cols:
    null_counts = residents_df.select([
        count(when((col(c).isNull() | isnan(c)) if c in float_cols else col(c).isNull(), c)).alias(c)
        for c in numeric_cols
    ]).collect()[0]
    for c in numeric_cols:
        print(f"{c}: {null_counts[c]} nulls")



### 1-1. Nationality


In [0]:
# Restrict to the most recent year present in residents_df and report how many
# short-term and long-term residents it contains.
max_year = residents_df.agg(max("Year")).collect()[0][0]
current_residents_df = residents_df.filter(col("Year") == max_year)

# One grouped aggregation instead of a separate Spark job per category.
category_totals = {
    row["Resident_Category"]: row["total"]
    for row in current_residents_df
        .groupBy("Resident_Category")
        .agg(sum("Amount").alias("total"))
        .collect()
}

# A category with no rows (or only null Amounts) yields None; treat it as 0 so
# the grand total below does not raise a TypeError.
total_shortterm_residents = category_totals.get("short-term") or 0
total_longterm_residents = category_totals.get("long-term") or 0

print(f"--Number of Foreign Residents in {max_year}--")
print(f"Short term: {total_shortterm_residents}")
print(f"Long term: {total_longterm_residents}")
print(f"Total: {total_shortterm_residents + total_longterm_residents}")

#### 1-1-1. Current

current 

In [0]:
# Top 5 nationalities among short-term residents in the latest year, drawn as a
# horizontal bar chart with the largest group on top.
top_5_shortterm_nationality_df = current_residents_df.filter((col("Resident_Category") == "short-term"))\
    .groupBy("Nationality_English")\
    .agg(sum("Amount").alias("Count")) \
    .orderBy(col("Count").desc()) \
    .limit(5) \
    .toPandas()

labels = top_5_shortterm_nationality_df['Nationality_English']
sizes = top_5_shortterm_nationality_df['Count']

# barh draws the first item at the bottom, so reverse the descending order
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    # .0f keeps the label a whole number even when the bar width is a float,
    # matching the formatting of the long-term chart
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')
plt.title("Nationality Distribution in Short-Term Residents")
plt.xlabel("Count")
plt.ylabel("Nationality")
plt.tight_layout()
plt.show()
 


In [0]:
# Top 5 nationalities among long-term residents in the latest year.
top_5_longterm_nationality_df = (
    current_residents_df
    .filter(col("Resident_Category") == "long-term")
    .groupBy("Nationality_English")
    .agg(sum("Amount").alias("Count"))
    .orderBy(col("Count").desc())
    .limit(5)
    .toPandas()
)

# Reverse the descending order so the largest group is drawn at the top.
labels = top_5_longterm_nationality_df['Nationality_English'].iloc[::-1]
sizes = top_5_longterm_nationality_df['Count'].iloc[::-1]

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.barh(labels, sizes)

# Leave room on the right for the value labels.
ax.set_xlim(0, sizes.max() * 1.25)

for bar in bars:
    bar_value = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    ax.text(bar_value, bar_middle, f'{bar_value:,.0f}', ha='left', va='center')

ax.set_title("Nationality Distribution in Long-Term Residents")
ax.set_xlabel("Count")
ax.set_ylabel("Nationality")
fig.tight_layout()
plt.show()
 


average

In [0]:
# Top 5 nationalities by average yearly short-term resident count over the
# years loaded into residents_df.
#
# Amount is first summed per (Year, Nationality_English) and only then averaged
# across years. Averaging the raw rows directly would return a per-row mean
# whenever a nationality has more than one row in a year; the two approaches
# agree when there is exactly one row per year and nationality.
yearly_shortterm_nationality_df = residents_df.filter((col("Resident_Category") == "short-term"))\
    .groupBy("Year", "Nationality_English")\
    .agg(sum("Amount").alias("yearly_count"))

top_5_average_shortterm_nationality_df = yearly_shortterm_nationality_df\
    .groupBy("Nationality_English")\
    .agg(avg("yearly_count").alias("average_count")) \
    .orderBy(col("average_count").desc()) \
    .limit(5) \
    .toPandas()

labels = top_5_average_shortterm_nationality_df['Nationality_English']
sizes = top_5_average_shortterm_nationality_df['average_count']

# barh draws the first item at the bottom, so reverse the descending order
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')
plt.title("Average Nationality Distribution in Short-Term Residents")
plt.xlabel("Count")
plt.ylabel("Nationality")
plt.tight_layout()
plt.show()
 


In [0]:
# Top 5 nationalities by average yearly long-term resident count over the
# years loaded into residents_df.
#
# Amount is first summed per (Year, Nationality_English) and only then averaged
# across years. Averaging the raw rows directly would return a per-row mean
# whenever a nationality has more than one row in a year; the two approaches
# agree when there is exactly one row per year and nationality.
yearly_longterm_nationality_df = residents_df.filter((col("Resident_Category") == "long-term"))\
    .groupBy("Year", "Nationality_English")\
    .agg(sum("Amount").alias("yearly_count"))

top_5_average_longterm_nationality_df = yearly_longterm_nationality_df\
    .groupBy("Nationality_English")\
    .agg(avg("yearly_count").alias("average_count")) \
    .orderBy(col("average_count").desc()) \
    .limit(5) \
    .toPandas()

labels = top_5_average_longterm_nationality_df['Nationality_English']
sizes = top_5_average_longterm_nationality_df['average_count']

# barh draws the first item at the bottom, so reverse the descending order
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')
plt.title("Average Nationality Distribution in Long-Term Residents")
plt.xlabel("Count")
plt.ylabel("Nationality")
plt.tight_layout()
plt.show()


#### 1-1-2. Trend

In [0]:
# Collect the three leading nationalities from each of the four rankings
# (average/current x long-term/short-term), in that order.
ranking_frames = (
    top_5_average_longterm_nationality_df,
    top_5_average_shortterm_nationality_df,
    top_5_longterm_nationality_df,
    top_5_shortterm_nationality_df,
)
top_nationalities = pd.concat(
    [ranking["Nationality_English"].iloc[:3] for ranking in ranking_frames]
)

# Keep the first occurrence of each nationality, preserving order.
countries_to_focus = top_nationalities.drop_duplicates().tolist()

In [0]:
# Yearly totals per resident category for the focus nationalities, brought
# into pandas for plotting.
residents_pd = (
    residents_df
    .filter(col("Nationality_English").isin(countries_to_focus))
    .groupBy("Year", "Resident_Category", "Nationality_English")
    .agg(sum("Amount").alias("count"))
    .withColumnRenamed("Nationality_English", "Nationality")
    .toPandas()
)

# Year x Nationality table for short-term residents; missing combinations become 0.
is_shortterm = residents_pd["Resident_Category"] == "short-term"
shortterm_pivot_df = pd.pivot_table(
    residents_pd[is_shortterm],
    index="Year",
    columns="Nationality",
    values="count",
    aggfunc="sum",
).fillna(0)
 

In [0]:
# One line per nationality showing the short-term resident count by year.
fig, ax = plt.subplots(figsize=(8, 4))
for nationality, yearly_counts in shortterm_pivot_df.items():
    ax.plot(shortterm_pivot_df.index, yearly_counts, marker='o', label=nationality)

ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.set_title('Short-Term Resident Count by Nationality')
# Place the legend just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
ax.grid(True)
fig.tight_layout()
plt.show()

단기 체류 외국인 데이터는 국적별로 명확한 계층(Tier)을 형성하고 있습니다. 특히 4위 그룹(베트남)과 5위 그룹(우즈베키스탄) 사이에는 **규모의 현격한 단절**(Clear Cut-off Point)이 관찰됩니다.

이에 따라, 유의미한 시장 규모를 형성하고 있으며 **각기 다른 성장 패턴(안정, 급성장, 신흥)을 보여주는 상위 4개국**(태국, 중국, 미국, 베트남)을 핵심 분석 대상으로 선정하여, 다양한 유형의 잠재 고객 페르소나를 도출하고자 합니다.

In [0]:
# Year x Nationality table for long-term residents; missing combinations become 0.
is_longterm = residents_pd["Resident_Category"] == "long-term"
longterm_pivot_df = pd.pivot_table(
    residents_pd[is_longterm],
    index="Year",
    columns="Nationality",
    values="count",
    aggfunc="sum",
).fillna(0)

# One line per nationality showing the long-term resident count by year.
fig, ax = plt.subplots(figsize=(8, 4))
for nationality, yearly_counts in longterm_pivot_df.items():
    ax.plot(longterm_pivot_df.index, yearly_counts, marker='o', label=nationality)

ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.set_title('Long-Term Resident Count by Nationality')
# Place the legend just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
ax.grid(True)
fig.tight_layout()
plt.show()

분석에 앞서 장기 체류 외국인의 국적별 분포를 EDA(탐색적 데이터 분석)한 결과, 상위 2개 국적인 중국과 베트남이 전체 체류자의 약 75%를 차지하며 시장을 압도적으로 점유하고 있음을 확인했습니다. (근거 1)

특히 2위인 베트남과 3위인 태국 간에는 5배가 넘는 수치 차이를 보여, 3순위 이하 그룹들은 시장 전체 트렌드에 미치는 영향이 미미한 것으로 판단됩니다. (근거 2)

이처럼 데이터가 소수 그룹에 크게 편중된 편향 분포를 고려하여, 본 분석에서는 가장 큰 시장을 형성하고 있는 중국과 베트남을 핵심 분석 대상으로 한정하여 페르소나를 구체화하고, 이를 통해 가장 효과적인 초기 시장 진입 전략을 도출하고자 합니다.

### 1-2. Language

Current

In [0]:
# Primary_Language is split on commas, so a row listing several languages
# contributes its Amount to each of them.
language_split_df = residents_df.withColumn(
    "language_list", split(col("Primary_Language"), ',')
)

# One row per (resident row, language), with surrounding whitespace stripped.
exploded_language_df = (
    language_split_df
    .withColumn("language", explode(col("language_list")))
    .withColumn("language", trim(col("language")))
)

# Total Amount per language, year and resident category.
language_aggregated_df = (
    exploded_language_df
    .groupBy(col("language"), col("Year"), col("Resident_Category"))
    .agg(sum("Amount").alias("count"))
)

In [0]:
# Top 5 primary languages among short-term residents in the latest year, drawn
# as a horizontal bar chart with the largest group on top.
top_5_shortterm_language_df = language_aggregated_df.filter((col("Resident_Category") == "short-term") & (col("Year") == max_year)).orderBy(col("count").desc()).limit(5).toPandas()

labels = top_5_shortterm_language_df["language"]
sizes = top_5_shortterm_language_df["count"]

# barh draws the first item at the bottom, so reverse the descending order
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    # .0f keeps the label a whole number even when the bar width is a float,
    # matching the nationality charts
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')

plt.title("Language Distribution in Short-Term Residents")
plt.xlabel("Count")
plt.ylabel("Language")
plt.xticks(rotation=30)
# tight_layout goes last so it accounts for the rotated tick labels
plt.tight_layout()
plt.show()


This graph shows the top 5 languages spoken by short-term foreign residents in Korea in 2024.  

**English** is the most commonly used (154,238), followed by **Thai** (145,305), **Mandarin Chinese** (122,475), Vietnamese (44,354), and Japanese (36,353).  

Together, the top three languages account for about 84% of the combined top-five count.

In [0]:
# Top 5 primary languages among long-term residents in the latest year, drawn
# as a horizontal bar chart with the largest group on top.
top_5_longterm_language_df = language_aggregated_df.filter((col("Resident_Category") == "long-term") & (col("Year") == max_year)).orderBy(col("count").desc()).limit(5).toPandas()

labels = top_5_longterm_language_df["language"]
sizes = top_5_longterm_language_df["count"]

# barh draws the first item at the bottom, so reverse the descending order
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    # .0f keeps the label a whole number even when the bar width is a float,
    # matching the nationality charts
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')

plt.title("Language Distribution in Long-Term Residents")
plt.xlabel("Count")
plt.ylabel("Language")
plt.xticks(rotation=30)
# tight_layout goes last so it accounts for the rotated tick labels
plt.tight_layout()
plt.show()


This graph shows the top 5 languages spoken by long-term foreign residents in Korea in 2024.  

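Unlike in the short-term analysis, **Mandarin Chinese** is the most commonly used language among long-term foreign residents (TODO: the count given here, 122,475, is identical to the short-term figure and lower than the second-place value, so it must be re-checked against the chart). **Vietnamese** (261,581) ranks second, followed by English (146,510), Nepali (72,151), and Uzbek (61,733).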

The top three languages dominate about 87% of the total.

Average


In [0]:
# Top 5 primary languages by average yearly short-term resident count.
# language_aggregated_df already holds one total per (language, Year,
# Resident_Category), so avg("count") here is an average across years.
top_5_average_shortterm_language_df = language_aggregated_df.filter((col("Resident_Category") == "short-term"))\
    .groupBy("language")\
    .agg(avg("count").alias("average_count")) \
    .orderBy(col("average_count").desc()) \
    .limit(5) \
    .toPandas()

labels = top_5_average_shortterm_language_df['language']
sizes = top_5_average_shortterm_language_df['average_count']

# barh draws the first item at the bottom, so reverse the descending order
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    # averages are floats; .0f rounds the label to a whole number instead of
    # printing every decimal place
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')

# set the title before tight_layout so the layout reserves space for it
plt.title("Average Language Distribution in Short-Term Residents")
plt.xlabel("Count")
plt.ylabel("Language")
plt.tight_layout()
plt.show()


In [0]:
# Top 5 primary languages by average yearly long-term resident count.
# language_aggregated_df already holds one total per (language, Year,
# Resident_Category), so avg("count") here is an average across years.
top_5_average_longterm_language_df = language_aggregated_df.filter((col("Resident_Category") == "long-term"))\
    .groupBy("language")\
    .agg(avg("count").alias("average_count")) \
    .orderBy(col("average_count").desc()) \
    .limit(5) \
    .toPandas()

labels = top_5_average_longterm_language_df['language']
sizes = top_5_average_longterm_language_df['average_count']

# barh draws the first item at the bottom, so reverse the descending order
labels = labels[::-1]
sizes = sizes[::-1]

plt.figure(figsize = (6,4))
bars = plt.barh(labels, sizes)

# add margin to the right so the value labels fit inside the axes
plt.xlim(0, sizes.max() * 1.25)

for bar in bars:
    width = bar.get_width()
    # averages are floats; .0f rounds the label to a whole number instead of
    # printing every decimal place
    plt.text(width, bar.get_y() + bar.get_height()/2,
             f'{width:,.0f}', ha='left', va='center')

# set the title before tight_layout so the layout reserves space for it
plt.title("Average Language Distribution in Long-Term Residents")
plt.xlabel("Count")
plt.ylabel("Language")
plt.tight_layout()
plt.show()


Trend

In [0]:
# Collect the three leading languages from each of the four rankings
# (average/current x long-term/short-term), in that order.
language_rankings = (
    top_5_average_longterm_language_df,
    top_5_average_shortterm_language_df,
    top_5_longterm_language_df,
    top_5_shortterm_language_df,
)
top_languages = pd.concat(
    [ranking["language"].iloc[:3] for ranking in language_rankings]
)

# Keep the first occurrence of each language, preserving order.
languages_to_focus = top_languages.drop_duplicates().tolist()

In [0]:
# Yearly totals per resident category for the focus languages, brought into
# pandas for plotting.
residents_languages_pd = (
    language_aggregated_df
    .filter(col("language").isin(languages_to_focus))
    .groupBy("Year", "Resident_Category", "language")
    .agg(sum("count").alias("count"))
    .toPandas()
)

# Year x language table for short-term residents; missing combinations become 0.
is_shortterm_language = residents_languages_pd["Resident_Category"] == "short-term"
shortterm_language_pivot_df = pd.pivot_table(
    residents_languages_pd[is_shortterm_language],
    index="Year",
    columns="language",
    values="count",
    aggfunc="sum",
).fillna(0)
 

In [0]:
# One line per language showing the short-term resident count by year.
fig, ax = plt.subplots(figsize=(8, 4))
for language, yearly_counts in shortterm_language_pivot_df.items():
    ax.plot(shortterm_language_pivot_df.index, yearly_counts, marker='o', label=language)

ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.set_title('Short-Term Resident Count by Primary Language')
# Place the legend just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
ax.grid(True)
fig.tight_layout()
plt.show()

In [0]:
# Year x language table for long-term residents; missing combinations become 0.
is_longterm_language = residents_languages_pd["Resident_Category"] == "long-term"
longterm_language_pivot_df = pd.pivot_table(
    residents_languages_pd[is_longterm_language],
    index="Year",
    columns="language",
    values="count",
    aggfunc="sum",
).fillna(0)

# One line per language showing the long-term resident count by year.
fig, ax = plt.subplots(figsize=(8, 4))
for language, yearly_counts in longterm_language_pivot_df.items():
    ax.plot(longterm_language_pivot_df.index, yearly_counts, marker='o', label=language)

ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.set_title('Long-Term Resident Count by Primary Language')
# Place the legend just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
ax.grid(True)
fig.tight_layout()
plt.show()

시계열 분석: nationality, long term/ short term.
=> 집중해야 할 유저 그룹 선택

이 다음, country 별 보다는 language로 통합하자

## 2. Foreign Arrivals

In [0]:
%sql
-- Preview the first 100 rows of the visitors table.
SELECT *
FROM workspace.growth_poc.silver_visitors
LIMIT 100

In [0]:
# Load visitor records for the latest year in the table and the four years
# before it, excluding rows whose English_Nationality equals 'undefined' in any
# letter case, then run the standard exploratory checks.
query = """
select *
from workspace.growth_poc.silver_visitors
where year >= (select max(year)-4 from workspace.growth_poc.silver_visitors)
    AND lower(English_Nationality) <> 'undefined'
"""
# arr_np_df is short for "arrivals_nationalitypurpose_df"
arr_np_df = spark.sql(query)

# 1. check schema
arr_np_df.printSchema()
print()
# 2. check column names
print(arr_np_df.columns)
print()
# 3. check data types
print(arr_np_df.dtypes)
print()
# 4. check number of rows
print(f"Total number of rows: {arr_np_df.count()}")
print()
# 5. check summary statistics
arr_np_df.describe().show()

# 6. check missing values
# Spark reports integral columns as tinyint/smallint/int/bigint and decimals as
# "decimal(p,s)"; matching only double/float/bigint would silently skip the rest.
float_types = ('double', 'float')
integral_types = ('tinyint', 'smallint', 'int', 'bigint')
numeric_cols = [
    name for name, dtype in arr_np_df.dtypes
    if dtype in float_types + integral_types or dtype.startswith('decimal')
]
# NaN can only occur in float/double columns, so isnan() is applied only there.
float_cols = {name for name, dtype in arr_np_df.dtypes if dtype in float_types}

# numeric columns: count null (and NaN) values for all columns in one Spark job
if numeric_cols:
    null_counts = arr_np_df.select([
        count(when((col(c).isNull() | isnan(c)) if c in float_cols else col(c).isNull(), c)).alias(c)
        for c in numeric_cols
    ]).collect()[0]
    for c in numeric_cols:
        print(f"{c}: {null_counts[c]} nulls")



In [0]:
# Load visitor-by-age-group records for the latest visit_year in the table and
# the four years before it, excluding rows whose English_Nationality equals
# 'undefined' in any letter case, then run the standard exploratory checks.
query = """
select *
from workspace.growth_poc.silver_visitors_agegroup_nationality
where visit_year >= (select max(visit_year)-4 from workspace.growth_poc.silver_visitors_agegroup_nationality)
    AND lower(English_Nationality) <> 'undefined'
"""
# arr_na_df is short for "arrivals_nationalityagegroup_df"
arr_na_df = spark.sql(query)

# 1. check schema
arr_na_df.printSchema()
print()
# 2. check column names
print(arr_na_df.columns)
print()
# 3. check data types
print(arr_na_df.dtypes)
print()
# 4. check number of rows
print(f"Total number of rows: {arr_na_df.count()}")
print()
# 5. check summary statistics
arr_na_df.describe().show()

# 6. check missing values
# Spark reports integral columns as tinyint/smallint/int/bigint and decimals as
# "decimal(p,s)"; matching only double/float/bigint would silently skip the rest.
float_types = ('double', 'float')
integral_types = ('tinyint', 'smallint', 'int', 'bigint')
numeric_cols = [
    name for name, dtype in arr_na_df.dtypes
    if dtype in float_types + integral_types or dtype.startswith('decimal')
]
# NaN can only occur in float/double columns, so isnan() is applied only there.
float_cols = {name for name, dtype in arr_na_df.dtypes if dtype in float_types}

# numeric columns: count null (and NaN) values for all columns in one Spark job
if numeric_cols:
    null_counts = arr_na_df.select([
        count(when((col(c).isNull() | isnan(c)) if c in float_cols else col(c).isNull(), c)).alias(c)
        for c in numeric_cols
    ]).collect()[0]
    for c in numeric_cols:
        print(f"{c}: {null_counts[c]} nulls")



### 2-1. Current Status

### 2-2. Trend Analysis

In [0]:
# 	How many foreigners are currently residing in South Korea in total?
# 	What is the population distribution by nationality, age, gender, and residence status (student, employment, residency, etc.)?
# 	How has the number of foreign residents increased or decreased over the past five years?





In [0]:
%sql
-- Preview the first 10 rows of the visitors-by-age-group-and-nationality table.
SELECT *
FROM workspace.growth_poc.silver_visitors_agegroup_nationality
LIMIT 10

In [0]:
%sql
-- App reviews whose written_language is 'so'.
SELECT *
FROM workspace.growth_poc.silver_app_reviews
WHERE written_language = 'so'
 

In [0]:
%sql
-- Preview only: cap the result so the cell does not pull the whole table into
-- the notebook output (the other preview cells are capped the same way).
select *
from workspace.growth_poc.silver_visitors
limit 100

In [0]:
%sql
-- Preview the first 10 rows of the visitors-by-age-group table.
SELECT *
FROM workspace.growth_poc.silver_visitors_agegroup
LIMIT 10

In [0]:
# %md
# 2.	Core user group segmentation (Segmentation):
# o	Key questions:
# 	How large is the foreign population in the age groups most likely to be the main delivery-app users (e.g., people in their 20s to 40s)?
# 	Do population groups of particular nationalities (e.g., English-speaking, Chinese-speaking) account for an especially large share?
# 	How large are the groups expected to be comfortable with IT devices and to have purchasing power (e.g., D-2 international students, E-7 professionals)?
