# User Group Analysis
To identify the core target groups, I analyzed foreign-resident data and visitor data.

## 1. Foreign residents


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, col, isnan, max, sum, explode, split, trim, avg, lag
import matplotlib.pyplot as plt
import pandas as pd

In [0]:
# Obtain the Spark session for this analysis (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName("analysis_user_group")
    .getOrCreate()
)

In [0]:
# Load resident rows for the five most recent years present in the table
# (max(year)-4 .. max(year)), excluding rows whose Nationality is 'undefined'
# (case-insensitive).
query = """
select *
from workspace.growth_poc.silver_residents
where year >= (select max(year)-4 from workspace.growth_poc.silver_residents)
    AND lower(Nationality) <> 'undefined'
"""
residents_df = spark.sql(query)

In [0]:
# Quick sanity checks on the loaded residents data.
# 1. check schema
residents_df.printSchema()
print()
# 2. check column names
print(residents_df.columns)
print()
# 3. check data type
print(residents_df.dtypes)
print()
# 4. check number of rows
print(f"Total number of rows: {residents_df.count()}")
print()
# 5. check statistical status
residents_df.describe().show()

# 6. check missing values
# get numeric columns (integer-typed columns are included so that an `int`
# column is not silently skipped by the check)
numeric_cols = [
    name for name, dtype in residents_df.dtypes
    if dtype in ('double', 'float', 'bigint', 'int', 'smallint', 'tinyint')
]
# numeric columns: count null / NaN values for all columns in a single Spark job
# instead of launching one job per column
if numeric_cols:
    null_counts = residents_df.select(
        [count(when(col(c).isNull() | isnan(col(c)), c)).alias(c) for c in numeric_cols]
    ).collect()[0]
    for c in numeric_cols:
        print(f"{c}: {null_counts[c]} nulls")



### 1-1. Nationality


In [0]:
# NOTE: `max` and `sum` below are the pyspark.sql.functions versions imported at the
# top of the notebook, not the Python builtins.

# Latest year available in the loaded data; "current" figures refer to this year.
max_year = residents_df.agg(max("Year")).collect()[0][0]
current_residents_df = residents_df.filter(col("Year") == max_year)

# Sum Amount per resident category in one Spark job (instead of one job per category).
totals_by_category = {
    row["Resident_Category"]: row["total"]
    for row in current_residents_df.groupBy("Resident_Category")
    .agg(sum("Amount").alias("total"))
    .collect()
}
# A category with no rows (or only NULL amounts) is reported as 0, so the grand
# total below cannot fail with `None + None`.
total_shortterm_residents = totals_by_category.get("short-term") or 0
total_longterm_residents = totals_by_category.get("long-term") or 0

print(f"--Number of Foreign Residents in {max_year}--")
print(f"Short term: {total_shortterm_residents}")
print(f"Long term: {total_longterm_residents}")
print(f"Total: {total_shortterm_residents + total_longterm_residents}")

#### 1-1-1. Current

Current: top-5 nationalities in the most recent year.

In [0]:
# Top 5 nationalities among short-term residents in the latest year.
top_5_shortterm_nationality_df = (
    current_residents_df
    .filter(col("Resident_Category") == "short-term")
    .groupBy("Nationality_English")
    .agg(sum("Amount").alias("Count"))
    .orderBy(col("Count").desc())
    .limit(5)
    .toPandas()
)
# (A bare `top_5_shortterm_nationality_df` expression used to sit here; it is not the
# last statement of the cell, so it displayed nothing and has been dropped.)

labels = top_5_shortterm_nationality_df['Nationality_English']
sizes = top_5_shortterm_nationality_df['Count']

# The pie normalises the five values, so the percentages are shares within the
# top 5 only, not shares of all short-term residents.
plt.figure(figsize=(4, 4))
plt.pie(sizes, labels=labels, autopct='%.1f%%', startangle=90)
plt.title("Nationality Distribution in Short-Term Residents")
plt.axis("equal")
plt.show()


In [0]:
# Top 5 nationalities among long-term residents in the latest year.
top_5_longterm_nationality_df = (
    current_residents_df
    .filter(col("Resident_Category") == "long-term")
    .groupBy("Nationality_English")
    .agg(sum("Amount").alias("Count"))
    .orderBy(col("Count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_longterm_nationality_df['Nationality_English']
sizes = top_5_longterm_nationality_df['Count']

# Percentages shown are shares within the five plotted nationalities.
fig, ax = plt.subplots(figsize=(4, 4))
ax.pie(sizes, labels=labels, autopct='%.1f%%', startangle=90)
ax.set_title("Nationality Distribution in Long-Term Residents")
ax.axis("equal")
plt.show()


Average: top-5 nationalities by average count over the selected years.

In [0]:
# Top 5 nationalities among short-term residents by average *yearly* total.
# Amount is first summed per nationality and year, then averaged across years.
# Averaging the raw rows directly would give the mean row value, which depends on
# how many rows each nationality has per year; the language section of this
# notebook already averages per-year totals in the same way.
# Years in which a nationality has no rows are not counted as zero.
top_5_average_shortterm_nationality_df = (
    residents_df
    .filter(col("Resident_Category") == "short-term")
    .groupBy("Nationality_English", "Year")
    .agg(sum("Amount").alias("yearly_count"))
    .groupBy("Nationality_English")
    .agg(avg("yearly_count").alias("average_count"))
    .orderBy(col("average_count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_average_shortterm_nationality_df['Nationality_English']
sizes = top_5_average_shortterm_nationality_df['average_count']

# Percentages shown are shares within the five plotted nationalities.
plt.figure(figsize=(4, 4))
plt.pie(sizes, labels=labels, autopct='%.1f%%', startangle=90)
plt.title("Average Nationality Distribution in Short-Term Residents")
plt.axis("equal")
plt.show()


In [0]:
# Top 5 nationalities among long-term residents by average *yearly* total.
# Amount is first summed per nationality and year, then averaged across years.
# Averaging the raw rows directly would give the mean row value, which depends on
# how many rows each nationality has per year; the language section of this
# notebook already averages per-year totals in the same way.
# Years in which a nationality has no rows are not counted as zero.
top_5_average_longterm_nationality_df = (
    residents_df
    .filter(col("Resident_Category") == "long-term")
    .groupBy("Nationality_English", "Year")
    .agg(sum("Amount").alias("yearly_count"))
    .groupBy("Nationality_English")
    .agg(avg("yearly_count").alias("average_count"))
    .orderBy(col("average_count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_average_longterm_nationality_df['Nationality_English']
sizes = top_5_average_longterm_nationality_df['average_count']

# Percentages shown are shares within the five plotted nationalities.
plt.figure(figsize=(4, 4))
plt.pie(sizes, labels=labels, autopct='%.1f%%', startangle=90)
plt.title("Average Nationality Distribution in Long-Term Residents")
plt.axis("equal")
plt.show()


#### 1-1-2. Trend

In [0]:
# Stack the nationality columns of the four top-5 tables (average and current,
# long-term and short-term) into a single Series.
top_nationalities = pd.concat(
    [
        top5_frame["Nationality_English"]
        for top5_frame in (
            top_5_average_longterm_nationality_df,
            top_5_average_shortterm_nationality_df,
            top_5_longterm_nationality_df,
            top_5_shortterm_nationality_df,
        )
    ]
)

# Keep each nationality once, in order of first appearance.
countries_to_focus = top_nationalities.drop_duplicates().tolist()

In [0]:
# Yearly totals per resident category for the focus nationalities, brought to pandas.
residents_pd = (
    residents_df
    .filter(col("Nationality_English").isin(countries_to_focus))
    .groupBy("Year", "Resident_Category", "Nationality_English")
    .agg(sum("Amount").alias("count"))
    .withColumnRenamed("Nationality_English", "Nationality")
    .toPandas()
)

# Year x Nationality matrix for short-term residents; missing combinations become 0.
shortterm_pivot_df = (
    residents_pd.loc[residents_pd["Resident_Category"] == "short-term"]
    .pivot_table(index="Year", columns="Nationality", values="count", aggfunc="sum")
    .fillna(0)
)
 

In [0]:
# One line per focus nationality: short-term resident count per year.
plt.figure(figsize=(8, 4))
for nationality in shortterm_pivot_df.columns:
    plt.plot(shortterm_pivot_df.index, shortterm_pivot_df[nationality], marker='o', label=nationality)

# Years are discrete: pin one tick per year so a numeric Year axis does not get
# automatically placed ticks at fractional years.
plt.xticks(shortterm_pivot_df.index)
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Short-Term Resident Count by Nationality')
# Legend is placed outside the axes, to the right of the plot.
plt.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
plt.grid(True)
plt.tight_layout()
plt.show()

In [0]:
# Year x Nationality matrix for long-term residents; missing combinations become 0.
longterm_pivot_df = residents_pd[residents_pd["Resident_Category"] == "long-term"].pivot_table(
    index="Year",
    columns="Nationality",
    values="count",
    aggfunc="sum"
).fillna(0)

# One line per focus nationality: long-term resident count per year.
plt.figure(figsize=(8, 4))
for nationality in longterm_pivot_df.columns:
    plt.plot(longterm_pivot_df.index, longterm_pivot_df[nationality], marker='o', label=nationality)

# Years are discrete: pin one tick per year so a numeric Year axis does not get
# automatically placed ticks at fractional years.
plt.xticks(longterm_pivot_df.index)
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Long-Term Resident Count by Nationality')
# Legend is placed outside the axes, to the right of the plot.
plt.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
plt.grid(True)
plt.tight_layout()
plt.show()

### 1-2. Language

Current

In [0]:
# Primary_Language is split on commas into an array column.
language_split_df = residents_df.withColumn(
    "language_list", split(col("Primary_Language"), ',')
)

# One output row per listed language, with surrounding whitespace trimmed.
# A row listing several languages therefore contributes its Amount to each of them.
exploded_language_df = (
    language_split_df
    .withColumn("language", explode(col("language_list")))
    .withColumn("language", trim(col("language")))
)

# Total Amount per language, year and resident category.
language_aggregated_df = (
    exploded_language_df
    .groupBy("language", "Year", "Resident_Category")
    .agg(sum("Amount").alias("count"))
)

In [0]:
# Top 5 languages among short-term residents in the latest year.
top_5_shortterm_language_df = (
    language_aggregated_df
    .filter((col("Resident_Category") == "short-term") & (col("Year") == max_year))
    .orderBy(col("count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_shortterm_language_df["language"]
sizes = top_5_shortterm_language_df["count"]

# Percentages shown are shares within the five plotted languages.
fig, ax = plt.subplots(figsize=(4, 4))
ax.pie(sizes, labels=labels, autopct='%.1f%%', startangle=90)
ax.set_title("Language Distribution in Short-Term Residents")
ax.axis("equal")
plt.show()


In [0]:
# Top 5 languages among long-term residents in the latest year.
top_5_longterm_language_df = (
    language_aggregated_df
    .filter((col("Resident_Category") == "long-term") & (col("Year") == max_year))
    .orderBy(col("count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_longterm_language_df["language"]
sizes = top_5_longterm_language_df["count"]

# Percentages shown are shares within the five plotted languages.
fig, ax = plt.subplots(figsize=(4, 4))
ax.pie(sizes, labels=labels, autopct='%.1f%%', startangle=90)
ax.set_title("Language Distribution in Long-Term Residents")
ax.axis("equal")
plt.show()


Average


In [0]:
# Top 5 languages among short-term residents by average yearly count.
# language_aggregated_df already holds one total per language, year and category,
# so avg("count") here is the mean of the yearly totals.
top_5_average_shortterm_language_df = (
    language_aggregated_df
    .filter(col("Resident_Category") == "short-term")
    .groupBy("language")
    .agg(avg("count").alias("average_count"))
    .orderBy(col("average_count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_average_shortterm_language_df['language']
sizes = top_5_average_shortterm_language_df['average_count']

# Percentages shown are shares within the five plotted languages.
fig, ax = plt.subplots(figsize=(4, 4))
ax.pie(sizes, labels=labels, autopct='%.1f%%', startangle=90)
ax.set_title("Average Language Distribution in Short-Term Residents")
ax.axis("equal")
plt.show()


In [0]:
# Top 5 languages among long-term residents by average yearly count.
# language_aggregated_df already holds one total per language, year and category,
# so avg("count") here is the mean of the yearly totals.
top_5_average_longterm_language_df = (
    language_aggregated_df
    .filter(col("Resident_Category") == "long-term")
    .groupBy("language")
    .agg(avg("count").alias("average_count"))
    .orderBy(col("average_count").desc())
    .limit(5)
    .toPandas()
)

labels = top_5_average_longterm_language_df['language']
sizes = top_5_average_longterm_language_df['average_count']

# Percentages shown are shares within the five plotted languages.
fig, ax = plt.subplots(figsize=(4, 4))
ax.pie(sizes, labels=labels, autopct='%.1f%%', startangle=90)
ax.set_title("Average Language Distribution in Long-Term Residents")
ax.axis("equal")
plt.show()


Trend

In [0]:
# Stack the language columns of the four top-5 tables (average and current,
# long-term and short-term) into a single Series.
top_languages = pd.concat(
    [
        top5_frame["language"]
        for top5_frame in (
            top_5_average_longterm_language_df,
            top_5_average_shortterm_language_df,
            top_5_longterm_language_df,
            top_5_shortterm_language_df,
        )
    ]
)

# Keep each language once, in order of first appearance.
languages_to_focus = top_languages.drop_duplicates().tolist()

In [0]:
# Show the de-duplicated list of languages selected for the trend analysis.
print(languages_to_focus)

In [0]:
# Yearly totals per resident category for the focus languages, brought to pandas.
residents_languages_pd = (
    language_aggregated_df
    .filter(col("language").isin(languages_to_focus))
    .groupBy("Year", "Resident_Category", "language")
    .agg(sum("count").alias("count"))
    .toPandas()
)

# Year x language matrix for short-term residents; missing combinations become 0.
shortterm_language_pivot_df = (
    residents_languages_pd.loc[residents_languages_pd["Resident_Category"] == "short-term"]
    .pivot_table(index="Year", columns="language", values="count", aggfunc="sum")
    .fillna(0)
)
 

In [0]:
# Preview the first rows of the Year x language pivot for short-term residents.
shortterm_language_pivot_df.head()


In [0]:
# One line per focus language: short-term resident count per year.
plt.figure(figsize=(8, 4))
for language in shortterm_language_pivot_df.columns:
    plt.plot(shortterm_language_pivot_df.index, shortterm_language_pivot_df[language], marker='o', label=language)

# Years are discrete: pin one tick per year so a numeric Year axis does not get
# automatically placed ticks at fractional years.
plt.xticks(shortterm_language_pivot_df.index)
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Short-Term Resident Count by Primary Language')
# Legend is placed outside the axes, to the right of the plot.
plt.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
plt.grid(True)
plt.tight_layout()
plt.show()

In [0]:
# Year x language matrix for long-term residents; missing combinations become 0.
longterm_language_pivot_df = residents_languages_pd[residents_languages_pd["Resident_Category"] == "long-term"].pivot_table(
    index="Year",
    columns="language",
    values="count",
    aggfunc="sum"
).fillna(0)

# One line per focus language: long-term resident count per year.
plt.figure(figsize=(8, 4))
for language in longterm_language_pivot_df.columns:
    plt.plot(longterm_language_pivot_df.index, longterm_language_pivot_df[language], marker='o', label=language)

# Years are discrete: pin one tick per year so a numeric Year axis does not get
# automatically placed ticks at fractional years.
plt.xticks(longterm_language_pivot_df.index)
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Long-Term Resident Count by Primary Language')
# Legend is placed outside the axes, to the right of the plot.
plt.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
plt.grid(True)
plt.tight_layout()
plt.show()

Time-series analysis by nationality and by long-term / short-term category.
=> Select the user groups to focus on.

Next step: consolidate the analysis by language rather than by country.

## 2. Foreign Arrivals

### 2-1. Current Status

### 2-2. Trend Analysis

In [0]:
%sql
-- Lists the distinct values of column `pu` in the residents table.
-- NOTE(review): `pu` looks like a truncated column name (possibly `purpose`, which a
-- later cell selects from silver_visitors) -- confirm the column exists in
-- silver_residents, otherwise this cell fails on "Run All".
select distinct pu
from workspace.growth_poc.silver_residents


In [0]:
%sql
-- Total Amount per resident category and nationality, largest groups first.
SELECT
  Resident_Category,
  Nationality,
  sum(Amount)
FROM workspace.growth_poc.silver_residents
GROUP BY
  Resident_Category,
  Nationality
ORDER BY sum(amount) DESC

In [0]:
%sql
-- Up to 10 distinct values of the visitors' purpose column.
SELECT DISTINCT purpose
FROM workspace.growth_poc.silver_visitors
LIMIT 10

In [0]:
# - How many foreigners are currently residing in South Korea in total?
# - What is the population distribution by nationality, age, gender and residence status (student, employment, residence, etc.)?
# - How has the number of resident foreigners increased or decreased over the past five years?





In [0]:
%sql
-- Total Amount per nationality / primary-language pair for the most recent year,
-- largest groups first. The year is taken from the table itself instead of being
-- hard-coded to 2024, so the cell keeps showing the latest year when newer data is
-- loaded (the Python cells use the table's maximum year in the same way).
select Nationality_English
    , Primary_Language
    , sum(Amount)
from workspace.growth_poc.silver_residents
where Year = (select max(Year) from workspace.growth_poc.silver_residents)
group by Nationality_English, Primary_Language
order by sum(Amount) desc

In [0]:
%sql
-- Review counts per (language, country), ordered by language and then by
-- descending count; only the first 100 rows are returned.
SELECT
  language,
  country,
  count(*)
FROM workspace.growth_poc.silver_appstore_reviews
GROUP BY language, country
ORDER BY language, count(*) DESC
LIMIT 100

In [0]:
%sql
-- Translated content and update time of reviews whose language value contains "cn"
-- (case-insensitive).
SELECT
  content_translated,
  updated
FROM workspace.growth_poc.silver_appstore_reviews
WHERE lower(language) LIKE '%cn%'

In [0]:
%sql
-- Analysed reviews whose content contains "transl" (case-insensitive).
SELECT *
FROM workspace.growth_poc.analysis_reviews
WHERE lower(review_content) LIKE '%transl%'

In [0]:
%sql
-- Analysed reviews whose language value is 'ko'.
SELECT *
FROM workspace.growth_poc.analysis_reviews
WHERE language = 'ko'

In [0]:
%sql
-- Reviews whose written_language value is 'so'.
-- NOTE(review): other cells read silver_appstore_reviews; confirm that
-- silver_app_reviews is the intended table here.
SELECT *
FROM workspace.growth_poc.silver_app_reviews
WHERE written_language = 'so'
 

In [0]:
%sql
-- Every row of the visitors table (no row limit is applied by the query).
SELECT *
FROM workspace.growth_poc.silver_visitors

In [0]:
%sql
-- Ten-row sample of the visitors-by-age-group table.
SELECT *
FROM workspace.growth_poc.silver_visitors_agegroup
LIMIT 10

In [0]:
%sql
-- Ten-row sample of the visitors table.
SELECT *
FROM workspace.growth_poc.silver_visitors
LIMIT 10

In [0]:
%sql
-- Resident rows for the five most recent years in the table (max(year)-4 .. max(year)).
SELECT *
FROM workspace.growth_poc.silver_residents
WHERE year >= (
  SELECT max(year)-4
  FROM workspace.growth_poc.silver_residents
)

In [0]:
# %md
# 2. Core user group segmentation (Segmentation):
# o Key questions:
#   - How large is the foreign population in the age groups most likely to be the main users of delivery apps (e.g. 20s-40s)?
#   - Do population groups of particular nationalities (e.g. English-speaking, Chinese-speaking) account for an especially large share?
#   - How large are the groups expected to be comfortable with IT devices and to have purchasing power (e.g. D-2 students, E-7 professionals)?
