In [1]:
# ============================================
# 0. 환경 설정 (Drive + 폰트)
# ============================================
from google.colab import drive
drive.mount("/content/drive")

import os
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm
from IPython.display import display

# 나눔고딕 설치 (Colab 전용)
subprocess.run(
    ["apt-get", "-y", "install", "fonts-nanum"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

fontpath = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
fm.fontManager.addfont(fontpath)
plt.rcParams["font.family"] = "NanumGothic"
plt.rcParams["axes.unicode_minus"] = False
sns.set(style="whitegrid", font="NanumGothic")

# 1) Colab 환경인지 판단
if os.path.exists("/content/drive/MyDrive/빅데이터프로젝트/"):
    base_path = "/content/drive/MyDrive/빅데이터프로젝트/"
else:
    # 2) 깃허브 / 로컬 환경에서는 프로젝트 루트 기준 ./data/ 사용
    base_path = "./data/"

# ============================================
# 1. 원본 데이터 로드
# ============================================
crime_region = pd.read_csv(base_path + "crime_region_2020_2024.csv", encoding="utf-8-sig")
crime_rate   = pd.read_csv(base_path + "crime_rate_region_2020_2024.csv", encoding="utf-8-sig")
victim_gender = pd.read_csv(base_path + "victim_gender_trend.csv", encoding="utf-8-sig")
victim_age    = pd.read_csv(base_path + "victim_age_trend_2020_2024.csv", encoding="utf-8-sig")
pop           = pd.read_excel(base_path + "고령인구비율_시도_시_군_구.xlsx", skiprows=1)
victim_senior = pd.read_excel(base_path + "victim_senior_by_crimetype_final.xlsx")

print("crime_region:", crime_region.shape)
print("crime_rate  :", crime_rate.shape)
print("victim_gender:", victim_gender.shape)
print("victim_age   :", victim_age.shape)
print("pop          :", pop.shape)
print("victim_senior:", victim_senior.shape)

# ============================================
# 2. 통계청 인구 데이터 전처리 (pop)
# ============================================
pop = pop.iloc[:, :4].copy()
pop.columns = ["지역", "고령인구비율(%)", "65세이상인구", "전체인구"]

pop["지역"] = (
    pop["지역"]
    .astype(str)
    .str.replace("특별시|광역시|특별자치시|특별자치도", "", regex=True)
    .str.replace(" ", "")
)

for col in ["65세이상인구", "전체인구"]:
    pop[col] = (
        pop[col]
        .astype(str)
        .str.replace(",", "")
        .replace("", np.nan)
        .astype(float)
        .astype("Int64")
    )

pop["고령인구비율(%)"] = pd.to_numeric(pop["고령인구비율(%)"], errors="coerce")

print("\n[정제된 인구 데이터 예시]")
display(pop.head())

# ============================================
# 3. 경찰청 지역 데이터 전처리 (crime_region, crime_rate)
# ============================================
for df in [crime_region, crime_rate]:
    df["지역"] = df["지역"].astype(str).str.replace(" ", "")

crime_rate.columns = [c.replace(" ", "") for c in crime_rate.columns]

print("\ncrime_rate 컬럼:", crime_rate.columns)

# ============================================
# 4. 2024년 기준 지역 데이터 병합 (df_region) + CSV 저장
# ============================================
df_region = crime_rate[["지역", "2024_발생비"]].merge(
    pop[["지역", "고령인구비율(%)", "65세이상인구", "전체인구"]],
    on="지역",
    how="inner",
)

df_region["2024_발생비"] = pd.to_numeric(df_region["2024_발생비"], errors="coerce").round(1)
df_region = df_region.dropna(subset=["2024_발생비", "고령인구비율(%)"]).reset_index(drop=True)

print("\n[2024년 병합 결과 미리보기]")
display(df_region.head())

print("\n결측치 확인")
display(df_region.isna().sum())

print("\n지역 데이터 기본 통계")
display(df_region.describe())

df_region.to_csv(
    base_path + "merged_population_crime_2024.csv",
    index=False,
    encoding="utf-8-sig",
)
print("\nmerged_population_crime_2024.csv 저장 완료")

# ============================================
# 5. 죄종·성별별 65세 이상 피해자 전처리 (victim_senior) + CSV 저장
# ============================================
victim_senior = victim_senior.copy()
victim_senior.columns = victim_senior.columns.str.replace(" ", "")

num_cols = ["계", "남자_소계", "남자_65세이상", "여자_소계", "여자_65세이상"]
for col in num_cols:
    victim_senior[col] = pd.to_numeric(victim_senior[col], errors="coerce")

victim_senior["총_65세이상피해자수"] = (
    victim_senior["남자_65세이상"].fillna(0)
    + victim_senior["여자_65세이상"].fillna(0)
)

victim_senior["남성_고령비율(%)"] = (
    victim_senior["남자_65세이상"] / victim_senior["남자_소계"] * 100
).round(2)

victim_senior["여성_고령비율(%)"] = (
    victim_senior["여자_65세이상"] / victim_senior["여자_소계"] * 100
).round(2)

victim_senior["전체_고령비율(%)"] = (
    victim_senior["총_65세이상피해자수"] / victim_senior["계"] * 100
).round(2)

print("\n[65세 이상 피해자 비율 계산 결과]")
display(victim_senior.head())

victim_senior.to_csv(
    base_path + "victim_senior_processed.csv",
    index=False,
    encoding="utf-8-sig",
)
print("\nvictim_senior_processed.csv 저장 완료")


Mounted at /content/drive
▶ 프로젝트 폴더 목록: ['고령인구비율_시도_시_군_구.xlsx', 'crime_region_2020_2024.xlsx', 'crime_region_2020_2024.csv', 'crime_rate_region_2020_2024.csv', 'crime_rate_region_2020_2024.xlsx', 'victim_gender_trend.xlsx', 'victim_gender_trend.csv', 'victim_age_trend_2020_2024.csv', 'victim_age_trend_2020_2024.xlsx', 'victim_senior_by_crimetype_final.csv', 'victim_senior_by_crimetype_final.xlsx', 'figs', 'merged_population_crime_2024.csv', 'victim_senior_processed.csv', 'victim_senior_summary_성별별_전체비율.csv']


  warn("Workbook contains no default style, apply openpyxl's default")


crime_region: (18, 11)
crime_rate  : (17, 6)
victim_gender: (32, 12)
victim_age   : (8, 11)
pop          : (18, 4)
victim_senior: (38, 6)

[정제된 인구 데이터 예시]


Unnamed: 0,지역,고령인구비율(%),65세이상인구,전체인구
0,전국,20.0,10256782,51217221
1,서울,19.4,1813648,9331828
2,부산,23.9,780576,3266598
3,대구,20.9,493256,2363629
4,인천,17.7,533369,3021010



crime_rate 컬럼: Index(['지역', '2020_발생비', '2021_발생비', '2022_발생비', '2023_발생비', '2024_발생비'], dtype='object')

[2024년 병합 결과 미리보기]


Unnamed: 0,지역,2024_발생비,고령인구비율(%),65세이상인구,전체인구
0,서울,3238.1,19.4,1813648,9331828
1,부산,3436.0,23.9,780576,3266598
2,대구,2991.3,20.9,493256,2363629
3,인천,3059.2,17.7,533369,3021010
4,광주,3241.6,17.5,246980,1408422



결측치 확인


Unnamed: 0,0
지역,0
2024_발생비,0
고령인구비율(%),0
65세이상인구,0
전체인구,0



지역 데이터 기본 통계


Unnamed: 0,2024_발생비,고령인구비율(%),65세이상인구,전체인구
count,11.0,11.0,11.0,11.0
mean,3137.3,19.618182,482935.909091,2386018.363636
std,511.962846,4.088721,488267.417131,2470470.024426
min,2233.3,11.6,45301.0,390685.0
25%,2936.4,17.6,217841.0,1253235.5
50%,3059.2,18.9,384970.0,1517766.0
75%,3260.6,22.4,513312.5,2692319.5
max,4343.7,25.4,1813648.0,9331828.0



merged_population_crime_2024.csv 저장 완료

[65세 이상 피해자 비율 계산 결과]


Unnamed: 0,죄종,계,남자_소계,남자_65세이상,여자_소계,여자_65세이상,총_65세이상피해자수,남성_고령비율(%),여성_고령비율(%),전체_고령비율(%)
0,살인기수,274,134.0,19.0,140.0,39.0,58.0,14.18,27.86,21.17
1,살인미수등,498,302.0,51.0,196.0,49.0,100.0,16.89,25.0,20.08
2,강도,457,270.0,33.0,187.0,47.0,80.0,12.22,25.13,17.51
3,강간,4777,37.0,2.0,4740.0,74.0,76.0,5.41,1.56,1.59
4,유사강간,897,99.0,,798.0,15.0,15.0,,1.88,1.67



victim_senior_processed.csv 저장 완료
