In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Path to the Excel workbook, assembled from its folder and file name.
drive_dir = '/content/drive/MyDrive/Colab Notebooks'
excel_file_path = f'{drive_dir}/crime.xlsx'

In [3]:
# Read every sheet of the workbook; sheet_name=None returns a mapping of
# sheet name -> DataFrame.
all_sheets = pd.read_excel(excel_file_path, sheet_name=None)

# Keep a per-sheet lookup, then stack the ten sheets of interest row-wise
# (axis=0) in a fixed order, renumbering the index from 0.
sheet_dataframes = dict(all_sheets)
sheet_order = ["시간", "요일", "장소", "지역", "연령", "부모", "교육", "국적", "동기", "자백"]
frames_to_stack = [sheet_dataframes[name] for name in sheet_order]
combined_df = pd.concat(frames_to_stack, axis=0, ignore_index=True)

# Report the size of the stacked frame.
print(f"Combined DataFrame shape: {combined_df.shape}")

Combined DataFrame shape: (142, 22)


In [4]:
# Replace every missing value (NaN) with the placeholder string 'Unknown'.
combined_df = combined_df.fillna('Unknown')

In [5]:
# Drop the year-labelled columns (integer labels 2022 down to 2011);
# errors='ignore' skips any year that is not present as a column.
years_to_remove = list(range(2022, 2010, -1))
combined_df = combined_df.drop(columns=years_to_remove, errors='ignore')

# Show the resulting size and the columns that remain.
print(f"Combined DataFrame shape after removing years: {combined_df.shape}")
print(f"Remaining columns: {combined_df.columns}")

Combined DataFrame shape after removing years: (142, 10)
Remaining columns: Index(['시간', '요일', '장소', '지역', '연령', '부모', '교육', '국적', '동기', '자백'], dtype='object')


In [6]:
# Show the combined frame as this cell's output (bare last expression).
combined_df

Unnamed: 0,시간,요일,장소,지역,연령,부모,교육,국적,동기,자백
0,0-3시,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1,3-6시,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
2,6-9시,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
3,9-12시,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
4,12-15시,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...,...
137,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,기타,Unknown
138,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,자백
139,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,일부자백
140,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,부인


In [7]:
# Cast every column to str so all values share a single type.
combined_df = combined_df.astype(str)

# Row count of the real data.
num_real_samples = combined_df.shape[0]
print(f"Number of real samples: {num_real_samples}")

Number of real samples: 142


In [8]:
# Generate synthetic ("non-crime") rows.
def generate_fake_data(real_data, num_samples, random_state=None):
    """Build a synthetic frame by sampling every column independently.

    For each column of ``real_data`` the distinct values are collected and
    ``num_samples`` of them are drawn uniformly with replacement. Columns are
    sampled independently of one another, so value combinations in the result
    need not occur in ``real_data``.

    Parameters
    ----------
    real_data : pandas.DataFrame
        Source frame; it is not modified.
    num_samples : int
        Number of rows to generate. It may differ from ``len(real_data)``.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. When None, NumPy's
        global RNG is used, so ``np.random.seed`` still controls the draw.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns (same order) as ``real_data`` and
        ``num_samples`` rows. When ``num_samples == len(real_data)`` the
        index of ``real_data`` is reused; otherwise a fresh 0..n-1 index.
    """
    # The np.random module and a RandomState instance both expose `.choice`.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Draw column by column, in column order, so the stream of random numbers
    # consumed is the same as a plain for-loop over the columns.
    sampled = {
        col: rng.choice(real_data[col].unique(), num_samples)
        for col in real_data.columns
    }

    # Build a new frame rather than overwriting a copy of `real_data`:
    # assigning length-`num_samples` arrays into a copy fails whenever the
    # requested row count differs from the source row count.
    index = real_data.index if num_samples == len(real_data) else None
    return pd.DataFrame(sampled, columns=real_data.columns, index=index)

# Seed NumPy's global RNG so the synthetic rows, and every metric derived
# from them, are reproducible on a fresh kernel. 42 is the same seed that is
# passed as random_state to the train/test split and the random forest.
np.random.seed(42)

# Generate as many synthetic rows as there are real rows and label them '0'.
# NOTE(review): the real rows displayed earlier carry a single non-'Unknown'
# column each, whereas the synthetic rows sample every column independently,
# so the two classes may be separable by construction. Confirm that this is
# the intended negative class before trusting the downstream accuracy.
num_fake_samples = num_real_samples
fake_df = generate_fake_data(combined_df, num_fake_samples)
fake_df['범죄 발생 여부'] = '0'

In [9]:
# Tag the real rows with the positive label.
combined_df['범죄 발생 여부'] = '1'

# Stack the real rows on top of the synthetic ones with a fresh 0..n-1 index.
final_df = pd.concat((combined_df, fake_df), axis=0, ignore_index=True)

# Report the size of the stacked frame.
print(f"Final DataFrame shape: {final_df.shape}")

# Make sure every column label is a string.
final_df.columns = [str(label) for label in final_df.columns]

Final DataFrame shape: (284, 11)


In [10]:
# One-hot encode every feature column (all columns except the label),
# dropping the first category of each feature (drop='first').
feature_df = final_df.drop(columns=['범죄 발생 여부'])

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, where passing it raises TypeError. Prefer the new name and
# fall back to the old one only on releases that do not know it yet.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

encoded_features = encoder.fit_transform(feature_df)
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(feature_df.columns),
)

# Re-attach the label as an integer column (rows line up on the 0..n-1 index).
encoded_df['범죄 발생 여부'] = final_df['범죄 발생 여부'].astype(int)



In [11]:
# Separate the label from the features, then hold out 20% of the rows for
# testing; the split is seeded and stratified on the label.
target_col = '범죄 발생 여부'
y = encoded_df[target_col]
X = encoded_df.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [12]:
# Fit a 100-tree random forest with a fixed seed on the training split.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [13]:
# Keep the column labels of the training features (the names the model was fitted on).
feature_names = X_train.columns

In [14]:
# Show the stored feature names as this cell's output.
feature_names

Index(['시간_12-15시', '시간_15-18시', '시간_18-21시', '시간_21-24시', '시간_3-6시',
       '시간_6-9시', '시간_9-12시', '시간_Unknown', '요일_금', '요일_목',
       ...
       '동기_유혹', '동기_유흥비', '동기_치부', '동기_허영사치심', '동기_현실불만', '동기_호기심', '자백_묵비권',
       '자백_부인', '자백_일부자백', '자백_자백'],
      dtype='object', length=141)

In [15]:
# Score the fitted model on the held-out split: predict once, compute the
# three summaries, then print them.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n {conf_mat}")
print(f"Classification Report:\n {report}")

Accuracy: 0.9824561403508771
Confusion Matrix:
 [[28  1]
 [ 0 28]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98        29
           1       0.97      1.00      0.98        28

    accuracy                           0.98        57
   macro avg       0.98      0.98      0.98        57
weighted avg       0.98      0.98      0.98        57

