# Dacon progect: Prediction Model of Crime Prevention

## Setting

### install

In [11]:
# pip install pandas
# pip install seaborn
# pip install scikit-learn
# pip install optuna
#!pip install lightgbm

### import


In [12]:
import pandas as pd
import seaborn as sns
import numpy as np
import random
import os

import optuna
from optuna import Trial
from optuna.samplers import TPESampler

from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

import warnings
warnings.filterwarnings('ignore')

### seed 고정


In [13]:
#seed 고정
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(1002) # Seed 고정

In [14]:
#한글 세팅
fe = fm.FontEntry(fname = 'NanumBarunGothic.ttf', name = 'NanumBarunGothic')

fm.fontManager.ttflist.insert(0, fe)
plt.rc('font', family='NanumBarunGothic')

### data load

In [15]:
# test data
DATA = pd.read_csv('train.csv')

# test data load
TEST = pd.read_csv('test.csv')

# label data load : 범죄발생구역 중에서 비슷한 범죄항목 비율이 있는 지역끼리 대구분을 해 보았음
#LABEL = pd.read_csv('label.csv')

# day data load : 요일을 월 ~ 일 까지 0~6으로 구분함
DAY = pd.read_csv('day.csv')

#train.head(3)
#label.head(5)

## data EDA

### 범죄발생지역 대구분 - 데이터 머지

In [16]:
DATA = pd.merge(left=DATA, right=DAY, left_on='요일', right_on='요일')
TEST = pd.merge(left=TEST, right=DAY, left_on='요일', right_on='요일')

# 중복 컬럼 데이터 드롭
DATA.drop(columns=['요일', 'ID'], inplace=True)
TEST.drop(columns=['요일', 'ID'], inplace=True)

In [17]:
'''
여러개를 기준으로 머지 시킬때는 리스트 형태를 취한다.
마소 엑세스의 테이블끼리 조인해서 쿼리문을 만들때, 화살표와 같은 것!

 data = pd.merge(left=original, right=label, 
                left_on=['data', 'cadavercode'],
                right_on=['date', 'cadavercode']
                )
'''

"\n여러개를 기준으로 머지 시킬때는 리스트 형태를 취한다.\n마소 엑세스의 테이블끼리 조인해서 쿼리문을 만들때, 화살표와 같은 것!\n\n data = pd.merge(left=original, right=label, \n                left_on=['data', 'cadavercode'],\n                right_on=['date', 'cadavercode']\n                )\n"

In [18]:
DATA.head(5)

Unnamed: 0,월,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET,day
0,9,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2,1
1,11,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0,1
2,5,8,447,13.0,2.037493,4.125,0.0,0.0,165.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주거지,0,1
3,6,12,1438,41.0,1.291232,2.428571,0.0,0.0,105.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0,1
4,2,6,1027,41.0,3.015956,0.0,0.0,336.4,335.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주거지,0,1


In [19]:
TEST.head(5)

Unnamed: 0,월,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,day
0,9,5,927,28.0,1.570654,19.625,0.0,0.0,165.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,차도,4
1,5,12,1149,29.0,1.996479,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,4
2,5,9,137,9.0,4.711117,6.75,0.0,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주차장,4
3,10,2,2137,25.0,3.218441,82.0,0.0,0.0,250.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,주거지,4
4,5,1,640,19.0,0.477128,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주거지,4


### 범주형 데이터 라벨링

In [20]:
'''cat_feature = ['범죄발생지']

for i in cat_feature:
        
    le = LabelEncoder()

    # 범죄발생지를 LabelEncoder 의 fit_transform 에 넣어준다
    le.fit_transform(DATA[i].unique())

    new_cat = i+"2"
    # 적용!
    DATA[new_cat] = le.transform(DATA[i])
    DATA.drop(columns=i, inplace=True)


for i in cat_feature:
        
    le = LabelEncoder()

    # 범죄발생지를 LabelEncoder 의 fit_transform 에 넣어준다
    le.fit_transform(TEST[i].unique())

    new_cat = i+"2"
    # 적용!
    TEST[new_cat] = le.transform(TEST[i])
    TEST.drop(columns=i, inplace=True)
    '''

'cat_feature = [\'범죄발생지\']\n\nfor i in cat_feature:\n        \n    le = LabelEncoder()\n\n    # 범죄발생지를 LabelEncoder 의 fit_transform 에 넣어준다\n    le.fit_transform(DATA[i].unique())\n\n    new_cat = i+"2"\n    # 적용!\n    DATA[new_cat] = le.transform(DATA[i])\n    DATA.drop(columns=i, inplace=True)\n\n\nfor i in cat_feature:\n        \n    le = LabelEncoder()\n\n    # 범죄발생지를 LabelEncoder 의 fit_transform 에 넣어준다\n    le.fit_transform(TEST[i].unique())\n\n    new_cat = i+"2"\n    # 적용!\n    TEST[new_cat] = le.transform(TEST[i])\n    TEST.drop(columns=i, inplace=True)\n    '

In [21]:
DATA.loc[DATA['범죄발생지'] == '공원', '범죄발생지'] = 0
DATA.loc[DATA['범죄발생지'] == '백화점', '범죄발생지'] = 1
DATA.loc[DATA['범죄발생지'] == '병원', '범죄발생지'] = 2
DATA.loc[DATA['범죄발생지'] == '식당', '범죄발생지'] = 3
DATA.loc[DATA['범죄발생지'] == '약국', '범죄발생지'] = 4
DATA.loc[DATA['범죄발생지'] == '은행', '범죄발생지'] = 5
DATA.loc[DATA['범죄발생지'] == '인도', '범죄발생지'] = 6
DATA.loc[DATA['범죄발생지'] == '주거지', '범죄발생지'] = 7
DATA.loc[DATA['범죄발생지'] == '주유소', '범죄발생지'] = 8
DATA.loc[DATA['범죄발생지'] == '주차장', '범죄발생지'] = 9
DATA.loc[DATA['범죄발생지'] == '차도', '범죄발생지'] = 10
DATA.loc[DATA['범죄발생지'] == '편의점', '범죄발생지'] = 11
DATA.loc[DATA['범죄발생지'] == '학교', '범죄발생지'] = 12
DATA.loc[DATA['범죄발생지'] == '호텔/모텔', '범죄발생지'] = 13

In [22]:
DATA['범죄발생지'] = DATA['범죄발생지'].astype(int)
DATA.dtypes

월            int64
시간           int64
소관경찰서        int64
소관지역       float64
사건발생거리     float64
강수량(mm)    float64
강설량(mm)    float64
적설량(cm)    float64
풍향         float64
안개         float64
짙은안개       float64
번개         float64
진눈깨비       float64
서리         float64
연기/연무      float64
눈날림        float64
범죄발생지        int32
TARGET       int64
day          int64
dtype: object

In [23]:
TEST.loc[TEST['범죄발생지'] == '공원', '범죄발생지'] = 0
TEST.loc[TEST['범죄발생지'] == '백화점', '범죄발생지'] = 1
TEST.loc[TEST['범죄발생지'] == '병원', '범죄발생지'] = 2
TEST.loc[TEST['범죄발생지'] == '식당', '범죄발생지'] = 3
TEST.loc[TEST['범죄발생지'] == '약국', '범죄발생지'] = 4
TEST.loc[TEST['범죄발생지'] == '은행', '범죄발생지'] = 5
TEST.loc[TEST['범죄발생지'] == '인도', '범죄발생지'] = 6
TEST.loc[TEST['범죄발생지'] == '주거지', '범죄발생지'] = 7
TEST.loc[TEST['범죄발생지'] == '주유소', '범죄발생지'] = 8
TEST.loc[TEST['범죄발생지'] == '주차장', '범죄발생지'] = 9
TEST.loc[TEST['범죄발생지'] == '차도', '범죄발생지'] = 10
TEST.loc[TEST['범죄발생지'] == '편의점', '범죄발생지'] = 11
TEST.loc[TEST['범죄발생지'] == '학교', '범죄발생지'] = 12
TEST.loc[TEST['범죄발생지'] == '호텔/모텔', '범죄발생지'] = 13

In [24]:
TEST['범죄발생지'] = TEST['범죄발생지'].astype(int)
TEST.dtypes

월            int64
시간           int64
소관경찰서        int64
소관지역       float64
사건발생거리     float64
강수량(mm)    float64
강설량(mm)    float64
적설량(cm)    float64
풍향         float64
안개         float64
짙은안개       float64
번개         float64
진눈깨비       float64
서리         float64
연기/연무      float64
눈날림        float64
범죄발생지        int32
day          int64
dtype: object

In [25]:
DATA.head(5)

Unnamed: 0,월,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET,day
0,9,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,2,1
1,11,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,0,1
2,5,8,447,13.0,2.037493,4.125,0.0,0.0,165.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0,1
3,6,12,1438,41.0,1.291232,2.428571,0.0,0.0,105.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,0,1
4,2,6,1027,41.0,3.015956,0.0,0.0,336.4,335.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0,1


In [26]:
TEST.head(5)

Unnamed: 0,월,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,day
0,9,5,927,28.0,1.570654,19.625,0.0,0.0,165.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,10,4
1,5,12,1149,29.0,1.996479,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4
2,5,9,137,9.0,4.711117,6.75,0.0,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,4
3,10,2,2137,25.0,3.218441,82.0,0.0,0.0,250.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,7,4
4,5,1,640,19.0,0.477128,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4


In [27]:
# f_data.info()
# test.info()

In [28]:
# 기술통계량을 확인하기위해 describe() 함수를 사용합니다.  
DATA.describe()

Unnamed: 0,월,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET,day
count,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0
mean,6.430195,6.769507,1060.027581,26.881726,1.912424,24.608776,2.284407,23.430503,186.926107,0.385423,0.017842,0.144042,0.02033,0.01026,0.210755,0.008921,7.944862,0.835355,3.0917
std,3.108302,3.56639,698.380485,13.870968,0.958556,62.711211,15.852881,85.199896,98.299485,0.486698,0.132379,0.351134,0.141128,0.100771,0.407847,0.09403,2.261395,0.819762,2.016831
min,1.0,1.0,26.0,5.0,0.012269,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,4.0,526.0,13.0,1.209985,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,1.0
50%,7.0,7.0,937.0,27.0,1.822279,0.625,0.0,0.0,205.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,3.0
75%,9.0,10.0,1638.0,38.0,2.476528,18.571429,0.0,0.0,260.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,2.0,5.0
max,12.0,12.0,2450.0,54.0,4.998936,614.875,295.0,649.8,360.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,13.0,2.0,6.0


In [29]:
# 범죄발생지의 등장 빈도를 확인하는 코드입니다.  
DATA['범죄발생지'].value_counts()

범죄발생지
7     36077
10    25879
6      6437
11     4835
9      3262
3      1806
1      1493
8      1324
0       736
12      728
4       653
13      591
2       453
5       132
Name: count, dtype: int64

### visualization

In [None]:
'''# 종속변수 분포 확인
sns.displot(DATA['TARGET'])'''

In [None]:
'''# 전체 데이터 분포 확인
except_target = DATA.drop('TARGET', axis = 1)
except_target.hist(figsize = (12,12))
plt.show()'''

In [None]:
'''# 시간대별 발생 범죄 빈도 확인
sns.countplot(x = '시간', hue = 'TARGET', data = DATA)
plt.show()'''

In [None]:
'''# 이상치 확인
fig, axes = plt.subplots(2,2, figsize = (10,10))

sns.boxplot(y = DATA['사건발생거리'], ax = axes[0][0])
sns.boxplot(y = DATA['강수량(mm)'], ax = axes[0][1])

sns.boxplot(y = DATA['강설량(mm)'], ax = axes[1][0])
sns.boxplot(y = DATA['적설량(cm)'], ax = axes[1][1])

plt.show()'''

In [None]:
'''# 상관관계 확인
plt.figure(figsize = (15,15), dpi = 100)
sns.heatmap(DATA.corr(), annot = True, cmap = 'YlGn')
plt.show()'''

## Model: Decision Tree Regression

### 독립변수(x_train), 종속변수(y_train) 분리

In [31]:
# 독립변수(x_train), 종속변수(y_train) 분리
X = DATA.drop('TARGET', axis=1)
y = DATA['TARGET']

x_test = TEST

### 모델 정의

In [32]:
# data  split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [33]:
def objective(trial: Trial) -> float:
    params_lgb = {
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "objective": "multiclass",
        "metric": "multi_logloss",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
        

    model = LGBMClassifier(**params_lgb)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5,
        verbose=False,
    )

    lgb_pred = model.predict_proba(X_valid)
    log_score = log_loss(y_valid, lgb_pred)
    
    return log_score

In [34]:
sampler = TPESampler(seed=1002)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective, n_trials=100)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2023-05-25 23:28:07,425][0m A new study created in memory with name: lgbm_parameter_opt[0m
[32m[I 2023-05-25 23:28:12,512][0m Trial 0 finished with value: 0.9549676404477587 and parameters: {'reg_alpha': 3.8783587769694796e-06, 'reg_lambda': 0.041504061839746025, 'max_depth': 9, 'num_leaves': 155, 'colsample_bytree': 0.5098408281623711, 'subsample': 0.5925388993394455, 'subsample_freq': 2, 'min_child_samples': 68, 'max_bin': 442}. Best is trial 0 with value: 0.9549676404477587.[0m
[32m[I 2023-05-25 23:28:14,698][0m Trial 1 finished with value: 0.9576862366627388 and parameters: {'reg_alpha': 4.152877358161054e-06, 'reg_lambda': 0.03153463283767513, 'max_depth': 13, 'num_leaves': 142, 'colsample_bytree': 0.8154847914247503, 'subsample': 0.33168215119071137, 'subsample_freq': 4, 'min_child_samples': 9, 'max_bin': 245}. Best is trial 0 with value: 0.9549676404477587.[0m
[32m[I 2023-05-25 23:28:17,062][0m Trial 2 finished with value: 0.9571719572964926 and parameters: {'r

Best Score: 0.9518044744612969
Best trial: {'reg_alpha': 1.039893019135935e-06, 'reg_lambda': 0.02873485626642655, 'max_depth': 11, 'num_leaves': 203, 'colsample_bytree': 0.545202864792721, 'subsample': 0.916925145286892, 'subsample_freq': 7, 'min_child_samples': 19, 'max_bin': 492}


In [35]:
PARAMS = study.best_trial.params
model = LGBMClassifier(**PARAMS)

In [36]:
# 모델 학습
model.fit(X, y)

### 예측

In [37]:
# 예측
y_pred = model.predict(x_test)


In [38]:
y_pred

array([2, 2, 1, ..., 0, 0, 0], dtype=int64)

In [39]:
# 예측
y_pred = model.predict(x_test)

## 파일 저장

In [40]:
# 파일 저장
submit = pd.read_csv('sample_submission.csv')

# 예측한 값을 TARGET 컬럼에 할당
submit['TARGET'] = y_pred
submit.head(15)

Unnamed: 0,ID,TARGET
0,TEST_00000,2
1,TEST_00001,2
2,TEST_00002,1
3,TEST_00003,0
4,TEST_00004,0
5,TEST_00005,2
6,TEST_00006,1
7,TEST_00007,1
8,TEST_00008,0
9,TEST_00009,0


In [41]:
submit.to_csv('submit7_lgb.csv', index = False)