In [1]:
import os
import random
import numpy as np
import pandas as pd

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [3]:
train = pd.read_csv('../data/Crimes/train.csv')
test = pd.read_csv('../data/Crimes/test.csv')
submission = pd.read_csv('../data/Crimes/sample_submission.csv', index_col=0)

In [4]:
train

Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.000000,0.0,0.00,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.000000,0.0,0.00,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.000000,0.0,0.00,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도,1
3,TRAIN_00003,5,월요일,6,2337,53.0,1.921615,11.375000,0.0,0.00,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지,1
4,TRAIN_00004,9,일요일,11,1439,41.0,1.789721,0.000000,0.0,0.00,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84401,TRAIN_84401,4,일요일,7,336,11.0,3.808190,99.111111,0.0,0.00,165.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,1
84402,TRAIN_84402,8,목요일,12,2149,38.0,1.458490,0.000000,0.0,0.00,200.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,차도,0
84403,TRAIN_84403,7,일요일,6,29,46.0,2.944913,105.888889,0.0,0.00,315.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
84404,TRAIN_84404,1,화요일,11,536,25.0,0.493679,2.285714,8.6,10.75,330.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,주거지,1


In [5]:
qual_col = ['요일', '범죄발생지']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[i] = le.transform(test[i])
print('Done.')

Done.


In [6]:
train_y = train['TARGET']
train_x = train.drop(columns=['ID', 'TARGET'])
test_x = test.drop(columns=['ID'])

In [7]:
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y,
                                                  test_size=0.2,
                                                  random_state=42)

In [8]:
# Cross-validation with StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [9]:
FOLDS=5
folds=StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed_everything(42))
pred, scores = np.zeros(len(test_x)), []

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.4, 0.6, 1.0]
}

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train_x, train_y)):
    print(f'===================================={n_fold+1}============================================')
    
    train_X, val_X = train_x.iloc[train_idx],train_x.iloc[val_idx]
    train_Y, val_Y = train_y.iloc[train_idx],train_y.iloc[val_idx]
    
    # Classifier
    model = AdaBoostClassifier(random_state=42)
    
    grid = GridSearchCV(model,
                    param_grid,
                    cv=folds,
                    scoring='accuracy',
                    n_jobs=-1,
                    verbose=1)
    
    grid.fit(train_X, train_Y)
    
    best_model = grid.best_estimator_
    
    pred += best_model.predict(test_x) / FOLDS
    
    print(f'================================================================================\n\n')

Fitting 5 folds for each of 9 candidates, totalling 45 fits


Fitting 5 folds for each of 9 candidates, totalling 45 fits


Fitting 5 folds for each of 9 candidates, totalling 45 fits


Fitting 5 folds for each of 9 candidates, totalling 45 fits


Fitting 5 folds for each of 9 candidates, totalling 45 fits




In [10]:
pred = best_model.predict(test_x)

In [11]:
submission['TARGET'] = pred

In [12]:
submission.to_csv('submission01.csv')