### Settings

In [0]:
# data preprocessing
import os
import pandas as pd
import numpy as np
import random
import warnings
warnings.filterwarnings(action='ignore')
import pickle
from datetime import datetime, timedelta
# tokenizer
import re
from konlpy.tag import Mecab
# model setting
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
# models
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

### load data

In [0]:
os.chdir('/content/gdrive/My Drive/Colab Notebooks/programmers/')
train = pd.read_csv('data/hashcode_classification2020_train.csv')
test = pd.read_csv('data/hashcode_classification2020_test.csv')
submission=pd.read_csv('data/hashcode_classification2020_sample.csv')
print(train.shape, test.shape, submission.shape)

### text preprocessing

In [0]:
def text_process(data):
    X_data = []
    tokenizer = Mecab()
    for i in range(len(data)):
        X = []
        try:
            txt = data['title'][i]+' '+data['content'][i]
        except TypeError: 
            txt = data['title'][i]
        txt = re.sub('[^가-힣a-zA-Z0-9]',' ',txt)
        txt = re.sub('  +',' ',txt).split()
        for x in txt:
            isHangul = bool(re.search('[가-힣]',x))
            isEng = bool(re.match('[A-Za-z]',x))
            if isHangul:
                xx = re.split('([가-힣])+',x) if isEng else tokenizer.nouns(x)
                X.extend([x for x in xx if x != '' and len(x)>1])
            else:
                X.append(x)
        X_data.append(' '.join(X))
    return pd.Series(X_data)

In [0]:
X_train = text_process(train)
X_test = text_process(test)

In [0]:
def bigram(data):
    bigram_list = []
    for text in data:
        token = text.split()
        bigram = [token[i]+'.'+token[i+1] for i in range(len(token)-1)]  # bi-gram
        bigram_list.append(' '.join(bigram))
    return bigram_list

In [0]:
X_train2 = bigram(X_train)
X_test2 = bigram(X_test)

In [0]:
y_train = train['label']

In [0]:
v = CountVectorizer()
v.fit(X_train)
vec_train_count = v.transform(X_train).toarray()
vec_test_count = v.transform(X_test).toarray()
vec_train_count.shape

(2592, 19851)

In [0]:
v2 = TfidfVectorizer()
v2.fit(X_train)
vec_train_TFIDF = v2.transform(X_train).toarray()
vec_test_TFIDF = v2.transform(X_test).toarray()
vec_train_TFIDF.shape

(2592, 19851)

### modeling

In [0]:
RANDOM_SEED = 2486

In [0]:
model1 = Pipeline([
    ('model', MultinomialNB()),
])
model2 = Pipeline([
    ('model', RandomForestClassifier(random_state=RANDOM_SEED)),
])
model3 = Pipeline([
    ('model', SGDClassifier(random_state=RANDOM_SEED, loss='hinge')),
])
model4 = Pipeline([
    ('model', LogisticRegression(random_state=RANDOM_SEED)),
])
model5 = Pipeline([
    ('model', AdaBoostClassifier(random_state=RANDOM_SEED)),
])
model6 = Pipeline([
    ('model', LGBMClassifier(n_estimators=400, random_state=RANDOM_SEED))
])
'''model7 = Pipeline([
    ('model', SVC(random_state=RANDOM_SEED)), # probability=True
])'''

models = [model1, model2, model3, model4, model5, model6]

In [0]:
for i, model in enumerate(models):
    scores = cross_val_score(model, vec_train_count, y_train, cv=5, scoring = 'accuracy')
    print(("Model{0:d}: Mean score: {1:.4f}").format(i + 1, np.mean(scores))) # AUC 평균

Model1: Mean score: 0.7751
Model2: Mean score: 0.7824
Model3: Mean score: 0.7685
Model4: Mean score: 0.7727
Model5: Mean score: 0.7284
Model6: Mean score: 0.6308


### find the best parameter

In [0]:
def RandomSearch_lgbm(X, y):
    clf = Pipeline([
            ("lgbm", LGBMClassifier(random_state=RANDOM_SEED)),
        ])

    param_grid = [{
        'lgbm__n_estimators': [x for x in range(100,1000,100)],
        'lgbm__num_leaves': [x for x in range(10,100,10)],
        'lgbm__max_depth': [x for x in range(2,10)],
        }]

    random_search = RandomizedSearchCV(clf, param_grid, cv=5, scoring='accuracy')

    random_search.fit(X, y)

    pm = random_search.best_params_
    print(pm)

    clf = LGBMClassifier(n_estimators=pm['lgbm__n_estimators'], num_leaves=pm['lgbm__num_leaves'], max_depth=pm['lgbm__max_depth'], random_state=RANDOM_SEED)
    print('Mean acc: ', np.mean(cross_val_score(clf, X, y, cv=5, scoring = 'accuracy')))

In [0]:
RandomSearch_lgbm(vec_train_count, y_train)

{'lgbm__num_leaves': 80, 'lgbm__n_estimators': 300, 'lgbm__max_depth': 4}
Mean acc:  [0.81117534 0.82080925 0.80694981 0.8011583  0.77799228]


### prediction

In [0]:
clf = LGBMClassifier(n_estimators=50, num_leaves=500, max_depth=4, random_state=RANDOM_SEED)
clf.fit(vec_train_count2, y_train)
y_pred = clf.predict(vec_test_count2)
y_pred[:5]

array([5, 4, 1, 3, 5])

In [0]:
y_pred = ['label'] + y_pred.tolist()
pd.Series(y_pred).to_csv('submission/LGBM_c2_'+(datetime.now()+timedelta(hours=9)).isoformat()[5:-10]+'.csv',index=False)