In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [31]:
from colorama import Style, Fore
blk = Style.BRIGHT + Fore.BLACK
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
res = Style.RESET_ALL

In [126]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Data Preprocessing

In [127]:
vectorizer = TfidfVectorizer()
def get_vector(vectorizer, df, train_mode):
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])
    
    X = np.concatenate([X_party1.todense(), X_party2.todense(), X_facts.todense()], axis=1)
    return X

In [128]:
X = train[['first_party', 'second_party', 'facts']]
y = train['first_party_winner']

In [129]:
X = get_vector(vectorizer, X, True)

In [130]:
X.shape

(2478, 52377)

In [131]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1982, 52377) (1982,) (496, 52377) (496,)


In [132]:
test = test[['first_party', 'second_party', 'facts']]
test = get_vector(vectorizer, test, False)
test

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [133]:
test.shape

(1240, 52377)

In [134]:
model = RandomForestClassifier()
model.fit(X,y)

RandomForestClassifier()

In [135]:
model.predict(test)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

## Define Model & Train

## Voting 분류 모델

In [16]:
lr_model = LogisticRegression()
knn_model = KNeighborsClassifier(n_neighbors=8)

vo_clf = VotingClassifier(estimators=[('LR', lr_model), ('KNN', knn_model)], voting='soft')

vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
print(f'보팅 분류기 정확도: {np.round(accuracy_score(y_test, pred), 4)}')

classifiers = [lr_model, knn_model]
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    class_name = classifier.__class__.__name__
    print('{0} 정확도: {1:.4f}'.format(class_name, accuracy_score(y_test, pred)))

보팅 분류기 정확도: 0.6573
LogisticRegression 정확도: 0.6532
KNeighborsClassifier 정확도: 0.6230


## 그 외 다른 모델 사용

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도: {np.round(accuracy, 4)}')

랜덤 포레스트 정확도: 0.6492


In [18]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# 모델 구축
# BaggingClassifier에서 사용한 분류기가 클래스 확률추정(predict_proba)이 가능하면 자동으로 간접 투표 사용 
bag_model = BaggingClassifier(
    DecisionTreeClassifier(), # 약한 학습기(결정 트리)
    n_estimators=500, # 약한 학습기(결정 트리) 500개 생성
    max_samples=0.05, # 0.0~1.0 사이 실수 선택(실수 x 샘플 수) 혹은 샘플수 지정
    bootstrap=True, # True : 배깅, False : 페이스팅
    n_jobs=-1 # 훈련과 예측에 사용할 CPU 코어 수 (-1 : 가용한 모든 코어 사용)
)

# 모델 학습
bag_model.fit(X_train,y_train)

# 모델 예측
y_pred = bag_model.predict(X_test)

# 모델 평가
print(bag_model.__class__.__name__," : ",accuracy_score(y_test,y_pred))

BaggingClassifier  :  0.6612903225806451


## 앙상블 기법 사용해보기

In [19]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,train_test_split,KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from mlxtend.classifier import StackingClassifier

In [20]:
cv = KFold(n_splits = 5 ,shuffle = True, random_state = 42)

# Logistic Regression
pipe_logist = Pipeline([
    ('MinMaxScale', MinMaxScaler()),
    ('Logistic', LogisticRegression())
])

param_grid_logist = {
    'Logistic__C': [0.1, 1, 10],
    'Logistic__penalty': ['l1', 'l2']
}

grid_logist = GridSearchCV(pipe_logist, param_grid=param_grid_logist, cv = cv)

# SVC
# pipe_svc = Pipeline([
#     ('MinMaxScale', MinMaxScaler()),
#     ('SVC', SVC(probability = True))
# ])

# param_grid_svc = {
#     'SVC__C': [0.1, 1, 10],
#     'SVC__kernel': ['linear', 'rbf']
# }

# grid_svc = GridSearchCV(pipe_svc, param_grid=param_grid_svc, cv = cv)

# KNN
pipe_knn = Pipeline([
    ('MinMaxScale', MinMaxScaler()),
    ('KNN', KNeighborsClassifier())
])

param_grid_knn = {
    'KNN__n_neighbors': [3, 5, 7],
    'KNN__weights': ['uniform', 'distance']
}

grid_knn = GridSearchCV(pipe_knn, param_grid=param_grid_knn, cv = cv)

# MLP
# pipe_mlp = Pipeline([
#     ('MinMaxScale', MinMaxScaler()),
#     ('MLP', MLPClassifier())
# ])

# param_grid_mlp = {
#     'MLP__hidden_layer_sizes': [(50,), (100,), (50, 50)],
#     'MLP__activation': ['relu', 'tanh'],
#     'MLP__alpha': [0.001, 0.01, 0.1]
# }

# grid_mlp = GridSearchCV(pipe_mlp, param_grid=param_grid_mlp, cv = cv)

# Random Forest
param_grid_rf = {
    'RandomForest__n_estimators': [100, 200, 300],
    'RandomForest__max_depth': [None, 5, 10]
}

pipe_rf = Pipeline([
    ('RandomForest', RandomForestClassifier())
])

grid_rf = GridSearchCV(pipe_rf, param_grid_rf, cv = cv)

# Extra Tree
param_grid_et = {
    'ExtraTrees__n_estimators': [100, 200, 300],
    'ExtraTrees__max_depth': [None, 5, 10]
}


pipe_et = Pipeline([
    ('ExtraTrees', ExtraTreesClassifier())
])

grid_et = GridSearchCV(pipe_et, param_grid_et, cv = cv)

# Catboost
# param_grid_cb = {
#     'CatBoost__iterations': [100, 200, 300],
#     'CatBoost__depth': [6, 8, 10]
# }

# pipe_cb = Pipeline([
#     ('CatBoost', CatBoostClassifier(verbose = 0))
# ])

# grid_cb = GridSearchCV(pipe_cb, param_grid_cb, cv = cv)


# LGBM
param_grid_lgbm = {
    'LGBM__n_estimators': [100, 200, 300],
    'LGBM__max_depth': [None, 5, 10]
}

pipe_lgbm = Pipeline([
    ('LGBM', LGBMClassifier(verbosity = -1))
])

grid_lgbm = GridSearchCV(pipe_lgbm, param_grid_lgbm, cv = cv)

# XGBoost
# param_grid_xgb = {
#     'XGB__n_estimators': [100, 200, 300],
#     'XGB__max_depth': [None, 5, 10]
# }

# pipe_xgb = Pipeline([
#     ('XGB', XGBClassifier())
# ])

# grid_xgb = GridSearchCV(pipe_xgb, param_grid_xgb, cv = cv)

In [21]:
grid_logist.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('MinMaxScale', MinMaxScaler()),
                                       ('Logistic', LogisticRegression())]),
             param_grid={'Logistic__C': [0.1, 1, 10],
                         'Logistic__penalty': ['l1', 'l2']})

In [22]:
# grid_svc.fit(X_train, y_train)

In [23]:
grid_knn.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('MinMaxScale', MinMaxScaler()),
                                       ('KNN', KNeighborsClassifier())]),
             param_grid={'KNN__n_neighbors': [3, 5, 7],
                         'KNN__weights': ['uniform', 'distance']})

In [24]:
# grid_mlp.fit(X_train, y_train)

In [25]:
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('RandomForest',
                                        RandomForestClassifier())]),
             param_grid={'RandomForest__max_depth': [None, 5, 10],
                         'RandomForest__n_estimators': [100, 200, 300]})

In [26]:
grid_et.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('ExtraTrees', ExtraTreesClassifier())]),
             param_grid={'ExtraTrees__max_depth': [None, 5, 10],
                         'ExtraTrees__n_estimators': [100, 200, 300]})

In [27]:
# grid_cb.fit(X_train, y_train)


In [28]:
grid_lgbm.fit(X_train, y_train)


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('LGBM', LGBMClassifier(verbosity=-1))]),
             param_grid={'LGBM__max_depth': [None, 5, 10],
                         'LGBM__n_estimators': [100, 200, 300]})

In [29]:
# grid_xgb.fit(X_train, y_train)

In [32]:
grid_models = [grid_logist,grid_knn,
              grid_rf,grid_et,grid_lgbm]

model_names = ['grid_logist','grid_knn',
              'grid_rf','grid_et','grid_lgbm']

print('5 FOLD BEST SCORE')
print(f'{blk}--'* 15)

best_models = []

for i,model in enumerate(grid_models):
    
    score = round(model.best_score_,3)    
    print(f'{blu}{model_names[i]} best score{res} : {red}{score}{res}')
    best_models.append(model.best_estimator_)
    
print(f'{blk}--' * 15)

5 FOLD BEST SCORE
[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--
[1m[34mgrid_logist best score[0m : [1m[31m0.667[0m
[1m[34mgrid_knn best score[0m : [1m[31m0.667[0m
[1m[34mgrid_rf best score[0m : [1m[31m0.668[0m
[1m[34mgrid_et best score[0m : [1m[31m0.667[0m
[1m[34mgrid_lgbm best score[0m : [1m[31m0.638[0m
[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--[1m[30m--


In [107]:
models = [LogisticRegression(), KNeighborsClassifier(), RandomForestClassifier(), ExtraTreesClassifier(), LGBMClassifier()]
new_best_model = []

for i in range(len(best_models)):
    if i < 2:
        grid_paramas = best_models[i][1].get_params()
    else:
        grid_paramas = best_models[i][0].get_params()
        
    model = models[i]
    model.set_params(**grid_paramas)
    new_best_model.append(model)
    print(grid_paramas)

{'C': 0.1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 300, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
{'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_

In [106]:
new_best_model

[LogisticRegression(C=0.1),
 KNeighborsClassifier(n_neighbors=7),
 RandomForestClassifier(n_estimators=300),
 ExtraTreesClassifier(max_depth=5),
 LGBMClassifier(max_depth=5, verbosity=-1)]

In [33]:
ensemble_soft = VotingClassifier(
    estimators = [
        ('Logist',best_models[0]),
        ('KNN',best_models[1]),
        ('RF', best_models[2]),
        ('ET', best_models[3]),
        ('LGBM', best_models[4])
    ],
    voting = 'soft'
)

ensemble_hard = VotingClassifier(
    estimators = [
        ('Logist',best_models[0]),
        ('KNN',best_models[1]),
        ('RF', best_models[2]),
        ('ET', best_models[3]),
        ('LGBM', best_models[4])
    ],
    voting = 'hard'
)

In [40]:
ensembles = [ensemble_soft, ensemble_hard]
names = ['ENSEMBLE_SOFT', 'ENSEMBLE_HARD']

for i,model in enumerate(ensembles):
    
    model.fit(X_train,y_train)
    
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    train_score = round(accuracy_score(y_train,train_pred),3)
    test_score = round(accuracy_score(y_test,test_pred),3)    
    
    print('--'*20)
    print(f'{blk}{names[i]} SCORE{res}')
    print(f'{blu}TRAIN : {train_score} {res}, {red}TEST : {test_score}{res}',sep = '|')

----------------------------------------
[1m[30mENSEMBLE_SOFT SCORE[0m
[1m[34mTRAIN : 0.834 [0m, [1m[31mTEST : 0.661[0m
----------------------------------------
[1m[30mENSEMBLE_HARD SCORE[0m
[1m[34mTRAIN : 0.848 [0m, [1m[31mTEST : 0.661[0m


## Stacking

In [47]:
stacking_logist = StackingClassifier(classifiers = ensemble_hard.estimators_,
                                     meta_classifier=best_models[0])

stacking_rf = StackingClassifier(classifiers = ensemble_hard.estimators_,
                                   meta_classifier = best_models[2])

stacking_lgbm = StackingClassifier(classifiers = ensemble_hard.estimators_,
                                   meta_classifier = best_models[4])

display(stacking_logist)
display(stacking_rf)
display(stacking_lgbm)

StackingClassifier(classifiers=[Pipeline(steps=[('MinMaxScale', MinMaxScaler()),
                                                ('Logistic',
                                                 LogisticRegression(C=0.1))]),
                                Pipeline(steps=[('MinMaxScale', MinMaxScaler()),
                                                ('KNN',
                                                 KNeighborsClassifier(n_neighbors=7))]),
                                Pipeline(steps=[('RandomForest',
                                                 RandomForestClassifier(n_estimators=300))]),
                                Pipeline(steps=[('ExtraTrees',
                                                 ExtraTreesClassifier(max_depth=5))]),
                                Pipeline(steps=[('LGBM',
                                                 LGBMClassifier(max_depth=5,
                                                                verbosity=-1))])],
                   meta_cla

StackingClassifier(classifiers=[Pipeline(steps=[('MinMaxScale', MinMaxScaler()),
                                                ('Logistic',
                                                 LogisticRegression(C=0.1))]),
                                Pipeline(steps=[('MinMaxScale', MinMaxScaler()),
                                                ('KNN',
                                                 KNeighborsClassifier(n_neighbors=7))]),
                                Pipeline(steps=[('RandomForest',
                                                 RandomForestClassifier(n_estimators=300))]),
                                Pipeline(steps=[('ExtraTrees',
                                                 ExtraTreesClassifier(max_depth=5))]),
                                Pipeline(steps=[('LGBM',
                                                 LGBMClassifier(max_depth=5,
                                                                verbosity=-1))])],
                   meta_cla

StackingClassifier(classifiers=[Pipeline(steps=[('MinMaxScale', MinMaxScaler()),
                                                ('Logistic',
                                                 LogisticRegression(C=0.1))]),
                                Pipeline(steps=[('MinMaxScale', MinMaxScaler()),
                                                ('KNN',
                                                 KNeighborsClassifier(n_neighbors=7))]),
                                Pipeline(steps=[('RandomForest',
                                                 RandomForestClassifier(n_estimators=300))]),
                                Pipeline(steps=[('ExtraTrees',
                                                 ExtraTreesClassifier(max_depth=5))]),
                                Pipeline(steps=[('LGBM',
                                                 LGBMClassifier(max_depth=5,
                                                                verbosity=-1))])],
                   meta_cla

In [48]:
stacking_models = [stacking_logist, stacking_rf, stacking_lgbm]
names = ['Stacking Logist', 'Stacking_RandomForest','Stacking_LGBM']

for i,model in enumerate(stacking_models):
    
    model.fit(X_train,y_train)
    
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    train_score = round(accuracy_score(y_train,train_pred),3)
    test_score = round(accuracy_score(y_test,test_pred),3)    
    
    print('--'*20)
    print(f'{blk}{names[i]} SCORE')
    print(f'{blu}TRAIN : {train_score}, {red}TEST : {test_score}{res}',sep = '|')

----------------------------------------
[1m[30mStacking Logist SCORE
[1m[34mTRAIN : 1.0, [1m[31mTEST : 0.657[0m
----------------------------------------
[1m[30mStacking_RandomForest SCORE
[1m[34mTRAIN : 1.0, [1m[31mTEST : 0.657[0m
----------------------------------------
[1m[30mStacking_LGBM SCORE
[1m[34mTRAIN : 1.0, [1m[31mTEST : 0.655[0m


## 모델 다시 만들기

In [None]:
models = [LogisticRegression(), KNeighborsClassifier(), RandomForestClassifier(), ExtraTreesClassifier(), LGBMClassifier()]
new_best_model = []

for i in range(len(best_models)):
    if i < 2:
        grid_paramas = best_models[i][1].get_params()
    else:
        grid_paramas = best_models[i][0].get_params()
        
    model = models[i]
    model.set_params(**grid_paramas)
    new_best_model.append(model)

In [None]:
log_params = {'C': 0.1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
knn_params = {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}
rf_params = {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 300, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
et_params = {'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
lgbm_parmas = {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': 5, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': 'warn', 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'verbosity': -1}

In [112]:
pd.DataFrame(X).shape

(2478, 52377)

In [113]:
pd.DataFrame(y).shape

(2478, 1)

## Inference & Submission

In [38]:
submit = pd.read_csv('sample_submission.csv')

In [138]:
pred = ensemble_hard.predict(test)

submit['first_party_winner'] = pred
submit.to_csv('ensemble_hard_submit.csv', index=False)

In [139]:
pred = stacking_rf.predict(test)

submit['first_party_winner'] = pred
submit.to_csv('stacking_rf.csv', index=False)

In [None]:
submit['first_party_winner'] = pred
submit.to_csv('./baseline_submit.csv', index=False)
print('Done')

Done


In [None]:
submit

Unnamed: 0,ID,first_party_winner
0,TEST_0000,1
1,TEST_0001,1
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,1
...,...,...
1235,TEST_1235,0
1236,TEST_1236,1
1237,TEST_1237,1
1238,TEST_1238,1
