In [37]:
import pandas as pd
import numpy as np
import re # 정규표현식

# 시스템
import os, sys
import shutil
from io import BytesIO
from pathlib import Path
import pickle
os.environ['JAVA_HOME'] = r'/Library/Java/JavaVirtualMachines/zulu-20.jdk' # 자바 환경변수


# 벡터화
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

# ML Classifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# 통계모델
from sklearn.decomposition import LatentDirichletAllocation # LDA
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

# 성능평가 지표
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, make_scorer

# 편의성
import warnings
from tqdm import tqdm
tqdm.pandas() 


# 커스텀
sys.path.append('/Users/GitHub/Projects/AI Project/낚시성 기사 탐지 모델/modules')


# 토큰 경로
tl_pickles, vl_pickles = ['/Users/GitHub/Projects/AI Project/낚시성 기사 탐지 모델/tokens/raw tokens/Training Labeled Pickles',
                          '/Users/GitHub/Projects/AI Project/낚시성 기사 탐지 모델/tokens/raw tokens/Validation Labeled Pickles']

# 피클 내 정보
# [0] Clickbait Auto | [1] NonClickbait Auto | [2] Clickbait Direct

### 1. CountVectorizer + uni-gram

In [38]:
# 각 피클 경로 파싱
tl_pickles_path = [tl_pickles + '/' + file_name for file_name in  os.listdir(tl_pickles) if file_name.endswith('.pickle')]
vl_pickles_path = [vl_pickles + '/' + file_name for file_name in  os.listdir(vl_pickles) if file_name.endswith('.pickle')]

In [39]:
def pk_df(pickle_path): # tl_pickles_path 내의 피클을 하나의 df로 반환
    f = open(pickle_path, 'rb')
    pk = pickle.load(f)

    # 피클 병합
    df = pd.concat(pk) # 3개 피클을 하나로
    
    return df

def df_Xy_split(tl_pk, vl_pk):
    # 벡터화
    ct = CountVectorizer()
    tl_vec = ct.fit_transform(tl_pk['article(str2)'])
    vl_vec = ct.transform(vl_pk['article(str2)'])
    
    # 데이터, 타겟 지정
    tl_y_data = tl_pk['useType']
    tl_X_data = tl_vec

    vl_y_data = vl_pk['useType']
    vl_X_data = vl_vec
    
    
    
    return tl_X_data, tl_y_data, vl_X_data, vl_y_data

### bernoulliNB: 카테고리별로 진행

In [40]:
warnings.filterwarnings(action='ignore')

pt_cat_list = ['_'.join(file_name.split('/')[-1].split('_')[:2]) for file_name in tl_pickles_path] # 카테고리명 지정하기 위한 리스트
concated_tl_pks = [] # 각 분야 별로 합쳐진 데이터프레임
concated_vl_pks = [] # 각 분야 별로 합쳐진 데이터프레임

bnb_result_best_params_, bnb_result_best_recall = [], [] # 결과 담기(test)
vl_accuracy, vl_recall, vl_precision, vl_f1 = [], [], [], [] # 결과담기(val)

# 피클 순회
for (tl_path, vl_path) in tqdm(zip(tl_pickles_path, vl_pickles_path), leave=True, desc='카테고리 순회 중', position=0): 

    # 각 피클 내 3개의 df를 하나로 concat
    tl_pk = pk_df(tl_path) 
    vl_pk = pk_df(vl_path)
    concated_tl_pks.append(tl_pk) # 나중에 또 합칠 것에 대비해 미리 모아두기
    concated_tl_pks.append(vl_pk) # 나중에 또 합칠 것에 대비해 미리 모아두기
    
    # X_data, y_data 지정
    tl_X_data, tl_y_data, vl_X_data, vl_y_data = df_Xy_split(tl_pk, vl_pk)
    
    # 데이터 분할(TL 데이터로만 학습))
    X_train, X_test, y_train, y_test = train_test_split(tl_X_data, tl_y_data,
                                                        test_size=0.1,
                                                        shuffle=True,
                                                        stratify=tl_y_data,
                                                        random_state=777
                                                        )

    # 모델 학습 

    # BernoulliNB 객체 생성
    bnb = BernoulliNB()
    
    # BernoulliNB 하이퍼파라미터 지정
    bnb_params = {'alpha' : np.arange(0.1, 2.5, 0.2),
                'fit_prior' : ['True', 'False']}
    
    # 성능지표 recall 커스텀
    recall = make_scorer(score_func=recall_score, greater_is_better=True, needs_proba=False, pos_label=1)
    
    # grid
    bnb_grid = GridSearchCV(estimator=bnb, param_grid=bnb_params,
                            scoring = recall,
                            n_jobs=2,
                            cv=3,
                            verbose=0)
    
    # 모델 학습
    bnb_grid.fit(X_train, y_train)
    y_pred = bnb_grid.predict(vl_X_data)
    
    # 평과 결과(Test)
    bnb_result_best_params_.append(bnb_grid.best_params_)
    bnb_result_best_recall.append(bnb_grid.best_score_)

    # 평과 결과(Validation)
    vl_accuracy.append(accuracy_score(y_pred, vl_y_data))
    vl_recall.append(recall_score(y_pred, vl_y_data, pos_label=1))
    vl_precision.append(precision_score(y_pred, vl_y_data, pos_label=1))
    vl_f1.append(f1_score(y_pred, vl_y_data, pos_label=1))
    
    
    
                           
                                                        
    # break # 1개 피클만 보기

bnb_results = pd.DataFrame({'cat':pt_cat_list,
                           'best_params_': bnb_result_best_params_,
                           'best_recall(test)': bnb_result_best_recall,
                           'accuracy(val)' : vl_accuracy,
                           'f1(val)' : vl_f1,
                           'precision(val)' : vl_precision,
                           'recall(val)' : vl_recall
})

# 결과 저장
fi =  open('results/BernoulliNB_results.pickle', 'wb')
pickle.dump(bnb_results, fi)
fi.close()

카테고리 순회 중: 14it [04:05, 17.54s/it]


## MultinomialNB: 카테고리별로 진행

In [41]:
warnings.filterwarnings(action='ignore')

mnb_result_best_params_, mnb_result_best_recall = [], [] # 결과 담기(test)
vl_accuracy, vl_recall, vl_precision, vl_f1 = [], [], [], [] # 결과담기(val)
# 피클 순회
for (tl_path, vl_path) in tqdm(zip(tl_pickles_path, vl_pickles_path), leave=True, desc='카테고리 순회 중', position=0): 

    # 각 피클 내 3개의 df를 하나로 concat
    tl_pk = pk_df(tl_path) 
    vl_pk = pk_df(vl_path)
    concated_tl_pks.append(tl_pk) # 나중에 또 합칠 것에 대비해 미리 모아두기
    concated_tl_pks.append(vl_pk) # 나중에 또 합칠 것에 대비해 미리 모아두기
    
    # X_data, y_data 지정
    tl_X_data, tl_y_data, vl_X_data, vl_y_data = df_Xy_split(tl_pk, vl_pk)
    
    # 데이터 분할(TL 데이터로만 학습))
    X_train, X_test, y_train, y_test = train_test_split(tl_X_data, tl_y_data,
                                                        test_size=0.1,
                                                        shuffle=True,
                                                        stratify=tl_y_data,
                                                        random_state=777
                                                        )

    # 모델 학습 

    # mnb = MultinomialNB 객체 생성
    mnb = MultinomialNB()
    
    # BernoulliNB 하이퍼파라미터 지정
    mnb_params = {'alpha' : np.arange(0.1, 2.5, 0.2),
                'fit_prior' : ['True', 'False']}
    
    # 성능지표 recall 커스텀
    recall = make_scorer(score_func=recall_score, greater_is_better=True, needs_proba=False, pos_label=1)
    
    # grid
    mnb_grid = GridSearchCV(estimator=mnb, param_grid=mnb_params,
                            scoring = recall,
                            n_jobs=2,
                            cv=3,
                            verbose=0)
    
    # 모델 학습
    mnb_grid.fit(X_train, y_train)
    y_pred = mnb_grid.predict(vl_X_data)
    
    # 평과 결과(Test)
    mnb_result_best_params_.append(mnb_grid.best_params_)
    mnb_result_best_recall.append(mnb_grid.best_score_)

    # 평과 결과(Validation)
    vl_accuracy.append(accuracy_score(y_pred, vl_y_data))
    vl_recall.append(recall_score(y_pred, vl_y_data, pos_label=1))
    vl_precision.append(precision_score(y_pred, vl_y_data, pos_label=1))
    vl_f1.append(f1_score(y_pred, vl_y_data, pos_label=1))
    
    
    
                           
                                                        
    # break # 1개 피클만 보기

mnb_results = pd.DataFrame({'cat':pt_cat_list,
                           'best_params_': mnb_result_best_params_,
                           'best_recall(test)': mnb_result_best_recall,
                           'accuracy(val)' : vl_accuracy,
                           'f1(val)' : vl_f1,
                           'precision(val)' : vl_precision,
                           'recall(val)' : vl_recall
})

# 결과 저장
fii =  open('results/MultinomialNB_results.pickle', 'wb')
pickle.dump(mnb_results, fii)
fii.close()

카테고리 순회 중: 14it [06:02, 25.91s/it]


### 결과 확인

In [42]:
# 결과 불러오기
fi2 = open('results/BernoulliNB_results.pickle', 'rb')
bnb_results = pickle.load(fi2)
fi2.close()

f2 = open('results/MultinomialNB_results.pickle', 'rb')
mnb_results = pickle.load(f2)
f2.close()

In [43]:
bnb_results

Unnamed: 0,cat,best_params_,best_recall(test),accuracy(val),f1(val),precision(val),recall(val)
0,Part1_PO,"{'alpha': 2.3000000000000007, 'fit_prior': 'Tr...",0.739102,0.748222,0.759744,0.777381,0.74289
1,Part2_IS,"{'alpha': 0.1, 'fit_prior': 'True'}",0.759339,0.871884,0.865753,0.844565,0.888032
2,Part2_GB,"{'alpha': 0.30000000000000004, 'fit_prior': 'T...",0.619135,0.738434,0.721564,0.724989,0.718171
3,Part1_SO,"{'alpha': 2.3000000000000007, 'fit_prior': 'Tr...",0.865645,0.501962,0.566231,0.659556,0.496043
4,Part1_LC,"{'alpha': 2.3000000000000007, 'fit_prior': 'Tr...",0.638104,0.80344,0.842553,0.958948,0.751355
5,Part1_EC,"{'alpha': 0.30000000000000004, 'fit_prior': 'T...",0.794644,0.724748,0.754288,0.859853,0.671809
6,Part2_ET,"{'alpha': 0.7000000000000001, 'fit_prior': 'Tr...",0.814982,0.862943,0.863955,0.900388,0.830355
7,Part2_LC,"{'alpha': 2.3000000000000007, 'fit_prior': 'Tr...",0.584756,0.717153,0.703192,0.610292,0.829452
8,Part2_SO,"{'alpha': 0.1, 'fit_prior': 'True'}",0.696904,0.794473,0.776546,0.746632,0.808956
9,Part1_ET,"{'alpha': 0.30000000000000004, 'fit_prior': 'T...",0.675766,0.567207,0.622627,0.734797,0.540168


In [75]:
bnb_alpha = []
bnb_fit_prior = []

for row in bnb_results['best_params_']:
    bnb_alpha.append(row['alpha'])
    bnb_fit_prior.append(row['fit_prior'])

bnb_params_result = pd.DataFrame({
    'cat' : list(bnb_results['cat']),
    'alpha' : bnb_alpha,
    'fit_prior' : bnb_fit_prior
})

# BernoulliNB의 파트, 카테고리 별 최적의 하이퍼 파라미터
bnb_params_result = bnb_params_result.sort_values(by='cat', ignore_index=True)
pd.concat([bnb_results, bnb_params_result.iloc[:, 1:]], axis=1)

Unnamed: 0,cat,best_params_,best_recall(test),accuracy(val),f1(val),precision(val),recall(val),alpha,fit_prior
0,Part1_PO,"{'alpha': 2.3000000000000007, 'fit_prior': 'Tr...",0.739102,0.748222,0.759744,0.777381,0.74289,0.3,True
1,Part2_IS,"{'alpha': 0.1, 'fit_prior': 'True'}",0.759339,0.871884,0.865753,0.844565,0.888032,0.3,True
2,Part2_GB,"{'alpha': 0.30000000000000004, 'fit_prior': 'T...",0.619135,0.738434,0.721564,0.724989,0.718171,0.1,True
3,Part1_SO,"{'alpha': 2.3000000000000007, 'fit_prior': 'Tr...",0.865645,0.501962,0.566231,0.659556,0.496043,2.3,True
4,Part1_LC,"{'alpha': 2.3000000000000007, 'fit_prior': 'Tr...",0.638104,0.80344,0.842553,0.958948,0.751355,2.3,True
5,Part1_EC,"{'alpha': 0.30000000000000004, 'fit_prior': 'T...",0.794644,0.724748,0.754288,0.859853,0.671809,2.3,True
6,Part2_ET,"{'alpha': 0.7000000000000001, 'fit_prior': 'Tr...",0.814982,0.862943,0.863955,0.900388,0.830355,2.3,True
7,Part2_LC,"{'alpha': 2.3000000000000007, 'fit_prior': 'Tr...",0.584756,0.717153,0.703192,0.610292,0.829452,2.3,True
8,Part2_SO,"{'alpha': 0.1, 'fit_prior': 'True'}",0.696904,0.794473,0.776546,0.746632,0.808956,0.7,True
9,Part1_ET,"{'alpha': 0.30000000000000004, 'fit_prior': 'T...",0.675766,0.567207,0.622627,0.734797,0.540168,0.3,True


: 

In [73]:
bnb_pt1 = bnb_params_result[:7].copy()
bnb_pt2 = bnb_params_result[7:].copy()

bnb_pt1['part'] = 'Part1'
bnb_pt2['part'] = 'Part2'

bnb_pt1['cat'] = ['EC', 'ET', 'GB', 'IS', 'LC', 'PO', 'SO']
bnb_pt2['cat'] = ['EC', 'ET', 'GB', 'IS', 'LC', 'PO', 'SO']

bnb_concat = pd.concat([bnb_pt1, bnb_pt2])
# bnb_concat.groupby('cat')
bnb_concat

Unnamed: 0,cat,alpha,fit_prior,part
0,EC,0.3,True,Part1
1,ET,0.3,True,Part1
2,GB,0.1,True,Part1
3,IS,2.3,True,Part1
4,LC,2.3,True,Part1
5,PO,2.3,True,Part1
6,SO,2.3,True,Part1
7,EC,2.3,True,Part2
8,ET,0.7,True,Part2
9,GB,0.3,True,Part2


In [44]:
mnb_results

Unnamed: 0,cat,best_params_,best_recall(test),accuracy(val),f1(val),precision(val),recall(val)
0,Part1_PO,"{'alpha': 0.5000000000000001, 'fit_prior': 'Tr...",0.593112,0.532819,0.382487,0.28254,0.591854
1,Part2_IS,"{'alpha': 0.1, 'fit_prior': 'True'}",0.712714,0.84396,0.832383,0.7921,0.876982
2,Part2_GB,"{'alpha': 0.1, 'fit_prior': 'True'}",0.602245,0.737319,0.717703,0.714279,0.72116
3,Part1_SO,"{'alpha': 0.30000000000000004, 'fit_prior': 'T...",0.825836,0.49229,0.59612,0.760239,0.490279
4,Part1_LC,"{'alpha': 2.3000000000000007, 'fit_prior': 'Tr...",0.744733,0.735391,0.799232,0.960332,0.684418
5,Part1_EC,"{'alpha': 0.7000000000000001, 'fit_prior': 'Tr...",0.825456,0.727411,0.761445,0.885405,0.667932
6,Part2_ET,"{'alpha': 0.30000000000000004, 'fit_prior': 'T...",0.769202,0.867763,0.863929,0.868532,0.859374
7,Part2_LC,"{'alpha': 2.3000000000000007, 'fit_prior': 'Tr...",0.534112,0.709721,0.683402,0.570653,0.851675
8,Part2_SO,"{'alpha': 0.1, 'fit_prior': 'True'}",0.692268,0.794727,0.775039,0.739278,0.814435
9,Part1_ET,"{'alpha': 0.7000000000000001, 'fit_prior': 'Tr...",0.694876,0.594312,0.634361,0.72428,0.564304
