In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

# svd(차원축소)를 적용한 전처리(cb/xgb)
* CatBoost(cb) : RandomForest보다 분류문제에 더 최적화된 모델
* XGBoost(xgb) : CatBoost와 같은 그래디언트 부스팅 결정 트리(GBDT) 계열 모델 (RandomForest처럼 여러 트리를 앙상블하지만, 트리를 순차적으로 학습하며 이전 트리의 오차를 보정한다)

In [None]:
# Load the labelled training set. Per the recorded head() output the frame has an
# 'html' text column, a 'label' column and an 'Unnamed: 0' column
# (presumably a row index saved into the CSV — TODO confirm).
train = pd.read_csv('train_data.csv')
train.head()

Unnamed: 0,html,label
0,Best Financial Service - #1 shop to ear...,1
1,"bitcoin, bitcoin generator, free bitcoin ...",1
2,Underground Market - Prepaid & Cloned Cards...,1
3,Stolen Cards | Plastic Sharks ...,1
4,Best Amazon Gift Card ...,0


In [None]:
# Separate the raw page text from its class label.
# NOTE: `input` shadows the Python builtin; the name is kept because later
# cells read it.
input, target = train['html'], train['label']

# Sanity check: both series must have the same number of rows.
for caption, column in (('html개수 확인 : ', input), ('label개수 확인 : ', target)):
  print(caption, len(column))

html개수 확인 :  2494
label개수 확인 :  2494


In [None]:
# Clean every training document: keep dictionary words and non-alphabetic
# tokens, lowercase, drop English stop words, then Porter-stem what is left.
nltk.download('words')
nltk.download('stopwords')
words = set(nltk.corpus.words.words())

# Loop-invariant helpers are built once instead of once per token/document.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
# Anything that is not a letter, digit, '$' or '.' becomes a single space.
non_text = re.compile(r'[^a-zA-Z0-9\$.]+')

processed_html = []
# Iterate over the series itself so the cell works for any number of rows.
for a in input:
  sent = non_text.sub(' ', str(a))
  # Keep English dictionary words plus any token that is not purely
  # alphabetic (numbers, '$', '.'), lowercased.
  edit = [w.lower() for w in nltk.wordpunct_tokenize(sent)
          if w.lower() in words or not w.isalpha()]
  edit = [ps.stem(word) for word in edit if word not in stop_words]
  processed_html.append(' '.join(edit))
len(processed_html)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


2494

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over the cleaned training text; the vocabulary is learned here.
cv = CountVectorizer()   
# `feature` is the document-term count matrix for the training documents.
feature = cv.fit_transform(processed_html)

In [None]:
from sklearn.decomposition import TruncatedSVD
# Reduce the bag-of-words matrix to 5 latent components.
# TruncatedSVD's default solver is randomized, so the seed is fixed to make the
# components (and everything trained on them) reproducible across runs.
svd = TruncatedSVD(n_components=5, random_state=42)
feature_svd = svd.fit_transform(feature)
feature_svd

array([[ 2.41695904e+01,  1.03224351e+01,  4.37995317e+00, ...,
        -1.95408390e-01,  1.51243348e-01, -3.11097026e-01],
       [ 6.74487164e+00,  1.03234754e+01,  5.63324025e+00, ...,
        -7.65921537e-01,  6.35988371e-01,  1.68473607e+00],
       [ 2.49337877e+00,  6.65461991e-01,  3.11249446e-01, ...,
        -4.45819086e-02, -3.29856772e-02, -9.79370151e-02],
       ...,
       [ 3.00257451e+00,  5.12793510e+00,  2.42181335e+00, ...,
        -2.48688259e-01, -9.87542184e-01,  1.36979746e-01],
       [ 1.88867419e+00,  2.53574201e+00,  4.16806366e+00, ...,
         3.90222812e-02, -4.38897320e-02,  3.99039477e-02],
       [ 9.99741802e+00,  1.27833318e+01,  4.81117094e+01, ...,
        -1.67828260e+00,  3.56150729e-01, -3.81394522e+00]])

In [None]:
# Attach every SVD component to the training frame as its own column.
# The column count is taken from the array so it always follows n_components
# instead of repeating the number by hand.
for i in range(feature_svd.shape[1]):
  train[f'feature_svd_{i}'] = feature_svd[:,i]
# Preview only the first rows rather than dumping the whole frame.
train.head()

Unnamed: 0,html,label,feature_svd_0,feature_svd_1,feature_svd_2,feature_svd_3,feature_svd_4,feature_svd_5,feature_svd_6,feature_svd_7,feature_svd_8,feature_svd_9
0,Best Financial Service - #1 shop to ear...,1,24.169590,10.322435,4.379953,-7.931377,-17.228528,-0.869559,3.955891,-0.195408,0.151243,-0.311097
1,"bitcoin, bitcoin generator, free bitcoin ...",1,6.744872,10.323475,5.633240,-5.965552,-5.631810,-0.957683,5.563780,-0.765922,0.635988,1.684736
2,Underground Market - Prepaid & Cloned Cards...,1,2.493379,0.665462,0.311249,-0.615375,-1.126637,0.007740,0.683859,-0.044582,-0.032986,-0.097937
3,Stolen Cards | Plastic Sharks ...,1,18.885118,19.699151,10.131295,-13.915318,-29.644749,1.674167,7.728204,-0.186353,-0.918245,-1.318088
4,Best Amazon Gift Card ...,0,3.562276,2.442381,1.132323,-1.569324,-2.963843,-0.262643,0.907956,-0.107614,0.001304,-0.033936
...,...,...,...,...,...,...,...,...,...,...,...,...
2489,ProPublica ‚Äî Investigative Journalism and Ne...,0,6.033250,12.776641,5.520962,-10.001013,0.209001,1.660653,10.673456,0.409469,-1.241584,0.517838
2490,BBC - Homepage ...,0,3.618474,4.302551,1.952835,-2.124305,-3.120299,0.905468,2.142140,4.354063,0.074103,0.109419
2491,XMPP.is - Communicate Freely XMPP.is - Communi...,0,3.002575,5.127935,2.421813,-4.505153,-1.099936,1.400312,4.456452,-0.248688,-0.987542,0.136980
2492,Scam List of Tor Scam List (http://yjhnb34...,0,1.888674,2.535742,4.168064,1.030718,-1.464161,1.003012,0.346602,0.039022,-0.043890,0.039904


In [None]:
# Model inputs: keep only the SVD components.
# 'Unnamed: 0' is also dropped: it appears to be a row id, not a property of
# the page, and leaving it in lets the models learn from row order.
train_2 = train.drop(columns=['Unnamed: 0','html','label'])

In [None]:
# Hold out 20% of the rows for validation, stratified on the label so both
# parts keep the same class balance; the seed fixes the partition.
labels = train['label']
X_train, X_valid, y_train, y_valid = train_test_split(
    train_2,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier on the SVD features, monitored on the validation split.
# NOTE(review): in the recorded log the validation loss is still decreasing at
# the last iteration, so this learning rate may be too small for the number of
# iterations used — confirm before relying on the model.
cbc = CatBoostClassifier(
    max_depth=9,
    learning_rate=1e-4,
)
cbc.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    verbose=100,  # print the metrics every 100 iterations
)

0:	learn: 0.6930698	test: 0.6930772	best: 0.6930772 (0)	total: 21ms	remaining: 21s
100:	learn: 0.6849682	test: 0.6860539	best: 0.6860539 (100)	total: 1.57s	remaining: 14s
200:	learn: 0.6770733	test: 0.6792353	best: 0.6792353 (200)	total: 3.11s	remaining: 12.4s
300:	learn: 0.6693403	test: 0.6725797	best: 0.6725797 (300)	total: 4.63s	remaining: 10.7s
400:	learn: 0.6618272	test: 0.6660951	best: 0.6660951 (400)	total: 6.18s	remaining: 9.23s
500:	learn: 0.6544807	test: 0.6597687	best: 0.6597687 (500)	total: 7.7s	remaining: 7.67s
600:	learn: 0.6472065	test: 0.6535192	best: 0.6535192 (600)	total: 9.22s	remaining: 6.12s
700:	learn: 0.6401092	test: 0.6474492	best: 0.6474492 (700)	total: 10.8s	remaining: 4.59s
800:	learn: 0.6332476	test: 0.6415565	best: 0.6415565 (800)	total: 12.3s	remaining: 3.05s
900:	learn: 0.6265120	test: 0.6357813	best: 0.6357813 (900)	total: 13.8s	remaining: 1.52s
999:	learn: 0.6199731	test: 0.6301427	best: 0.6301427 (999)	total: 15.3s	remaining: 0us

bestTest = 0.63014270

<catboost.core.CatBoostClassifier at 0x7f3d068cb790>

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier on the same SVD features.
# n_estimators is only an upper bound: training stops once the validation
# metric has not improved for 20 rounds.
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU runtime — confirm.
xgb_params = dict(
    learning_rate=0.1,
    tree_method='gpu_hist',
    n_estimators=10000,
    colsample_bytree=0.75,
    subsample=0.75,
    max_depth=7,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=10,
    early_stopping_rounds=20,
)

[0]	validation_0-error:0.230461
Will train until validation_0-error hasn't improved in 20 rounds.
[10]	validation_0-error:0.142285
[20]	validation_0-error:0.132265
[30]	validation_0-error:0.138277
Stopping. Best iteration:
[16]	validation_0-error:0.126253



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.75, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=10000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.75, tree_method='gpu_hist', verbosity=1)

In [None]:
# Load the held-out test set used for the final evaluation.
test = pd.read_csv('test_data.csv')

In [None]:
# Make sure the labels are plain 64-bit integers; a vectorized cast replaces the
# per-row Python lambda.
test['label'] = test['label'].astype('int64')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1726 entries, 0 to 1725
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   html    1726 non-null   object
 1   label   1726 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 27.1+ KB


In [None]:
# Raw page text of the test set; cleaned in the next cell.
input_2 = test['html']

In [None]:
# Clean every test document: keep dictionary words and non-alphabetic tokens,
# lowercase, drop English stop words, then Porter-stem what is left.
nltk.download('words')
nltk.download('stopwords')
words = set(nltk.corpus.words.words())

# Loop-invariant helpers are built once instead of once per token/document.
stop_words_2 = set(stopwords.words('english'))
ps_2 = PorterStemmer()
# Anything that is not a letter, digit, '$' or '.' becomes a single space.
non_text_2 = re.compile(r'[^a-zA-Z0-9\$.]+')

processed_html_score = []
# Iterate over the series itself so the cell works for any number of rows.
for a in input_2:
  sent_2 = non_text_2.sub(' ', str(a))
  # Keep English dictionary words plus any token that is not purely
  # alphabetic (numbers, '$', '.'), lowercased.
  edit_2 = [w.lower() for w in nltk.wordpunct_tokenize(sent_2)
            if w.lower() in words or not w.isalpha()]
  edit_2 = [ps_2.stem(word) for word in edit_2 if word not in stop_words_2]
  processed_html_score.append(' '.join(edit_2))
len(processed_html_score)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1726

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorize the test text with the vectorizer that was fitted on the training
# text. Fitting a new one here would learn a different vocabulary, so column j
# would stand for a different word than it did during training.
cv_2 = cv  # alias of the fitted training vectorizer; the name is kept for later use
# toarray() makes the matrix dense; it can be dropped when a sparse matrix is enough.
feature_score = cv_2.transform(processed_html_score).toarray()

In [None]:
from sklearn.decomposition import TruncatedSVD
# Project the test documents with the SVD that was fitted on the training
# data. Refitting on the test set would produce components unrelated to the
# ones the classifiers were trained on.
svd_2 = svd  # alias of the fitted training SVD; the name is kept for later use
# Vectorize with the training vocabulary so the column layout matches what
# `svd` was fitted on, then apply the learned projection (transform only).
feature_score_svd = svd_2.transform(cv.transform(processed_html_score))
feature_score_svd

array([[ 6.25607477e-03,  1.94849705e-01,  8.20676908e-01, ...,
         1.86837125e+00, -2.54986328e-01, -5.35560287e-01],
       [ 7.36421419e-02,  2.35928286e+00,  2.03083148e+00, ...,
         1.75129122e+00, -9.78032181e-01, -1.86823803e+00],
       [ 2.68520704e-02,  8.23124341e-01,  8.26112944e-01, ...,
        -4.79808037e-01,  1.68313781e+00,  4.76827081e-02],
       ...,
       [ 2.89467657e-01,  2.75155353e+00,  1.15292025e+00, ...,
        -4.04530048e+00,  1.86526152e+00,  2.63411172e+00],
       [ 9.89110652e-02,  3.06631257e-01,  9.77752151e-01, ...,
         2.57031984e-01,  1.03099467e+00, -6.99688464e-01],
       [ 1.41109841e-01,  1.25897302e+00,  2.81668008e+01, ...,
        -9.39709499e-01, -2.29148050e-01,  3.98784581e+00]])

In [None]:
# Attach every SVD component to the test frame as its own column.
# The column count is taken from the array so it always follows the number of
# components instead of repeating it by hand.
for i in range(feature_score_svd.shape[1]):
  test[f'feature_svd_{i}'] = feature_score_svd[:,i]
# Preview only the first rows rather than dumping the whole frame.
test.head()

Unnamed: 0,html,label,feature_svd_0,feature_svd_1,feature_svd_2,feature_svd_3,feature_svd_4,feature_svd_5,feature_svd_6,feature_svd_7,feature_svd_8,feature_svd_9
0,"eCash Cards: trusted, automatic Visa cre...",1,0.006256,0.194850,0.820655,-0.332373,3.601447,-1.309870,-0.046136,1.864209,-0.254490,-0.540703
1,Cash Machine For Everybody - Easy to u...,1,0.073642,2.359283,2.030771,-0.758960,13.992111,-5.841999,-0.642397,1.749946,-0.977064,-1.880590
2,netAuth You are connected throu...,1,0.026852,0.823124,0.826092,-0.360067,2.855114,-0.473459,-0.434903,-0.479891,1.682869,0.051005
3,QF Market - Fast Transfers QF...,1,0.206035,1.457255,3.778670,-1.385802,15.817493,-8.844629,-0.014425,6.492086,-0.958437,0.941448
4,[OFFICIAL & ORIGINAL] BITCOIN x200 SERV...,1,0.100380,1.436502,29.274289,-9.196594,6.143049,-4.196955,3.549944,-1.475695,2.957873,-7.370360
...,...,...,...,...,...,...,...,...,...,...,...,...
1721,ï»¿ 100x Your Coins in 24 Hours - Officiall...,1,0.628891,14.121510,780.837526,1609.203809,-27.442047,-37.426545,-4.653334,-7.334059,-5.592087,-6.739315
1722,Clone Credit Card - Shop CC 100% ...,1,0.609701,4.893413,6.864567,-1.740160,68.030092,-30.062621,-5.782759,12.400094,-16.607501,-14.031816
1723,lolitas ...,1,0.289468,2.751554,1.152892,-0.025098,6.029366,-3.297865,-2.334952,-4.038129,1.867710,2.650553
1724,Apple Market - Stolen & Carded Merchandis...,1,0.098911,0.306631,0.977732,-0.185827,1.863275,-0.825473,-0.376902,0.253749,1.032318,-0.699050


In [None]:
# Evaluate CatBoost on the test set.
# The model was trained on the SVD feature columns, so it must be scored on the
# same columns — not on the raw count matrix `feature_score`.
# reindex() puts the test features in exactly the training column order; a
# training column that the test frame lacks becomes NaN (treated as missing).
test_2 = test.drop(columns=['html','label']).reindex(columns=X_train.columns)
result = cbc.predict(test_2)
from sklearn.metrics import confusion_matrix
# Rows are the true labels, columns the predicted labels.
cm = confusion_matrix(test['label'],result)
print(cm)

[[ 334   13]
 [1378    1]]


In [None]:
# Evaluate XGBoost on the test set using the SVD feature frame, in the column
# order it was trained on, instead of the raw count matrix `feature_score`.
# A training column that the test frame lacks becomes NaN (treated as missing).
result_2 = xgb.predict(test_2.reindex(columns=X_train.columns))
# Rows are the true labels, columns the predicted labels.
cm_2 = confusion_matrix(test['label'],result_2)
print(cm_2)

[[ 332   15]
 [1347   32]]


# 이전 버전들과 크게 다를 것 없는 정확도...