# Vectorize the preprocessed text with CountVectorizer (token-count features),
# reduce the dimensionality with TruncatedSVD,
# then fit a RandomForest model on the reduced features.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
# Load the training set from train_data.csv and preview its first rows.
train = pd.read_csv('train_data.csv')
train.head()

Unnamed: 0,html,label
0,Best Financial Service - #1 shop to ear...,1
1,"bitcoin, bitcoin generator, free bitcoin ...",1
2,Underground Market - Prepaid & Cloned Cards...,1
3,Stolen Cards | Plastic Sharks ...,1
4,Best Amazon Gift Card ...,0


In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2494 entries, 0 to 2493
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   html    2494 non-null   object
 1   label   2494 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 39.1+ KB


In [None]:
# Split the frame into the raw HTML text and its label.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because later cells read it.
input, target = train['html'], train['label']

# Report the size of each column.
for caption, column in (('html개수 확인 : ', input), ('label개수 확인 : ', target)):
    print(caption, len(column))

html개수 확인 :  2494
label개수 확인 :  2494


In [None]:
# Fetch the NLTK corpora used below.
nltk.download('words')
nltk.download('stopwords')

# Everything here is loop-invariant, so it is built exactly once:
#   words             - English dictionary used to drop non-dictionary tokens
#   english_stopwords - stop words removed before stemming
#   ps                - the stemmer
#   non_token_chars   - runs of characters other than letters, digits, '$' and '.'
words = set(nltk.corpus.words.words())
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()
non_token_chars = re.compile(r'[^a-zA-Z0-9\$.]+')

# Clean every training document; iterating the Series directly avoids a
# hard-coded row count.
processed_html = []
for a in input:
    # Replace every run of disallowed characters with a single space.
    sent = non_token_chars.sub(' ', str(a))
    # Keep dictionary words plus any token that is not purely alphabetic
    # (numbers, '$', '.'), lower-cased.
    kept = [w.lower() for w in nltk.wordpunct_tokenize(sent)
            if w.lower() in words or not w.isalpha()]
    # Drop stop words, stem what remains and rebuild a space-separated string.
    edit = ' '.join(ps.stem(word) for word in kept
                    if word not in english_stopwords)
    processed_html.append(edit)
len(processed_html)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


2494

In [None]:
# Fit a bag-of-words vectorizer on the cleaned training documents and turn
# them into a document-term count matrix.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()   # author's note: unsure how many features to keep; the fitted vocabulary came out at 28417 features
feature = cv.fit_transform(processed_html)

In [None]:
# Reduce the count matrix with TruncatedSVD.
# n_components: number of dimensions to keep (10 was chosen arbitrarily).

from sklearn.decomposition import TruncatedSVD
# random_state is fixed so that rerunning the notebook yields the same
# components, in line with the seeded RandomForest fitted on these features.
svd = TruncatedSVD(n_components=10, random_state=42)
feature_svd = svd.fit_transform(feature)
feature_svd

array([[ 2.41695903e+01,  1.03224412e+01,  4.37997032e+00, ...,
        -1.95832826e-01,  1.53602757e-01, -3.13363094e-01],
       [ 6.74487181e+00,  1.03234811e+01,  5.63317501e+00, ...,
        -7.66341501e-01,  6.32893162e-01,  1.67347411e+00],
       [ 2.49337877e+00,  6.65461089e-01,  3.11246469e-01, ...,
        -4.45776101e-02, -3.28339195e-02, -9.78788885e-02],
       ...,
       [ 3.00257461e+00,  5.12791958e+00,  2.42175882e+00, ...,
        -2.48173490e-01, -9.85548654e-01,  1.32198582e-01],
       [ 1.88867419e+00,  2.53574160e+00,  4.16806159e+00, ...,
         3.90174003e-02, -4.39137813e-02,  3.96869734e-02],
       [ 9.99741803e+00,  1.27833365e+01,  4.81117264e+01, ...,
        -1.67855140e+00,  3.55940716e-01, -3.81358326e+00]])

In [None]:
# Attach each SVD component to the training frame as its own column.
# The number of columns is read from the array so this cell keeps working
# when n_components is changed.
for i in range(feature_svd.shape[1]):
    train[f'feature_svd_{i}'] = feature_svd[:, i]
train

Unnamed: 0,html,label,feature_svd_0,feature_svd_1,feature_svd_2,feature_svd_3,feature_svd_4,feature_svd_5,feature_svd_6,feature_svd_7,feature_svd_8,feature_svd_9
0,Best Financial Service - #1 shop to ear...,1,24.169590,10.322441,4.379970,-7.931387,-17.228541,-0.869364,3.955955,-0.195833,0.153603,-0.313363
1,"bitcoin, bitcoin generator, free bitcoin ...",1,6.744872,10.323481,5.633175,-5.965530,-5.631383,-0.958345,5.563555,-0.766342,0.632893,1.673474
2,Underground Market - Prepaid & Cloned Cards...,1,2.493379,0.665461,0.311246,-0.615377,-1.126641,0.007718,0.683853,-0.044578,-0.032834,-0.097879
3,Stolen Cards | Plastic Sharks ...,1,18.885118,19.699114,10.131171,-13.915462,-29.644879,1.673396,7.728109,-0.185623,-0.914799,-1.325259
4,Best Amazon Gift Card ...,0,3.562276,2.442382,1.132326,-1.569324,-2.963843,-0.262613,0.907963,-0.107837,0.001884,-0.034024
...,...,...,...,...,...,...,...,...,...,...,...,...
2489,ProPublica ‚Äî Investigative Journalism and Ne...,0,6.033250,12.776603,5.520831,-10.001085,0.208814,1.659864,10.673589,0.411079,-1.238644,0.512924
2490,BBC - Homepage ...,0,3.618474,4.302534,1.952775,-2.124354,-3.120394,0.905137,2.142270,4.354717,0.076533,0.106126
2491,XMPP.is - Communicate Freely XMPP.is - Communi...,0,3.002575,5.127920,2.421759,-4.505171,-1.100021,1.399999,4.456515,-0.248173,-0.985549,0.132199
2492,Scam List of Tor Scam List (http://yjhnb34...,0,1.888674,2.535742,4.168062,1.030720,-1.464161,1.002995,0.346583,0.039017,-0.043914,0.039687


In [None]:
# Model input: everything except the raw text and the target column.
train_2 = train.drop(['html', 'label'], axis=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 1000 entropy-criterion trees with a fixed seed, fitted on the SVD features
# against the training labels.
rfc_params = dict(n_estimators=1000, criterion='entropy', random_state=42)
rfc = RandomForestClassifier(**rfc_params)
rfc.fit(train_2, train['label'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [None]:
# Load the evaluation set from test_data.csv and preview its first rows.
test = pd.read_csv('test_data.csv')
test.head()

Unnamed: 0,html,label
0,"eCash Cards: trusted, automatic Visa cre...",1.0
1,Cash Machine For Everybody - Easy to u...,1.0
2,netAuth You are connected throu...,1.0
3,QF Market - Fast Transfers QF...,1.0
4,[OFFICIAL & ORIGINAL] BITCOIN x200 SERV...,1.0


In [None]:
# Convert every label to an integer (the preview above shows them as 1.0),
# then inspect the resulting dtypes.
test['label'] = test['label'].map(int)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1726 entries, 0 to 1725
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   html    1726 non-null   object
 1   label   1726 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 27.1+ KB


In [None]:
# Raw HTML text of the evaluation set.
input_2 = test['html']

In [None]:
# Fetch the NLTK corpora used below.
nltk.download('words')
nltk.download('stopwords')

# Everything here is loop-invariant, so it is built exactly once:
#   words            - English dictionary used to drop non-dictionary tokens
#   stop_words_2     - stop words removed before stemming
#   ps_2             - the stemmer
#   token_cleaner_2  - runs of characters other than letters, digits, '$' and '.'
words = set(nltk.corpus.words.words())
stop_words_2 = set(stopwords.words('english'))
ps_2 = PorterStemmer()
token_cleaner_2 = re.compile(r'[^a-zA-Z0-9\$.]+')

# Clean every evaluation document; iterating the Series directly avoids a
# hard-coded row count.
processed_html_score = []
for a in input_2:
    # Replace every run of disallowed characters with a single space.
    sent_2 = token_cleaner_2.sub(' ', str(a))
    # Keep dictionary words plus any token that is not purely alphabetic
    # (numbers, '$', '.'), lower-cased.
    kept_2 = [w.lower() for w in nltk.wordpunct_tokenize(sent_2)
              if w.lower() in words or not w.isalpha()]
    # Drop stop words, stem what remains and rebuild a space-separated string.
    edit_2 = ' '.join(ps_2.stem(word) for word in kept_2
                      if word not in stop_words_2)
    processed_html_score.append(edit_2)
len(processed_html_score)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1726

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

# Encode the evaluation documents with the vectorizer that was fitted on the
# training corpus (`cv`). Fitting a new vectorizer here would build a
# different vocabulary, so column j would no longer mean the same token as
# in the training matrix.
feature_score = cv.transform(processed_html_score)

# Project with the SVD that was fitted on the training matrix (`svd`).
# transform() is used, not fit_transform(): refitting would both overwrite
# the training components and place the evaluation rows in a different space
# from the one the classifier was trained on.
feature_score_svd = svd.transform(feature_score)
feature_score_svd

array([[ 6.25607582e-03,  1.94849788e-01,  8.20655288e-01, ...,
         1.86600145e+00, -2.54377374e-01, -5.46374552e-01],
       [ 7.36421421e-02,  2.35928304e+00,  2.03077052e+00, ...,
         1.74946646e+00, -9.73162402e-01, -1.89033916e+00],
       [ 2.68520709e-02,  8.23124452e-01,  8.26091861e-01, ...,
        -4.77226087e-01,  1.68466208e+00,  4.49281311e-02],
       ...,
       [ 2.89467648e-01,  2.75155363e+00,  1.15289207e+00, ...,
        -4.03435234e+00,  1.86951434e+00,  2.64730590e+00],
       [ 9.89110633e-02,  3.06631357e-01,  9.77732055e-01, ...,
         2.54722388e-01,  1.03213728e+00, -7.00635701e-01],
       [ 1.41109871e-01,  1.25897560e+00,  2.81661792e+01, ...,
        -9.28189248e-01, -1.73750326e-01,  4.02124479e+00]])

In [None]:
# Attach each SVD component to the evaluation frame as its own column.
# The columns use the same `feature_svd_{i}` names as the training frame, so
# the frame handed to predict() carries the feature names seen during fit().
# The number of columns is read from the array, not hard-coded.
for i in range(feature_score_svd.shape[1]):
    test[f'feature_svd_{i}'] = feature_score_svd[:, i]
test

Unnamed: 0,html,label,feature_score_svd_0,feature_score_svd_1,feature_score_svd_2,feature_score_svd_3,feature_score_svd_4,feature_score_svd_5,feature_score_svd_6,feature_score_svd_7,feature_score_svd_8,feature_score_svd_9
0,"eCash Cards: trusted, automatic Visa cre...",1,0.006256,0.194850,0.820655,-0.332373,3.601449,-1.309849,-0.046210,1.866001,-0.254377,-0.546375
1,Cash Machine For Everybody - Easy to u...,1,0.073642,2.359283,2.030771,-0.758960,13.992111,-5.841993,-0.639072,1.749466,-0.973162,-1.890339
2,netAuth You are connected throu...,1,0.026852,0.823124,0.826092,-0.360067,2.855116,-0.473443,-0.434230,-0.477226,1.684662,0.044928
3,QF Market - Fast Transfers QF...,1,0.206035,1.457255,3.778670,-1.385803,15.817454,-8.844995,-0.021044,6.464926,-0.964968,0.971276
4,[OFFICIAL & ORIGINAL] BITCOIN x200 SERV...,1,0.100380,1.436502,29.274289,-9.196594,6.143077,-4.196635,3.548706,-1.458668,2.960297,-7.408584
...,...,...,...,...,...,...,...,...,...,...,...,...
1721,ï»¿ 100x Your Coins in 24 Hours - Officiall...,1,0.628891,14.121510,780.837526,1609.203810,-27.442034,-37.426408,-4.654625,-7.326124,-5.592077,-6.759609
1722,Clone Credit Card - Shop CC 100% ...,1,0.609701,4.893413,6.864567,-1.740160,68.030042,-30.063011,-5.780868,12.409267,-16.595556,-14.025428
1723,lolitas ...,1,0.289468,2.751554,1.152892,-0.025098,6.029364,-3.297877,-2.334820,-4.034352,1.869514,2.647306
1724,Apple Market - Stolen & Carded Merchandis...,1,0.098911,0.306631,0.977732,-0.185827,1.863276,-0.825463,-0.377029,0.254722,1.032137,-0.700636


In [None]:
# Keep only the feature columns of the evaluation frame and classify them
# with the fitted forest.
test_2 = test.drop(['html', 'label'], axis=1)
result = rfc.predict(test_2)
result

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Write the predictions to rfc.csv as a single column, without the index.
res = pd.DataFrame(result)
res.to_csv('rfc.csv',index=False)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=test['label'], y_pred=result)
print(cm)

[[ 341    6]
 [1372    7]]


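# Accuracy did not change even with the SVD preprocessing step: about 20%, i.e. (341 + 7) / 1726 from the confusion matrix above.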