# 전처리 과정에서 CountVectorizer가 아닌 TF-IDF 기법 적용
# 단어 빈도 * 역문서 빈도의 값으로 one-hot encoding 추출
* 단어 빈도 : 한 html에서 특정 단어가 몇 번 나왔는지?
* 역문서 빈도 : 특정단어가 등장한 html이 몇개인지?
* 표현식 = 단어빈도 X (1/역문서빈도)
* 참고 문서 : https://wikidocs.net/31698 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
train = pd.read_csv('train_data.csv')
train.head()

Unnamed: 0,html,label,path
0,Best Financial Service - #1 shop to ear...,1,222222222xn2ozdb2mjnkjrvcopf5thb6la6yj24jvyjqr...
1,"bitcoin, bitcoin generator, free bitcoin ...",1,22222uswoye6ve7ixbgkwlvynjrvsg4od2qrs6zs5pbtpr...
2,Underground Market - Prepaid & Cloned Cards...,1,2222fxq4xfkvilzdihu5ybce7ztf66fr6c7ub3enabg5iy...
3,Stolen Cards | Plastic Sharks ...,1,22c7nfj32ujbnymoo2zh64il46j3k2vuo7kryj757hkhpa...
4,Best Amazon Gift Card ...,0,22cwxace6a4cu2yzti4i2x2gikl4wpqr3nz36jlpxoosgi...


In [3]:
train['label'] = train['label'].apply(lambda x: int(x))
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2494 entries, 0 to 2493
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   html    2494 non-null   object
 1   label   2494 non-null   int64 
 2   path    178 non-null    object
dtypes: int64(1), object(2)
memory usage: 58.6+ KB


In [4]:
# label과 html 분리
input = train['html']
target = train['label']
print('html개수 확인 : ',len(input))
print('label개수 확인 : ',len(target))

html개수 확인 :  2494
label개수 확인 :  2494


In [5]:
'''
nltk.download('words')
nltk.download('stopwords')
words = set(nltk.corpus.words.words())
processed_html = []
for num in range(2494):
  a = input[num]
  sent = re.sub('[^a-zA-Z0-9\$.]+', ' ', str(a))
  edit = " ".join(w for w in nltk.wordpunct_tokenize(sent) \
          if w.lower() in words or not w.isalpha())  
  edit = edit.lower()
  edit = edit.split()
  ps = PorterStemmer()
  edit = [ps.stem(word) for word in edit if not word in set(stopwords.words('english'))]
  edit = ' '.join(edit)
  processed_html.append(edit)
len(processed_html)
'''
nltk.download('words')
nltk.download('stopwords')
words = set(nltk.corpus.words.words())
processed_html = []
for num in range(len(train['html'])):
  a = input[num]
  sent = re.sub(r'(\dBTC)',' BTC ',a)
  sent_2 = re.sub('[^a-zA-Z0-9\$.]+', ' ', sent)
  #edit = " ".join(w for w in nltk.wordpunct_tokenize(sent) \
  #        if w.lower() in words or not w.isalpha())                     ---> nltk모듈의 corpus 사전 내에 없는 단어를 제거하는 로직인데 은어들이 전부 제거되어 주석처리 했습니다 
  edit = sent_2.lower()
  edit = edit.split()
  ps = PorterStemmer()
  edit = [ps.stem(word) for word in edit if not word in set(stopwords.words('english'))]
  edit = ' '.join(edit)
  processed_html.append(edit)
len(processed_html)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


2494

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer()  
feature = tfid.fit_transform(processed_html).toarray()

In [32]:
# TF-IDF 전처리 샘플
feature[0]

array([0.        , 0.01641774, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_valid,y_train,y_valid = train_test_split(feature,target,test_size=0.2,random_state=42,stratify=target)

In [8]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs=-1,random_state=42,n_estimators=5000,criterion='gini',max_depth=5)
rfc.fit(X_train,y_train)
pred = rfc.predict(X_valid)

In [9]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_valid,pred)
print(cm)

[[238   7]
 [ 70 184]]


In [10]:
# 채점 시작
test = pd.read_csv('test_data.csv')
test['label'] = test['label'].apply(lambda x: int(x))
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1726 entries, 0 to 1725
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   html    1726 non-null   object
 1   label   1726 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 27.1+ KB


In [11]:
input_2 = test['html']

In [12]:
words = set(nltk.corpus.words.words())
processed_html_score = []
for num in range(len(input_2)):
  a = input_2[num]
  sent = re.sub(r'(\dBTC)',' BTC ',a)
  sent_2 = re.sub('[^a-zA-Z0-9\$.]+', ' ', sent)
  edit_2 = sent_2.lower()
  edit_2 = edit_2.split()
  ps_2 = PorterStemmer()
  edit_2 = [ps_2.stem(word) for word in edit_2 if not word in set(stopwords.words('english'))]
  edit_2 = ' '.join(edit_2)
  processed_html_score.append(edit_2)
len(processed_html_score)

1726

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid_2 = TfidfVectorizer(max_features=68690)  
feature_score = tfid_2.fit_transform(processed_html_score).toarray()

In [16]:
result = rfc.predict(feature_score)
cm_2 = confusion_matrix(test['label'],result)
print(cm_2)

[[ 347    0]
 [1379    0]]


In [17]:
np.unique(result,return_counts=True)
# 전부 0으로 찍어버림.....

(array([0]), array([1726]))

# 전처리 및 모델링 결과 전부 0으로 찍어버리는 현상 발생
# 결과가 의미가 없음