# 데이터 증대(EDA, Easy Data Augmentation) 기법 적용
#### 데이터셋은 train_data.csv 파일을 로드
#### --> 이유 : 비교적 중복이 많이 제거되어 있으므로
* 코드 출처 : https://github.com/catSirup/KorEDA/blob/master/eda.py
* 위 코드에 있는 4가지 EDA 방법 중 Random Swap 기법 적용
* Random Swap : 문장에서 임의로 고른 두 단어의 위치를 서로 맞바꾸는 과정을 n번 반복하는 기법

In [None]:
import pandas as pd
import numpy as np
import random
import re
import nltk
from nltk.corpus import stopwords, wordnet
# Download the NLTK data packages named below. Of these, only the `words`
# corpus is read by the code visible in this notebook; `stopwords` and
# `wordnet` are imported above but not used in the visible cells.
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the training set, then pull out the text column and the label column.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the augmentation loop further down reads it.
train = pd.read_csv('train_data.csv')
input, target = train['html'], train['label']
print(len(input))

2494


In [None]:
# NOTE(review): this module-level name is not read by swap_word/random_swap
# below; random_swap's `words` parameter shadows it inside that function.
words = set(nltk.corpus.words.words())       # word list bundled with NLTK, as a set
def swap_word(new_words):
    """Swap two randomly chosen, distinct positions of a token list in place.

    Args:
        new_words: Mutable list of tokens. It is modified in place.

    Returns:
        The same list object that was passed in. A list with fewer than two
        tokens is returned unchanged, because it has no pair of distinct
        positions to swap.
    """
    # Guard: random.randint(0, -1) would raise ValueError on an empty list,
    # and a single-token list has no second position to swap with.
    if len(new_words) < 2:
        return new_words

    # random.sample draws two *distinct* indices, so the swap always happens;
    # a capped retry loop could give up and silently skip the swap.
    idx_a, idx_b = random.sample(range(len(new_words)), 2)
    new_words[idx_a], new_words[idx_b] = new_words[idx_b], new_words[idx_a]
    return new_words

def random_swap(words, n):
    """Return the text `words` after applying `n` random word swaps.

    Args:
        words: Input text. It is split on whitespace into tokens.
        n: Number of times swap_word is applied to the token list.

    Returns:
        The tokens re-joined with single spaces. Because of the split/join
        round trip, runs of whitespace in the input collapse to one space and
        leading/trailing whitespace is dropped.
    """
    # str.split() already builds a fresh list, so no extra copy is needed.
    tokens = words.split()

    # Fewer than two tokens means there is nothing to swap. Returning early
    # also keeps empty or whitespace-only text away from swap_word, which
    # needs at least two positions to work with.
    if len(tokens) < 2:
        return ' '.join(tokens)

    for _ in range(n):
        tokens = swap_word(tokens)

    return ' '.join(tokens)

# Number of swap operations applied to each document; one entry per pass.
swap_n = [10, 12, 14, 16, 18]
text_aug = []

# One pass over the whole dataset per entry in swap_n (5 passes), which yields
# 5 augmented copies of every document. Document order is preserved within each
# pass. Iterating the Series values directly avoids label-based `input[doc]`
# lookups, which only work when the index is exactly 0..n-1.
for n_swaps in swap_n:
    for html_text in input:
        text_aug.append(random_swap(html_text, n_swaps))
len(text_aug)

12470

In [None]:
# label도 5배크기로 늘려줘야함
# The labels must be repeated once per augmentation pass so that they stay
# aligned with text_aug. The repeat count is taken from swap_n instead of a
# hard-coded 5, so the two lists cannot drift apart if swap_n is edited.
label = list(train['label'])
label_aug = label * len(swap_n)
len(label_aug)

12470

In [None]:
# Pair each augmented text with its label in a frame using the same column
# names ('html', 'label') that are read from `train`, then preview it.
aug_df = pd.DataFrame({'html':text_aug,'label':label_aug})
aug_df.head()

Unnamed: 0,html,label
0,Best Financial Service - #1 shop to earn risk ...,1
1,"bitcoin, bitcoin in free bitcoin generator, le...",1
2,"Underground Market - Prepaid & MORE Cards, Ama...",1
3,Stolen Cards | Plastic Sharks Stolen Cards Rev...,1
4,Best Amazon a Card Welcome! NEW LOW PRICE! Ama...,0


In [None]:
# Stack the original rows on top of the augmented rows. ignore_index=True
# renumbers the result 0..N-1, so the original rows come first and each
# augmentation pass follows as a contiguous run of rows.
train_2 = pd.concat([train,aug_df],ignore_index=True)
print(train_2.shape)
train_2.head()

(14964, 2)


Unnamed: 0,html,label
0,Best Financial Service - #1 shop to ear...,1
1,"bitcoin, bitcoin generator, free bitcoin ...",1
2,Underground Market - Prepaid & Cloned Cards...,1
3,Stolen Cards | Plastic Sharks ...,1
4,Best Amazon Gift Card ...,0


In [None]:
pd.options.display.max_colwidth=9999
# Check how the html text changed: print the first original row and then the
# row at the same offset in every augmented copy. The copies sit len(train)
# rows apart in train_2, so the row labels are derived from the data rather
# than hard-coded.
for row_label in range(0, len(train_2), len(train)):
    print(train_2.loc[row_label])

html            Best Financial Service - #1 shop to earn risk free money for anybody!               Ã VISA MasterCard Cloned Cards Gift Cards Amazon PayPal Western Union Transfers             Pay BTC                   FRESH ITEMS EVERY DAY  Escrow        <->  BTC    â  Verified Onion Link    Ã Best Dark Web Market                 FAQ Proofs Reviews          Best Financial Market   Prepaid / Cloned / Gift Cards and Money Transfers via PayPal or Western Union  See Products            WE DO NOT HAVE ANY LISTINGS ON DARK MARKETS. IF YOU SEE  SOMEONE CALLED âIMPERIAL MARKETâ THERE, THEY ARE PROBABLY SCAMMERS WHO  USE OUR NAME. BE CAREFUL!New items added: TODAY ()        Best Financial Market   We sell the best quality products from the US, Europe, Asia and Africa. We have specialized and have over 5 yearsâ experience in this  field. We check every card before shipping. We also offer  full  refund if youâll face any problems with your order.               WorldWide Shipping   You

In [None]:
# Write the combined (original + augmented) frame to disk; index=False keeps
# the row index out of the CSV.
train_2.to_csv('eda_data.csv',index=False)