In [23]:
import pandas as pd
import re
import numpy as np

from pathlib import Path


In [None]:
# download from google drive
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

!ln -s /content/gdrive/My\ Drive/toxic_nlp /toxic_nlp
!ls /toxic_nlp

class MyConfig(dict):
  def __getattr__(self, name): return self[name]
  def __setattr__(self, name, value): self[name] = value


configs = MyConfig({'path':{
    'path_drive' : Path('/toxic_nlp'),
    'path_colab_data' : Path('./dataset')
    }
})

# 데이터 셋 구성
- Wiki page, Gab, Reddit, Twitter, 백인 우월주의 포럼까지 다양한 도메인에서 수집된 데이터들로 구성하였다.
- 혐오표현의 분류문제에서 모델이 맥락정보를 적절하게 사용하기에 데이터의 문제가 있음.
  1. Toxic label의 비율이 낮음.
  2. 맥락정보를 포함하는 데이터셋이 거의 없음.
  3. 맥락정보를 포함하는 데이터셋 출처(도메인)가 한정적임.


**- 수집한 데이터 셋 명세**

<img src ='https://drive.google.com/uc?export=view&id=1MIRayeKoIIcBDtdUHrqz0uGpXB81jpXe'>


# # **Preprocessing Data Sets**
- 데이터셋 출처에 따라서 적절한 전처리를 수행하였음.
  - 출처(도메인) 마다 제거해야 할 패턴이 다양함.

## 1.Cat
- Pavlopoulos et al,. 2020. Toxicity Detection: Does Context Really Matter? 연구에서 수집됨
- Wiki takl page의 comment를 수집
- 이모지, URL, html tag 등 삭제

In [2]:
!wget https://raw.githubusercontent.com/ipavlopoulos/context_toxicity/master/data/CAT_LARGE/gc.csv gc.csv
df = pd.read_csv('gc.csv')
print(df.label.value_counts())
df.head()


--2021-08-09 06:22:16--  https://raw.githubusercontent.com/ipavlopoulos/context_toxicity/master/data/CAT_LARGE/gc.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3697430 (3.5M) [text/plain]
Saving to: ‘gc.csv’


2021-08-09 06:22:16 (47.7 MB/s) - ‘gc.csv’ saved [3697430/3697430]

--2021-08-09 06:22:16--  http://gc.csv/
Resolving gc.csv (gc.csv)... failed: Name or service not known.
wget: unable to resolve host address ‘gc.csv’
FINISHED --2021-08-09 06:22:16--
Total wall clock time: 0.6s
Downloaded: 1 files, 3.5M in 0.07s (47.7 MB/s)
0    9849
1     151
Name: label, dtype: int64


Unnamed: 0,id,text,parent,label,api
0,100030037.40314.40314,"BTW, I see no ""attack"" here. What I see is so...",I also feel its important for StuRat to have h...,0,0.1323
1,100104685.102437.102437,I realise... I just want to urge caution and a...,A main reason this project was started was to ...,0,0.0314
2,100189373.112824.112824,I should have stressed that this should be a t...,"I'd go for that. (By the way, not all deletion...",0,0.2229
3,100203140.40393.40393,Thanks to Lysy for shortening and copy editing...,(edit conflict with Piotrus) I agree with the...,0,0.0559
4,100214962.61084.61084,I like the additional info in the info box. I...,The larger box displays fine in Mozilla Firefo...,0,0.0539


In [17]:
# regex pattern for Cat dataset
class Regex_pattern():
  def __init__(self):
    self.emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"u"\U00002702-\U000027B0"u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"u"\U00010000-\U0010ffff"u"\u2640-\u2642"u"\u2600-\u2B55"
                               u"\u200d"u"\u23cf"u"\u23e9"u"\u231a"u"\ufe0f"u"\u3030""]+", flags=re.UNICODE)
    self.comment_number = re.compile("[0-9]+.")
    self.url = re.compile("(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
    self.url_1 = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.url_2 = re.compile('www(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.url_3 = re.compile('http[s]? : // (?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.user = re.compile(r"@[a-zA-Z]+")
    self.hash = re.compile(r"#")
    self.unlatter = re.compile("[âð±ïó¾¬ª¯¶¦â¡©§¸©¹¼¥¤ã»ÃÀÁÂÃÄÅ½º¢µ$|¿°«à]")
    self.space = re.compile(r"\s+") # too many space
    self.ip = re.compile("[']{2,3}")
    self.slash = re.compile("[\'|]{1}")
    self.slash2 = re.compile("[\']{2,}")

  def __call__(self, string):
    string = self.emoji_pattern.sub(r'', string)
    # string = regex_pattern.comment_number.sub(r'',string)
    string = self.url.sub(r'',string)
    # user @ tag는 전체 삭제
    string = self.user.sub(r'',string)
    # #tag 해시태그는 #해시 문자만 삭제
    string = self.hash.sub(r'',string)
    # string = re.compile(r"#[a-zA-Z]+").sub(r'',string)
    string = self.unlatter.sub(r'',string)
    string = self.space.sub(' ', string).strip()
    string = self.url_1.sub('',string)
    string = self.url_2.sub('',string)
    string = self.url_3.sub('',string)
    string = self.ip.sub(r"'",string)
    string = self.slash.sub(r"'",string)
    string = self.slash2.sub(r"",string)
    return string

regex_pattern = Regex_pattern()

In [None]:
# 정규표현식 TEST (눈으로 직접 보면서 확인)
df_sample = df.sample(100)
for f in df_sample.text:
  print('ori',f)
  f = regex_pattern(f)
  print('reg',str(f))

In [21]:
# clear text
df['text_clean'] = df['text'].apply(lambda x: regex_pattern(x))
df['parent_clean'] = df['parent'].apply(lambda x: regex_pattern(x))

In [22]:
df.head()

Unnamed: 0,id,text,parent,label,api,text_clean,parent_clean
0,100030037.40314.40314,"BTW, I see no ""attack"" here. What I see is so...",I also feel its important for StuRat to have h...,0,0.1323,"BTW, I see no ""attack"" here. What I see is som...",I also feel its important for StuRat to have h...
1,100104685.102437.102437,I realise... I just want to urge caution and a...,A main reason this project was started was to ...,0,0.0314,I realise... I just want to urge caution and a...,A main reason this project was started was to ...
2,100189373.112824.112824,I should have stressed that this should be a t...,"I'd go for that. (By the way, not all deletion...",0,0.2229,I should have stressed that this should be a t...,"I'd go for that. (By the way, not all deletion..."
3,100203140.40393.40393,Thanks to Lysy for shortening and copy editing...,(edit conflict with Piotrus) I agree with the...,0,0.0559,Thanks to Lysy for shortening and copy editing...,(edit conflict with Piotrus) I agree with the ...
4,100214962.61084.61084,I like the additional info in the info box. I...,The larger box displays fine in Mozilla Firefo...,0,0.0539,I like the additional info in the info box. It...,The larger box displays fine in Mozilla Firefo...


In [None]:
# save to csv
df_clean = df[['parent_clean','text_clean','label']]
df_clean.rename({'parent_clean':'parent','text_clean':'text'},axis=1,inplace=True)

df_clean.to_csv(configs.path['path_drive'] / 'dataset' / 'gc_clean.csv' )

## 2.Gab_red
- Qian(2019) 연구에서 수집된 데이터.
- Raddit, Gab에서 혐오단어 키워드를 통해 Conversational data를 수집.
- 2차례 이상 이어지는 유저 간 Conversation 중, 혐오 발언으로 간주될 수 있는 Comment 번호를 Toxic으로 라벨링 한 데이터.

In [None]:
# load dataset
!wget https://raw.githubusercontent.com/jing-qian/A-Benchmark-Dataset-for-Learning-to-Intervene-in-Online-Hate-Speech/master/data/gab.csv gab.csv
!wget https://raw.githubusercontent.com/jing-qian/A-Benchmark-Dataset-for-Learning-to-Intervene-in-Online-Hate-Speech/master/data/reddit.csv raddit.csv


In [28]:

df_gab = pd.read_csv('gab.csv')
df_reddit = pd.read_csv('reddit.csv')

df_rg = pd.concat([df_gab,df_reddit], ignore_index=True)
# df_rg.dropna(inplace=True) # nan for non-toxic label
print(df_rg.isnull().sum(),df_rg.shape)
print('duplicate? ',df_rg.duplicated().sum())

# context가 있는 comment만 남김 (comment가 1개만 있으면서 첫번째 문장인 경우 삭제)
df_rg = df_rg[~(df_rg.hate_speech_idx == '[1]')][['text','hate_speech_idx']]
df_rg.head()
df_rg.shape

id                    0
text                  0
hate_speech_idx    1829
response           1829
dtype: int64 (16845, 4)
duplicate?  1


(11682, 2)

In [31]:
for i,j in zip(df_rg['text'][700:703], df_rg['hate_speech_idx']):
  print(f'\n{j}\n{i}\n\n')


[3]
1. 
2. 	Never again will I watch a @NFL game! Same goes for #NetFlix, never again will I let them make one penny from me !!   Yes Barry, you did that! You & the #racist #antiwhite #antiChristian #Atheistic #Leftatards always destroying, never creating anything but trouble!   What did you do with the hotdogs at the WH Barry!?? We remember, you treasonous 💩!!!   #Boycott !
3. 		I looked the other day, everything on Netflix is all African American, Australian, British, or anything else requiring subtitles. Sorry you idiots, the last thing I want to do at the end of the night, is read sub titles. Everything Obama touches he ruins. It's deliberate by Soros. 
4. 			it's all nigger fantasies and Wakanda type BULLSHIT!! not real..




[2]
1. Hulu deleted this tweet after becoming the focus of countless jokes and being mocked for several hours straight.   Even companies are willing to backdown from SJW talking points if the heat gets too hot.
2. 	https://www.bitchute.com/video/hzi8Cz3IhZ1J

### Gab_red 데이터 문장 분리 로직
- 위 예시처럼 2차례 이상 이어지는 User간 Conversation이 합쳐서 제공되고 있으며, 혐오발언으로 간주될 수 있는 Commnet번호를 라벨링 한 형태듸 데이터.
- Hate Speech로 label된 문장을 중심으로, 그 이전 Comment를 맥락 text로 추출하였음.
- 하나의 Conversation에 여러개의 Hate Speech로 label이 있는 경우, Hate Speech label 수만큼 분리됨.
- Hate Speech로 label된 문장 이전 맥락이 지나치게 짧은 경우(예를 들어 한단어), 맥락 정보가 포함되지 않는 문장이 되어버리기 때문에, 적어도 5개 단어 토큰(stop word 제외) 보다 커질 때 까지 이전 comment를 포함시킴.
  - **맥락 Commnet의 Token을 카운트할 때, stop word, punct는 포함시키지 않기 위해서 spacy package를 사용하였음.**

In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_trf

import spacy
from spacy.tokenizer import Tokenizer

# 맥락 commnet의 token을 카운트 할 때, stop word, punct를 걸러내기 위해서 사용
nlp = spacy.load("en_core_web_trf")
tokenizer = Tokenizer(nlp.vocab)

In [59]:
# regex pattern for Gab_red dataset
# regex pattern for Cat dataset
class Regex_pattern():
  def __init__(self):
    self.emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"u"\U00002702-\U000027B0"u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"u"\U00010000-\U0010ffff"u"\u2640-\u2642"u"\u2600-\u2B55"
                               u"\u200d"u"\u23cf"u"\u23e9"u"\u231a"u"\ufe0f"u"\u3030""]+", flags=re.UNICODE)
    self.comment_number = re.compile("[0-9]+.")
    self.url = re.compile("(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
    self.url_1 = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.url_2 = re.compile('www(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.url_3 = re.compile('http[s]? : // (?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.user = re.compile(r"@[a-zA-Z]+")
    self.hash = re.compile(r"#")
    self.unlatter = re.compile("[âð±ïó¾¬ª¯¶¦â¡©§¸©¹¼¥¤ã»ÃÀÁÂÃÄÅ½º¢µ$|¿°«à]")
    self.space = re.compile(r"\s+") # too many space
    self.ip = re.compile("[']{2,3}")
    self.slash = re.compile("[\'|]{1}")
    self.slash2 = re.compile("[\']{2,}")

  def __call__(self, string):
    string = self.emoji_pattern.sub(r'', string)
    # string = regex_pattern.comment_number.sub(r'',string)
    string = self.url.sub(r'',string)
    # user @ tag는 전체 삭제
    string = self.user.sub(r'',string)
    # #tag 해시태그는 #해시 문자만 삭제
    string = self.hash.sub(r'',string)
    # string = re.compile(r"#[a-zA-Z]+").sub(r'',string)
    string = self.unlatter.sub(r'',string)
    # string = self.space.sub(' ', string).strip()
    string = self.url_1.sub('',string)
    string = self.url_2.sub('',string)
    string = self.url_3.sub('',string)
    # string = self.ip.sub(r"'",string)
    # string = self.slash.sub(r"'",string)
    # string = self.slash2.sub(r"",string)
    return string

regex_pattern = Regex_pattern()

In [78]:
def token_count(doc):
  cnt = 0
  for token in tokenizer(doc): # spacy tokenizer
    # print(t)
    if (token.is_stop == False) & (token.is_punct == False):
      cnt +=1
  return cnt

def concat_doc(doc_list):
  if len(doc_list) < 2:
    return doc_list
  else :
    return ['. '.join(reversed(doc_list))]

# def set_text_parent(comments, idx):
def set_text_parent(x, regex_pattern):
  comments = x[0]
  idx = x[1]

  # comments = remove_emoji(comments, regex_pattern)
  comments = regex_pattern(comments)
  comments = comments.replace('\t','').split('\n')

  doc = []
  for i in comments:
    i = re.compile(r"\s+").sub(" ", i).strip() # too many space -> single space
    doc.append(i)

  doc = doc[:-1]

  text = []
  parent = []
  label = []

  # for non-toxic comments
  if str(idx) == 'nan':
    text.append(doc[-1])
    max_doc = len(doc)
    cnt = 0
    for i in range(max_doc-2, -1,-1):
      cnt += token_count(doc[i])
      parent.append(doc[i])
      label.append(0)
      if cnt > 5:
        return concat_doc(parent), text, label
      else : continue
    return concat_doc(parent), text, label

  # set start idx 0
  idx_list = [int(i)-1 for i in re.compile(r'\d{1,2}').findall(idx)]

  # only single comment is toxic & single context commnet
  if idx_list[0] == 1 and len(idx_list) < 2:
    parent.append(doc[idx_list[0]-1])
    text.append(doc[idx_list[0]])
    label.append(1)
    return parent, text, label # return single string sentence list

  # single toxic
  if len(idx_list) < 2:

    try:
      text.append(doc[idx_list[-1]])
    except:
      print(idx_list[-1], doc, len(doc))
      
    label = [1]
    cnt = 0
    for i in range(idx_list[-1]-1, -1, -1): 
      try:
        cnt += token_count(doc[i])
        parent.append(doc[i])
      except:
        print(i, len(doc))
      
      if cnt > 5:
        return concat_doc(parent), text, label
      else : continue
    return concat_doc(parent), text, label


  if len(idx_list) > 1:

    for j in idx_list:
      if j == 0:
        continue
      
      text.append(doc[j])
      label = [1]
      cnt = 0
      tmp_parent = []
      for i in range(j-1,-1,-1):
        cnt += token_count(doc[i])
        tmp_parent.append(doc[i])
        if cnt > 5:
          parent.append(concat_doc(tmp_parent)[0])
          break
        else : continue
        parent.append(concat_doc(tmp_parent)[0])

  return parent, text, label

In [81]:
t_p, t_t, t_l = [], [], []

for n, (c,i) in enumerate(zip(df_rg['text'],df_rg['hate_speech_idx'])):
  if n % 1000 == 0:
    print(n)
  tmp = (c,i)
  try :
    p, t, l = set_text_parent(tmp, regex_pattern)
  except :
    print(n) # labeling error,,
  # print(p,t,l)
  for sp,st in zip(p, t):
    t_p.append(sp)
    t_t.append(st)
    t_l.append(l[0])

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
2 ["1. I heard from Giant Bomb's Abby Russell that they touched some political subjects that need to be discussed but in a subtle way. I guess this is subtle for her.", '2. Thanks for reminding me that stupid millenial cunt exists, I used to enjoy GB before she turned whatever good remained in it into shit.'] 2
11000
18 14
17 14
16 14
15 14
14 14


In [82]:
df_split = pd.DataFrame(list(zip(t_p,t_t,t_l)), columns=['parent','text','label'])
# gab, reddit에서 혐오 키워드로 수집했기 때문에 혐오 표현이 대부분.
df_split.label.value_counts()

1    12914
0     1323
Name: label, dtype: int64

In [None]:
# save to csv
df_split.to_csv(configs.path['path_drive'] / 'dataset' / 'gab_reddit.csv' )

## 3.Ktc
- Kaggle Toxic competition dataset
- Wili talk page 15만건 수집
- 10%의 데이터만 Toxic으로 분류되어 있음.

In [None]:
def get_data_csv_path(configs, condition):
  condition = condition
  return [f for f in configs.path['list_csv'] if f'{condition}' in str(f)][0]
# from my google drive
configs.path['list_csv'] = [f for f in (configs.path['path_drive'] / 'dataset').glob('*.csv')]

df_jig_train = pd.read_csv(get_data_csv_path(configs, 'jig_train'))
df_jig_test = pd.read_csv(get_data_csv_path(configs, 'jig_test'))
df_jig_test_label = pd.read_csv(get_data_csv_path(configs, 'jig_test_labels'))

In [None]:
# # regex pattern for jig dataset
class Regex_pattern():
  def __init__(self):
    self.emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"u"\U0001F300-\U0001F5FF"u"\U0001F680-\U0001F6FF"u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"u"\U00002702-\U000027B0"u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"u"\U0001f926-\U0001f937"u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"u"\u2600-\u2B55"u"\u200d"u"\u23cf"u"\u23e9"u"\u231a"u"\ufe0f"u"\u3030""]+", flags=re.UNICODE)
    self.comment_number = re.compile("[0-9]+.")
    self.user = re.compile(r"@[a-zA-Z]+")
    self.hash = re.compile(r"#")
    self.unlatter = re.compile("[âð±ïó¾¬ª¯¶¦â¡©§¸©¹¼¥¤ã»ÃÀÁÂÃÄÅ½º¢µ$|¿°«à]")
    self.space = re.compile(r"\s+") # too many space
    
    self.url = re.compile("(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
    self.url_1 = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.url_2 = re.compile('www(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.url_3 = re.compile('http[s]? : // (?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    
    self.ip = re.compile('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    self.article_id = re.compile('d:\d\d\s{0,5}$')
    self.userid = re.compile('\[\[User(.*)\|')

  def __call__(self, string):

      string = self.emoji_pattern.sub(r'', string)
      # string = regex_pattern.comment_number.sub(r'',string)
      string = self.url.sub(r'',string)
      # user @ tag는 전체 삭제
      string = self.user.sub(r'',string)
      # #tag 해시태그는 #해시 문자만 삭제
      string = self.hash.sub(r'',string)
      # string = re.compile(r"#[a-zA-Z]+").sub(r'',string)
      string = self.unlatter.sub(r'',string)
      string = self.space.sub(' ', string).strip()
      string = self.url_1.sub('',string)
      string = self.url_2.sub('',string)
      string = self.url_3.sub('',string)
      return string

regex_pattern = Regex_pattern()

In [None]:
# 정규표현식으로 잘 제거되고 있는지 확인
df_sample = df_jig_train.sample(100)
for f in df_sample.comment_text:
  print(regex_pattern(f))


In [None]:
df['text_clean'] = df['comment_text'].apply(lambda x: regex_pattern(x))

# save to csv
df_jig_clean = df[['text_clean','toxic']].rename({'text_clean':'text','toxic':'label'})
df_jig_clean.head()
df_jig_clean.to_csv(configs.path['path_drive'] / 'dataset' / 'jig_train.csv' )

## 4.Twit
- Tweets hate speech detection dataset
- hugging face api 활용

In [None]:
!pip install datasets

In [85]:
from datasets import load_dataset

dataset = load_dataset('tweets_hate_speech_detection')

# inbalance dataset!!
df = pd.DataFrame({'tweet': dataset['train'][:]['tweet'], 'label': dataset['train'][:]['label']})
df.label.value_counts()

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/root/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/3e953745870454cf8ff15cc48097dbb5ff459596e0a089867c2a29cee63984ec)


0    29720
1     2242
Name: label, dtype: int64

In [86]:
# regex patterns
class Regex_pattern():
  def __init__(self):
    self.emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"u"\U00002702-\U000027B0"u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"u"\U00010000-\U0010ffff"u"\u2640-\u2642"u"\u2600-\u2B55"
                               u"\u200d"u"\u23cf"u"\u23e9"u"\u231a"u"\ufe0f"u"\u3030""]+", flags=re.UNICODE)
    self.comment_number = re.compile("[0-9]+.")
    self.url = re.compile("(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
    self.user = re.compile(r"@[a-zA-Z]+")
    self.hash = re.compile(r"#")
    self.unlatter = re.compile("[âð±ïó¾¬ª¯¶¦â¡©§¸©¹¼¥¤ã»ÃÀÁÂÃÄÅ½º¢µ$|¿°«à]")
    self.space = re.compile(r"\s+") # too many space
    self.url_1 = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.url_2 = re.compile('www(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.url_3 = re.compile('http[s]? : // (?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

  def __call__(self, string):
    string = self.emoji_pattern.sub(r'', string)
    # string = regex_pattern.comment_number.sub(r'',string)
    string = self.url.sub(r'',string)
    # user @ tag는 전체 삭제
    string = self.user.sub(r'',string)
    # #tag 해시태그는 #해시 문자만 삭제
    string = self.hash.sub(r'',string)
    # string = re.compile(r"#[a-zA-Z]+").sub(r'',string)
    string = self.unlatter.sub(r'',string)
    string = self.space.sub(' ', string).strip()
    string = self.url_1.sub('',string)
    string = self.url_2.sub('',string)
    string = self.url_3.sub('',string)
    return string

regex_pattern = Regex_pattern()

In [None]:
df_sample = df.sample(100)
for f in df_sample.tweet:
  print('ori',f)
  f = regex_pattern(f)
  print('reg',f)

In [88]:
df['tweet_clean'] = df['tweet'].apply(lambda x: regex_pattern(x))
df_clean = df[['tweet_clean','label']]
df_clean.head()

Unnamed: 0,tweet_clean,label
0,when a father is dysfunctional and is so selfi...,0
1,thanks for lyft credit i can't use cause they ...,0
2,bihday your majesty,0
3,model i love u take with u all the time in ur...,0
4,factsguide: society now motivation,0


In [None]:
# save to csv
df_clean.to_csv(configs.path_drive / 'dataset' / 'tweet.csv')

## 5.Sws
- Stormfront white supremacist 백인 우월주의자 포럼에서 수집된 데이터
- hugging face api활용

***- data labels***
- 0: non toxic
- 1: toxic
- 2: 외국어
- 3: 해당 문장은 toxic 하지 않지만, 맥락이 있으면 toxic (맥락은 제공되지 않음)


In [89]:
df = load_dataset('hate_speech18')
df = pd.DataFrame({'text': df['train']['text'], 
                   'label':df['train']['label'], 
                   'num_contexts':df['train']['num_contexts'] })

# inbalance,,
df.label.value_counts()


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1621.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=992.0, style=ProgressStyle(description_…

Using custom data configuration default



Downloading and preparing dataset hate_speech18/default (download: 3.49 MiB, generated: 1.31 MiB, post-processed: Unknown size, total: 4.81 MiB) to /root/.cache/huggingface/datasets/hate_speech18/default/0.0.0/8033f254483a20d1d10b0f1b56ded1f54326b1617872c537f354a721a6951d47...


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Downloading', max=1.0, style=ProgressSt…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset hate_speech18 downloaded and prepared to /root/.cache/huggingface/datasets/hate_speech18/default/0.0.0/8033f254483a20d1d10b0f1b56ded1f54326b1617872c537f354a721a6951d47. Subsequent calls will reuse this data.


0    9507
1    1196
3     168
2      73
Name: label, dtype: int64

In [90]:
# label 0,1만 사용하였음
df = df[ ((df.label == 0) | (df.label==1)) ]

In [91]:
# regex pattern for 백인우월주의 dataset
class Regex_pattern():
  def __init__(self):
    self.emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"u"\U00002702-\U000027B0"u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"u"\U00010000-\U0010ffff"u"\u2640-\u2642"u"\u2600-\u2B55"
                               u"\u200d"u"\u23cf"u"\u23e9"u"\u231a"u"\ufe0f"u"\u3030""]+", flags=re.UNICODE)
    self.comment_number = re.compile("[0-9]+.")
    self.url = re.compile("(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
    self.user = re.compile(r"@[a-zA-Z]+")
    self.hash = re.compile(r"#")
    self.unlatter = re.compile("[âð±ïó¾¬ª¯¶¦â¡©§¸©¹¼¥¤ã»ÃÀÁÂÃÄÅ½º¢µ$|¿°«à]")
    self.space = re.compile(r"\s+") # too many space
    self.url_1 = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.url_2 = re.compile('www(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.url_3 = re.compile('http[s]? : // (?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

  def __call__(self, string):
    string = self.emoji_pattern.sub(r'', string)
    # string = regex_pattern.comment_number.sub(r'',string)
    string = self.url.sub(r'',string)
    # user @ tag는 전체 삭제
    string = self.user.sub(r'',string)
    # #tag 해시태그는 #해시 문자만 삭제
    string = self.hash.sub(r'',string)
    # string = re.compile(r"#[a-zA-Z]+").sub(r'',string)
    string = self.unlatter.sub(r'',string)
    string = self.space.sub(' ', string).strip()
    string = self.url_1.sub('',string)
    string = self.url_2.sub('',string)
    string = self.url_3.sub('',string)
    return string

regex_pattern = Regex_pattern()

In [None]:
df_sample = df.sample(100)
for f in df_sample.text:
  print('ori',f)
  f = regex_pattern(f)
  print('reg',str(f))

In [96]:
# apply regex pattren
df['text_clean'] = df['text'].apply(lambda x: regex_pattern(x))

In [None]:
# token 2개 이하, 삭제되지 않은 url, 한단어, 공백 제거
less_comment = df.index[df['text_clean'].apply(lambda x : len(x.split(' '))) < 3]
for i in df.loc[less_comment].text_clean:
  print(i)

In [None]:
# save to csv
df = df.drop(less_comment)
df_clean = df[['text_clean','label']].rename({'text_clean':'text'}, axis=1)
df_clean.to_csv(configs.path_drive / 'dataset' / 'hate_speech18.csv')

# Merge All Collected Data set

In [None]:
def get_data_csv_path(configs, condition):
  condition = condition
  return [f for f in configs.path['list_csv'] if f'{condition}' in str(f)][0]

list_csv = [f for f in Path('/toxic_nlp/dataset').glob('*') if '.csv' in str(f)]
list_csv

In [None]:
df_cat_gabreddit = pd.read_csv([f for f in list_csv if 'generated_dataset.csv' in str(f)][0])
df_hate_speech = pd.read_csv([f for f in list_csv if 'hate_speech18.csv' in str(f)][0])
df_jig_train = pd.read_csv([f for f in list_csv if 'jig_train.csv' in str(f)][0])
df_tweet = pd.read_csv([f for f in list_csv if 'tweet.csv' in str(f)][0])

In [None]:
# check duplicated
sum(df_cat_gabreddit.duplicated())
sum(df_hate_speech.duplicated())
sum(df_jig_train.duplicated())
sum(df_tweet.duplicated())

In [None]:
# concat
df_merged = pd.concat([df_cat_gabreddit,df_tweet, df_jig_train, df_hate_speech], axis = 0, ignore_index=True)
# clear colunm name
df_merged.reset_index(inplace=True)
df_merged.drop(columns=['Unnamed: 0','index'], axis=1, inplace=True)
# check duplicated
df_merged[df_merged.duplicated()]
# save to csv
df_merged.to_csv('/toxic_nlp/dataset/merged.csv')

# Split test set with same ratio
- 데이터 출처마다 sample size 차이가 크게 난다.
  - 전체 데이터에서 임의의 비율로 test set을 분리하면, 15만건으로 가장 많은 데이터 비율을 가진 Keggle set이 대부분을 차지하게 되는 문제가 발생한다.
- ***데이터 출처와 label의 비율(15%)을 맞추면서 test set 구성***
- `Sklearn StratifiedShuffleSplit` 활용


In [None]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

!ln -s /content/gdrive/My\ Drive/toxic_nlp /toxic_nlp
!ls /toxic_nlp

from pathlib import Path
import pandas as pd

In [99]:
list_csv = [f for f in Path('/toxic_nlp/dataset').glob('*') if '.csv' in str(f)]
list_csv
df = pd.read_csv([f for f in list_csv if 'merged.csv' in str(f)][0])
df.drop(['Unnamed: 0','Unnamed: 0.1'], axis=1, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [100]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
df_train = df.copy()

In [101]:
def data_loader(df, condition):
  return df[df.src==condition].reset_index(), df[df.src == condition].reset_index()['label']

In [102]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
train_sets = []
test_sets = []
for i in [0,1,2,3]:
  dataset, label = data_loader(df_train, i)
  for train_idx, val_idx in split.split( dataset, label):
    train_sets.append(dataset.loc[train_idx]) 
    test_sets.append(dataset.loc[val_idx])

In [103]:
# train set 구성
for i in [0,1,2,3]:
  print(train_sets[i].src.value_counts())
  print(test_sets[i].label.value_counts())

0    25294
Name: src, dtype: int64
1    2590
0    1874
Name: label, dtype: int64
1    135235
Name: src, dtype: int64
0    21621
1     2244
Name: label, dtype: int64
2    8715
Name: src, dtype: int64
0    1359
1     179
Name: label, dtype: int64
3    27167
Name: src, dtype: int64
0    4459
1     336
Name: label, dtype: int64


In [None]:
df_train_merged = pd.concat([train_sets[0],train_sets[1],train_sets[2],train_sets[3]], ignore_index=True)
df_tset_merged = pd.concat([test_sets[0],test_sets[1],test_sets[2],test_sets[3]], ignore_index=True)

df_train_merged.to_csv('/toxic_nlp/dataset/exp_v1_train.csv')
df_tset_merged.to_csv('/toxic_nlp/dataset/exp_v1_test.csv')

In [None]:
# generated text & parent concat
df_gan = ( df_train_merged[['parent_gen','text_gen','label']]
          .rename(columns={'parent_gen':'parent','text_gen':'text'})
          .dropna() )

pd.concat([df_train_merged,df_gan], axis=0).to_csv('/toxic_nlp/dataset/exp_v1_gan_merged.csv')

df_gan_ori = ( df_gan_merged[df_gan_merged.src==0][['parent_gen','text','label']].
              rename(columns={'parent_gen':'parent'}) )
# df_gan_ori
# ori-ori + gan_gan + gan_ori condition. 
pd.concat([df_gan_merged, df_gan_ori], axis=0).to_csv('/toxic_nlp/dataset/exp_v2_gan_merged.csv')