In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive/MyDrive
# used by the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%pip install pandas # read csv
%pip install contractions
%pip install pyspellchecker # check and replace mispell words
%pip install nltk
%pip install matplotlib
%pip install pydot
%pip install tqdm
%pip install nlpaug

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24
Collecting pyspellchecker
  Downloading pyspellc

In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# **Preprocess Data**

In [None]:
# from spellchecker import SpellChecker
import contractions
import urllib.parse
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


class CleanText:
    """Normalise raw English comments into lowercase, alphabetic, lemmatised text.

    Two public pipelines are provided:

    * ``clean_lstm`` - full cleaning, including stop-word removal.
    * ``clean_w2v``  - the same steps, but stop words are kept.

    Both return a stripped string whose tokens are separated by single spaces.
    The class relies on the module-level imports ``re``, ``urllib.parse``,
    ``nltk``, ``contractions`` and ``nltk.corpus.stopwords``.
    """

    # Compiled once at class creation instead of on every call.
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _EMAIL_RE = re.compile(r"[\w\.-]+@[\w\.-]+\.\w+")
    # Everything from the scheme up to the next whitespace is one URL, so
    # multi-label hosts, deep paths and query strings are consumed entirely.
    _URL_RE = re.compile(r'https?://\S+')
    _USER_MENTION_RE = re.compile(r"@\w+")
    _NON_ALPHA_RE = re.compile('[^a-zA-Z]')
    _EMOJI_RE = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

    def __init__(self):
        """Create the lemmatizer and build the stop-word list."""
        # Spell checking is disabled; enable it by assigning a checker that
        # exposes ``correction(word)``, e.g.:
        # self.eng_checker = SpellChecker(language='en')
        self.eng_checker = None
        self.lemmatizer = nltk.WordNetLemmatizer()
        nltk_stopwords = stopwords.words('english')
        # Words that are excluded from the stop-word list, i.e. kept in the text.
        negative_words = ['not', 'no', 'never', 'nor',
                          # location words
                          "off", "out", "over", "under", "up", "down",]
        self.stop_words = [
            word for word in nltk_stopwords if word not in negative_words]

    @staticmethod
    def _coerce_text(text):
        """Return ``text`` as ``str``; ``None`` and NaN become an empty string."""
        # NaN is the only value that compares unequal to itself.
        if text is None or text != text:
            return ''
        return str(text)

    def _lower_case(self, text):
        """Return ``text`` lower-cased."""
        return text.lower()

    def _check_url(self, url):
        """Return True when ``url`` has both a scheme and a network location."""
        parsed = urllib.parse.urlparse(url)
        return bool(parsed.scheme and parsed.netloc)

    def _remove_html(self, text):
        """Replace every ``<...>`` tag with a space.

        ``str.replace`` treats its argument literally, so a regular
        expression substitution is required here. A space (not an empty
        string) is inserted so words separated only by a tag are not glued
        together; surplus spaces are collapsed at the end of the pipelines.
        """
        return self._HTML_TAG_RE.sub(' ', text)

    def _expaned_contractions(self, text):
        """Expand contractions using ``contractions.fix``."""
        return contractions.fix(text)

    def _remove_emails(self, text):
        """Delete e-mail addresses (regex substitution, not a literal replace)."""
        return self._EMAIL_RE.sub("", text)

    def _change_user_name(self, text):
        """Replace every ``@mention`` with the token ``user``."""
        return self._USER_MENTION_RE.sub("user", text)

    def _replace_urls(self, text):
        """Replace every http(s) URL, including its path and query, with ``url``."""
        return self._URL_RE.sub('url', text)

    def _replace_symbol_to_space(self, text):
        """Replace every character that is neither a word character nor whitespace with a space."""
        return re.sub(r'[^\w\s]', ' ', text)

    def _remove_single_char(self, text):
        """Replace stand-alone single lowercase letters with a space."""
        return re.sub(r'\b(?<![a-hj-z])[a-z](?![a-z])\b', ' ', text)

    def _remove_multiple_space(self, text):
        """Collapse every run of whitespace into one space."""
        return re.sub(r'\s+', ' ', text)

    def _remove_number(self, text):
        """Delete all ASCII digits."""
        return re.sub(r'[0-9]', '', text)

    def _remove_repeat_word(self, text):
        """Shorten any character repeated more than twice in a row to exactly two."""
        return re.sub(r'(.)\1+', r'\1\1', text)

    def _fit_word(self, text):
        """Spell-correct each whitespace-separated word of ``text``."""
        return ' '.join(self._correct_word(word) for word in text.split())

    def _correct_word(self, text):
        """Return the spell checker's correction for ``text``.

        The word is returned unchanged when no spell checker is configured
        or when the checker has no suggestion.
        """
        checker = getattr(self, 'eng_checker', None)
        if checker is None:
            return text
        fixed = checker.correction(text)
        return text if fixed is None else fixed

    def _remove_emoji(self, text):
        """Delete characters in the emoji / pictograph code-point ranges."""
        return self._EMOJI_RE.sub(r'', text)

    def _remove_stopwords(self, text):
        """Tokenise with ``nltk.word_tokenize`` and drop tokens found in ``self.stop_words``."""
        word_tokens = nltk.word_tokenize(text)
        filtered_text = [w for w in word_tokens if w not in self.stop_words]
        return ' '.join(filtered_text)

    def _lemmatize_text(self, text):
        """Lemmatise each space-separated word as noun, verb, adjective, then adverb."""
        lemmas = []
        for word in text.split(" "):
            # Each pass feeds the previous pass's result to the next POS.
            for pos in ("n", "v", "a", "r"):
                word = self.lemmatizer.lemmatize(word, pos=pos)
            lemmas.append(word)
        return " ".join(lemmas).strip()

    def _fit_in_alphabet(self, text):
        """Replace every character outside ``a-z`` / ``A-Z`` with a space."""
        return self._NON_ALPHA_RE.sub(' ', text)

    def _filter_noise(self, text):
        """Shared first stage: lowercase, then strip markup, URLs, e-mails, mentions, emoji."""
        text = self._lower_case(self._coerce_text(text))
        text = self._remove_html(text)
        text = self._replace_urls(text)
        # E-mails must go before mentions, otherwise "@domain" becomes "user".
        text = self._remove_emails(text)
        text = self._change_user_name(text)
        text = self._remove_emoji(text)
        return self._remove_repeat_word(text)

    def _drop_leftovers(self, text):
        """Shared last stage: drop single letters, collapse whitespace, strip."""
        text = self._remove_single_char(text)
        text = self._remove_multiple_space(text)
        return text.strip()

    def clean_lstm(self, text):
        """Run the full cleaning pipeline, including stop-word removal.

        Args:
            text: raw comment; ``None`` / NaN are treated as an empty string.

        Returns:
            The cleaned, lemmatised, single-spaced string.
        """
        # Filter noise.
        text = self._filter_noise(text)

        # Normalise.
        text = self._expaned_contractions(text)
        # text = self._fit_word(text)
        text = self._remove_stopwords(text)
        text = self._fit_in_alphabet(text)
        text = self._lemmatize_text(text)

        # Remove the parts that are not needed.
        return self._drop_leftovers(text)

    def clean_w2v(self, text):
        """Run the cleaning pipeline while keeping stop words.

        Args:
            text: raw comment; ``None`` / NaN are treated as an empty string.

        Returns:
            The cleaned, lemmatised, single-spaced string.
        """
        # Filter noise.
        text = self._filter_noise(text)

        # Normalise.
        text = self._expaned_contractions(text)
        text = self._fit_in_alphabet(text)
        text = self._lemmatize_text(text)

        # Remove the parts that are not needed.
        return self._drop_leftovers(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# **Clean Data Fasttext**

In [None]:
# Load the Facebook news-comments CSV from Drive. fillna() replaces missing
# cells in every column (not only 'message') with the string 'no comment'.
fb_df = pd.read_csv('/content/drive/MyDrive/KLTN/data/fb_news_comments_1000K.csv').fillna('no comment')

In [None]:
cleaner = CleanText()

# Clean every comment with the stop-word-preserving pipeline; progress_apply
# (registered by tqdm.pandas()) shows a progress bar while applying.
fb_df['cleaned_text'] = fb_df['message'].progress_apply(cleaner.clean_w2v)

100%|██████████| 1038319/1038319 [11:44<00:00, 1474.32it/s]


In [None]:
# Persist the cleaned comments to Drive. With pandas defaults the DataFrame
# index is written as an unnamed first column.
fb_df.to_csv('/content/drive/MyDrive/KLTN/data/fb_comment_cleaned(new).csv')

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.0-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4479795 sha256=269891c3aab0081615989c5c7ac6e7bd28fec72714ac241bdf32adb89a37ab4f
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.0


In [None]:
# Dump one cleaned comment per line into the plain-text corpus file that is
# passed to fastText as `input=` below. The encoding is set explicitly so the
# file does not depend on the platform default.
with open('train_text.txt', 'w', encoding='utf-8') as f:
    f.writelines(comment + '\n' for comment in fb_df['cleaned_text'])



In [None]:
import fasttext
# Train unsupervised fastText embeddings on the cleaned corpus file:
# 100-dimensional vectors, 25 epochs, wordNgrams=3.
model = fasttext.train_unsupervised(input="train_text.txt",epoch=25,wordNgrams=3, dim=100)

In [None]:
# Save the trained fastText model to Drive.
model.save_model('/content/drive/MyDrive/KLTN/data/fasttext_fb.bin')

In [None]:
# Inspect the vocabulary held by the trained model.
model.words

In [None]:
# Sanity-check the embeddings by listing the nearest neighbours of a profanity.
model.get_nearest_neighbors('fuck')

# **Load data**

In [None]:
# Drive directory that receives the cleaned and augmented training CSVs below.
path = '/content/drive/MyDrive/KLTN/data/preprogress'

In [None]:
# Load the labelled training set: an id, the raw comment_text and the label
# columns toxic, severe_toxic, obscene, threat, insult, identity_hate.
train_df = pd.read_csv('/content/drive/MyDrive/KLTN/data/train.csv')

In [None]:
# Preview the first rows to confirm the expected columns were loaded.
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
cleaner = CleanText()

# Smoke test of the full pipeline on a single word ('asking' -> 'ask').
cleaner.clean_lstm('asking')

'ask'

In [None]:

# Clean every training comment with the full pipeline (stop words removed),
# showing a tqdm progress bar.
train_df['cleaned_text'] = train_df['comment_text'].progress_apply(cleaner.clean_lstm)

100%|██████████| 159571/159571 [05:45<00:00, 461.56it/s]


In [None]:
# Display the cleaned column (pandas truncates the output).
train_df['cleaned_text']

0         explanation edit make under username hardcore ...
1         aww match background colour seemingly stick th...
2         hey man really not try edit war guy constantly...
3         not make real suggestion improvement wonder se...
4                             sir hero chance remember page
                                ...                        
159566    and second time ask view completely contradict...
159567                 ashamed horrible thing put talk page
159568    spitzer umm no actual article prostitution rin...
159569    look like actually put speedy first version de...
159570    really not think understand come idea bad righ...
Name: cleaned_text, Length: 159571, dtype: object

In [None]:
# dropna() returns a new DataFrame and leaves the original untouched, so the
# result is assigned back; otherwise rows with missing values would still be
# present when the frame is saved. The last line displays the result.
train_df = train_df.dropna()
train_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edit make under username hardcore ...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,aww match background colour seemingly stick th...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man really not try edit war guy constantly...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,not make real suggestion improvement wonder se...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page
...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,and second time ask view completely contradict...
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,ashamed horrible thing put talk page
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,spitzer umm no actual article prostitution rin...
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0,look like actually put speedy first version de...


In [None]:
# Save the cleaned training set under `path` (index included, pandas default).
train_df.to_csv(f'{path}/train_cleaned.csv')

# **Data Augmentation**

Mark toxic data

In [None]:
# Binary label: 1 when the six toxicity flags of a row sum to at least 1, else 0.
train_df["target"] = (
    train_df.loc[
        :, ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    ]
    .sum(axis=1)
    .ge(1)
    .astype(int)
)
# Show the first rows flagged as toxic.
train_df.loc[train_df["target"] == 1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_text,target
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,cocksucker piss around work,1
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0,hey talk exclusive group wp taliban good destr...,1
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0,bye not look come think comming back tosser,1
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1,gay antisemmitian archangel white tiger meow g...,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0,fuck filthy mother as dry,1


In [None]:
# Class balance as absolute row counts.
train_df["target"].value_counts()

0    143346
1     16225
Name: target, dtype: int64

In [None]:
# Class balance as fractions of the whole training set.
train_df["target"].value_counts(normalize=True)

0    0.898321
1    0.101679
Name: target, dtype: float64

Temporary DataFrame containing only the toxic rows (target == 1)

In [None]:
# Keep only the toxic rows (target == 1) and renumber them 0..n-1; these are
# the rows that get augmented below.
temp_df = train_df[train_df.target == 1].reset_index(drop=True)
temp_df.shape

(16225, 10)

In [None]:
import nlpaug
import nlpaug.augmenter.word as naw


# WordNet synonym replacement; aug_min / aug_max bound how many words are
# swapped in each generated text.
aug = naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=5)
news_row = []
print(len(temp_df))
for _, toxic_row in tqdm(temp_df.iterrows(), total=len(temp_df), desc='Progress Bar'):
    # Request 4 variants of the cleaned text; every variant becomes a copy of
    # the source row that differs only in 'cleaned_text'.
    for variant in aug.augment(toxic_row['cleaned_text'], n=4):
        augmented_row = toxic_row.copy()
        augmented_row['cleaned_text'] = variant
        news_row.append(augmented_row)

16225


Progress Bar:   0%|          | 0/16225 [00:00<?, ?it/s][nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
Progress Bar: 100%|██████████| 16225/16225 [03:15<00:00, 83.02it/s]


In [None]:
# Assemble the augmented rows into a DataFrame with the same columns as train_df.
news_df = pd.DataFrame(news_row, columns=train_df.columns)

In [None]:
# Display the augmented rows (pandas truncates the output).
news_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_text,target
0,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,cocksucker piss around body of work,1
0,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,sob pee around work,1
0,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,mother fucker piss around piece of work,1
0,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,cocksucker piss around employment,1
1,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0,hey verbalize exclusive group wp taliban uprig...,1
...,...,...,...,...,...,...,...,...,...,...
16223,ffb47123b2d82762,"""\n\nHey listen don't you ever!!!! Delete my e...",1,0,0,0,1,0,hey listen not ever delete edit ever annoy wwe...,1
16224,ffbdbb0483ed0841,and i'm going to keep posting the stuff u dele...,1,0,1,0,1,0,move keep post material delete fuck site close...,1
16224,ffbdbb0483ed0841,and i'm going to keep posting the stuff u dele...,1,0,1,0,1,0,go keep post stuff delete fuck situation faith...,1
16224,ffbdbb0483ed0841,and i'm going to keep posting the stuff u dele...,1,0,1,0,1,0,go keep post stuff delete fuck site close pour...,1


In [None]:
# Show the distinct augmented texts. The result is only displayed; news_df
# itself is not modified.
news_df['cleaned_text'].drop_duplicates()

0                      cocksucker piss around body of work
0                                      sob pee around work
0                  mother fucker piss around piece of work
0                        cocksucker piss around employment
1        hey verbalize exclusive group wp taliban uprig...
                               ...                        
16223    hey listen not ever delete edit ever annoy wwe...
16224    move keep post material delete fuck site close...
16224    go keep post stuff delete fuck situation faith...
16224    go keep post stuff delete fuck site close pour...
16224    go keep post hooey delete fuck site closelippe...
Name: cleaned_text, Length: 61745, dtype: object

In [None]:
# Append the augmented toxic rows to the original training data. dropna()
# returns a new frame, so it is chained into the assignment to make the
# removal of rows without cleaned text take effect; the index is renumbered
# afterwards so it stays contiguous. The last line displays the result.
train_new_df = (
    pd.concat([train_df, news_df], axis=0)
    .dropna(subset=['cleaned_text'])
    .reset_index(drop=True)
)
train_new_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_text,target
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edit make under username hardcore ...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,aww match background colour seemingly stick th...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man really not try edit war guy constantly...,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,not make real suggestion improvement wonder se...,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page,0
...,...,...,...,...,...,...,...,...,...,...
224446,ffb47123b2d82762,"""\n\nHey listen don't you ever!!!! Delete my e...",1,0,0,0,1,0,hey listen not ever delete edit ever annoy wwe...,1
224447,ffbdbb0483ed0841,and i'm going to keep posting the stuff u dele...,1,0,1,0,1,0,move keep post material delete fuck site close...,1
224448,ffbdbb0483ed0841,and i'm going to keep posting the stuff u dele...,1,0,1,0,1,0,go keep post stuff delete fuck situation faith...,1
224449,ffbdbb0483ed0841,and i'm going to keep posting the stuff u dele...,1,0,1,0,1,0,go keep post stuff delete fuck site close pour...,1


In [None]:
# Row / column count after adding the augmented rows.
train_new_df.shape

(224451, 10)

In [None]:
# Shuffle all rows so original and augmented samples are interleaved. The
# fixed random_state makes the order reproducible across kernel restarts.
train_new_df = train_new_df.sample(frac=1, random_state=42)

In [None]:
# Preview the shuffled frame.
train_new_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_text,target
72531,c218d82a89df572a,"""\nWatch it: Palestinian national radio puts o...",1,0,1,0,1,1,watch palestinian national radio put out call ...,1
33352,58e66a3223ce6466,"Self-appointed, self-aggrandising and self-imp...",1,0,0,0,0,0,self appoint self aggrandise self important gu...,1
99983,172167501c93fe9f,"""\n\nWhy on earth do you think I give a fuck a...",1,0,1,0,0,0,earth think give fuck opinion especially wrong...,1
104039,2ca984b3dc59249b,Who made you sherrif you shitsack?,1,0,1,0,1,0,make sherrif shitsack,1
58697,9d3373992b529265,"""\n\nHey Rent, thanks for the suggestion. It's...",0,0,0,0,0,0,hey rent thank suggestion funny mention it by ...,0


In [None]:
# Save the shuffled, augmented training set under `path` (index included,
# pandas default).
train_new_df.to_csv(f'{path}/train_clean_aug.csv')

Clean the test set

In [None]:
# Load the test set from Drive.
test_df = pd.read_csv('/content/drive/MyDrive/KLTN/data/test.csv')

In [None]:
cleaner = CleanText()

# NOTE(review): the training set above is cleaned with clean_lstm (stop words
# removed) while the test set uses clean_w2v (stop words kept) - confirm this
# difference is intentional.
test_df['cleaned_text'] = test_df['comment_text'].progress_apply(cleaner.clean_w2v)

100%|██████████| 153164/153164 [03:49<00:00, 667.96it/s]


In [None]:
# Save the cleaned test set to Drive (index included, pandas default). The
# path has no placeholders, so a plain string literal is used instead of an
# f-string.
test_df.to_csv('/content/drive/MyDrive/KLTN/data/test_cleaned.csv')