# Preparing the Dataset for Modeling

In [4]:
import os
import re
import pathlib
import numpy as np
import pandas as pd

In [5]:
# Download train.csv from the Cloud Storage bucket into the current working directory.
!gsutil cp gs://plb-ecg-automl-toxicity-demo-2021-03/train.csv .

Copying gs://plb-ecg-automl-toxicity-demo-2021-03/train.csv...

Operation completed over 1 objects/65.6 MiB.                                     


In [6]:
# Load the downloaded training file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [7]:
# Preview the first five rows to confirm the column layout.
data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
# add clean column label
# A comment is "clean" (1) only when none of the six toxicity labels is set.
# The label columns are selected by name instead of by position
# (`data.iloc[:, 2:]`): once `clean` has been appended, a positional slice
# would include `clean` itself, so re-running this cell would mark every row
# as not clean.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
data['clean'] = (data[label_columns].sum(axis=1) == 0).astype(int)

In [9]:
# merge all other non-clean comments into the toxic class:
# any row that is not clean gets toxic = 1, whichever label originally fired.
is_not_clean = data['clean'] == 0
data.loc[is_not_clean, 'toxic'] = 1

In [10]:
# select dataframe of clean examples
# Draw 20,000 clean rows. The fixed random_state makes the draw reproducible,
# so a Restart & Run All selects the same rows (and therefore the same
# text_<index>.txt file names later on) instead of a new random subset each run.
data_clean = data[data['clean'] == 1].sample(n=20000, random_state=42)

In [11]:
# select dataframe of toxic examples
# Draw 16,000 toxic rows. The fixed random_state makes the draw reproducible,
# so a Restart & Run All selects the same rows (and therefore the same
# text_<index>.txt file names later on) instead of a new random subset each run.
# NOTE: sample() raises ValueError if fewer than 16,000 toxic rows exist.
data_toxic = data[data['toxic'] == 1].sample(n=16000, random_state=42)

In [12]:
# Stack the clean and toxic samples into a single frame.
# Row labels from the source frame are kept (no ignore_index).
# Note that this rebinds `data`, replacing the full dataset.
frames_to_join = [data_clean, data_toxic]
data = pd.concat(frames_to_join)

In [13]:
# Drop the fine-grained label columns; only `toxic` and `clean` are used from here on.
unused_label_columns = ['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
data.drop(columns=unused_label_columns, inplace=True)

In [14]:
# data.head(30)

In [15]:
# Write every sampled comment to ./file/<label>/text_<row index>.txt,
# where <label> is 'toxic' or 'clean'.
output_root = pathlib.Path("./file")

# Create the label directories once up front rather than on every row.
for label_dir in ('toxic', 'clean'):
    (output_root / label_dir).mkdir(parents=True, exist_ok=True)

for index, raw_text, toxic_flag in zip(data.index, data['comment_text'], data['toxic']):
    # Remove everything that is neither a word character nor whitespace,
    # then trim surrounding whitespace (a single strip() covers both ends).
    comment_text = re.sub(r'[^\w\s]','',raw_text).strip()
    classes = 'toxic' if toxic_flag == 1 else 'clean'

    # `\w` keeps non-ASCII letters, so write explicitly as UTF-8; the default
    # encoding of open() depends on the platform locale and may fail or
    # produce differently encoded files on non-UTF-8 systems.
    target_file = output_root / classes / "text_{}.txt".format(index)
    with open(target_file, "w", encoding="utf-8") as text_file:
        text_file.write(comment_text)

In [16]:
# Local root folder to scan, and the accumulator for [path, label] rows.
directory = 'file/'
data_path = list()

In [17]:
# create data csv
# Collect one [gcs_uri, label] row per .txt file found under `directory`.
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every row.
data_path = []
for subdir, dirs, files in os.walk(directory):
    # The label is the name of the folder that directly contains the file.
    label = os.path.basename(subdir)
    # Object paths in a gs:// URI always use "/", so normalise the local
    # separator instead of joining with os.sep; also avoid a doubled slash
    # when `subdir` already ends with one.
    remote_dir = subdir.replace(os.sep, '/').rstrip('/')
    for file in files:
        if not file.endswith(".txt"):
            continue
        filepath = '{}/{}'.format(remote_dir, file)
        entry = ['{}/{}'.format('gs://plb-ecg-automl-toxicity-demo-2021-03',filepath), label]
        data_path.append(entry)

In [18]:
# Turn the collected [path, label] rows into a two-column DataFrame.
path_label_array = np.array(data_path)
data_pd = pd.DataFrame(path_label_array)

In [19]:
# Write the rows to data.csv with neither a header line nor an index column.
data_pd.to_csv(path_or_buf="data.csv", header=False, index=False)

In [None]:
# Copy (not move: the local files stay in place) the `file` directory and
# data.csv to the Cloud Storage bucket.
# Flags: -m parallel transfer, -q quiet output, -r recurse into `file`.
!gsutil -m -q cp -r file data.csv gs://plb-ecg-automl-toxicity-demo-2021-03/

In [None]:
# Print a completion marker once the preceding cells have run.
!echo Done!