# Preparing the Dataset for Modeling

In [15]:
import os
import re
import pathlib
import numpy as np
import pandas as pd

In [2]:
!gsutil cp gs://ekabasandbox-lcm/toxicity/train.csv .

Copying gs://ekabasandbox-lcm/toxicity/train.csv...
- [1 files][ 65.6 MiB/ 65.6 MiB]                                                
Operation completed over 1 objects/65.6 MiB.                                     


In [3]:
data = pd.read_csv('train.csv')

In [4]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# add clean column label
data['clean'] = (1 - data.iloc[:, 2:].sum(axis=1) >= 1).astype(int)

In [6]:
# merge all other non-clean commnents to toxic
data.loc[data['clean'] == 0, ['toxic']] = 1

In [7]:
# select dataframe of clean examples
data_clean = data[data['clean'] == 1].sample(n=20000)

In [8]:
# select dataframe of toxic examples
data_toxic = data[data['toxic'] == 1].sample(n=16000)

In [9]:
# join into one dataframe
data = pd.concat([data_clean, data_toxic])

In [10]:
# remove unused columns
data.drop(['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)

In [22]:
# data.head(30)

In [16]:
for index, row in data.iterrows():
    comment_text = re.sub(r'[^\w\s]','',row['comment_text']).rstrip().lstrip().strip()
    classes = ''
    if (row['toxic'] == 1):
        classes = 'toxic'
    else:
        classes = 'clean'
    
    pathlib.Path("./file/{}".format(classes)).mkdir(parents=True, exist_ok=True) 
    with open("./file/{}/text_{}.txt".format(classes,index), "w") as text_file:
        text_file.write(comment_text)

In [17]:
data_path = []
directory = 'file/'

In [18]:
# create data csv
for subdir, dirs, files in os.walk(directory):
    for file in files:
        filepath = subdir + os.sep + file

        if filepath.endswith(".txt"):
            entry = ['{}/{}'.format('gs://ekabasandbox-lcm',filepath), os.path.basename(subdir)]
            data_path.append(entry)

In [19]:
# convert to Pandas DataFrame
data_pd = pd.DataFrame(np.array(data_path))

In [20]:
# export data to csv
data_pd.to_csv("data.csv", header=None, index=None)

In [None]:
# move to cloud storage

In [None]:
!gsutil cp -m -r file data.csv gs://ekabasandbox-lcm/toxicity/

Copying file://file/clean/text_63990.txt [Content-Type=text/plain]...
Copying file://file/clean/text_24973.txt [Content-Type=text/plain]...           
Copying file://file/clean/text_89086.txt [Content-Type=text/plain]...           
Copying file://file/clean/text_82623.txt [Content-Type=text/plain]...           
/ [4 files][  1.3 KiB/  1.3 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://file/clean/text_155903.txt [Content-Type=text/plain]...
Copying file://file/clean/text_115038.txt [Content-Type=text/plain]...          
Copying file://file/clean/text_65887.txt [Content-Type=text/plain]...           
Copying file://file/clean/text_95466.txt [Content-Type=text/plain]...           
Copying file://file/clean/text_25520.txt

# Finish