# Preparing the Dataset for Modeling

In [None]:
# Enter your bucket folder (without a trailing '/').
# This sets the BUCKET_FOLDER environment variable, which later cells read
# through $BUCKET_FOLDER in shell commands and through os.environ in Python.
%env BUCKET_FOLDER=gs://ekabasandbox-lcm/toxicity

In [None]:
import os
import re
import pathlib
import numpy as np
import pandas as pd

In [None]:
# Copy train.csv from the bucket folder into the current working directory.
!gsutil cp $BUCKET_FOLDER/train.csv .

In [None]:
# Load the downloaded training CSV into a DataFrame.
data = pd.read_csv("train.csv")

In [None]:
# Preview the first five rows.
data.head(n=5)

In [None]:
# add clean column label
# A row is flagged clean (1) when the sum over its label columns is not
# positive, and 0 otherwise. The label columns are taken to be every column
# from position 2 onward -- assumes the first two columns are not labels;
# confirm against train.csv.
# The 'clean' column itself is left out of the sum so that re-running this
# cell does not count a previously computed flag as a label.
label_totals = (
    data.iloc[:, 2:]
    .drop(columns=['clean'], errors='ignore')
    .sum(axis=1)
)
data['clean'] = ((1 - label_totals) >= 1).astype(int)

In [None]:
# fold every non-clean comment into the 'toxic' label
data.loc[data['clean'].eq(0), 'toxic'] = 1

In [None]:
# Randomly draw 20,000 rows whose 'clean' flag is 1.
# No random_state is passed, so the selection differs from run to run.
data_clean = data.loc[data['clean'].eq(1)].sample(n=20000)

In [None]:
# Randomly draw 16,000 rows whose 'toxic' flag is 1.
# No random_state is passed, so the selection differs from run to run.
data_toxic = data.loc[data['toxic'].eq(1)].sample(n=16000)

In [None]:
# Stack the clean and toxic samples into a single dataframe.
# The original row index is kept (no ignore_index).
data = pd.concat([data_clean, data_toxic], axis=0)

In [None]:
# Discard, in place, the label columns that are not used from here on.
data.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    inplace=True,
)

In [None]:
# data.head(30)

In [None]:
# Export every comment to ./file/<class>/text_<row index>.txt, where <class>
# is 'toxic' when the row's toxic flag equals 1 and 'clean' otherwise.
# Characters that are neither word characters nor whitespace are removed and
# surrounding whitespace is trimmed before writing.
punctuation = re.compile(r'[^\w\s]')  # compiled once, reused for every row
output_root = pathlib.Path("./file")
for index, raw_text, toxic_flag in zip(data.index, data['comment_text'], data['toxic']):
    comment_text = punctuation.sub('', raw_text).strip()
    classes = 'toxic' if toxic_flag == 1 else 'clean'

    class_dir = output_root / classes
    class_dir.mkdir(parents=True, exist_ok=True)
    # \w also matches non-ASCII letters, so they survive the cleaning step;
    # write UTF-8 explicitly rather than relying on the platform's default
    # encoding, which may be unable to encode them.
    target = class_dir / "text_{}.txt".format(index)
    with target.open("w", encoding="utf-8") as text_file:
        text_file.write(comment_text)

In [None]:
# Local folder that is scanned for exported text files, and the list that
# collects one [path, label] entry per file found.
directory = 'file/'
data_path = list()

In [None]:
# create data csv
# Collect one [cloud storage path, label] entry for every .txt file found
# under `directory`. The label is the name of the folder holding the file and
# the path is the bucket folder followed by the file's local relative path.
bucket_folder = os.environ['BUCKET_FOLDER']
# Start from an empty list so that re-running this cell does not append
# duplicate entries to the ones gathered by a previous run.
data_path = []
for subdir, _dirs, files in os.walk(directory):
    label = os.path.basename(subdir)
    for file_name in files:
        if not file_name.endswith(".txt"):
            continue
        # as_posix() always joins with '/', which keeps the path valid inside
        # a gs:// URI whatever separator the local operating system uses.
        relative_path = pathlib.Path(subdir, file_name).as_posix()
        data_path.append(['{}/{}'.format(bucket_folder, relative_path), label])

In [None]:
# Turn the collected [path, label] entries into a DataFrame (via a NumPy array).
data_pd = pd.DataFrame(data=np.asarray(data_path))

In [None]:
# Write the entries to data.csv without a header row and without the index column.
data_pd.to_csv("data.csv", header=False, index=False)

In [None]:
# Copy the local `file` folder (recursively) and data.csv to the bucket folder.
# The data.csv entries point at $BUCKET_FOLDER/file/..., which is where this
# recursive copy places the text files.
!gsutil -m -q cp -r file data.csv $BUCKET_FOLDER/

In [None]:
# Print a completion message.
!echo Done!