In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# I'm loading the Jigsaw train CSV and validation CSV provided for the Jigsaw Multilingual Toxic Comment Classification competition on Kaggle to further challenge my understanding of the material.

In [2]:
# Input files: the training set and the separately provided validation set.
TRAIN_CSV = "jigsaw-unintended-bias-train.csv"
VALIDATION_CSV = "validation.csv"

jigsaw = pd.read_csv(TRAIN_CSV)
validation = pd.read_csv(VALIDATION_CSV)
jigsaw.head()

Unnamed: 0,id,comment_text,toxic,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [3]:
#'rating' is easier to work with as binary indicator columns,
#one column per distinct rating value
rating_series = jigsaw['rating']
rating_dummy = pd.get_dummies(rating_series)
rating_dummy

Unnamed: 0,approved,rejected
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
1902189,1,0
1902190,1,0
1902191,0,1
1902192,1,0


In [4]:
#Upon visual realization of this, I add them to the dataset and remove rating.
#The drop returns a new frame (no inplace mutation) and errors='ignore' makes
#the cell safe to re-run once 'rating' has already been removed.
jigsaw['approved rating'] = rating_dummy['approved']
jigsaw['rejected rating'] = rating_dummy['rejected']
jigsaw = jigsaw.drop(columns=['rating'], errors='ignore')

In [5]:
#quick look at the first five rows after the column changes
jigsaw.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,approved rating,rejected rating
0,59848,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0,0.0,0,4,0,1
1,59849,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0,0.0,0,4,0,1
2,59852,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0,0.0,0,4,0,1
3,59855,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0,0.0,0,4,0,1
4,59856,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,0,0,0,1,0,0.0,4,47,0,1


In [6]:
#count the missing values in every column
jigsaw.isna().sum()

id                                           0
comment_text                                 0
toxic                                        0
severe_toxicity                              0
obscene                                      0
identity_attack                              0
insult                                       0
threat                                       0
asian                                  1475487
atheist                                1475487
bisexual                               1475487
black                                  1475487
buddhist                               1475487
christian                              1475487
female                                 1475487
heterosexual                           1475487
hindu                                  1475487
homosexual_gay_or_lesbian              1475487
intellectual_or_learning_disability    1475487
jewish                                 1475487
latino                                 1475487
male         

## A large portion of the columns are missing the majority of their values, so I will be omitting those columns.

# Let's check out some correlations!

In [8]:
#Approval: correlation of the 'approved rating' dummy with 'toxic'.
#Both series are read from `jigsaw`; `data` is only created in a later cell,
#so referencing it here fails on a fresh kernel.
np.corrcoef(jigsaw['approved rating'], jigsaw['toxic'])[0, 1]

-0.20962787669403726

In [9]:
#Rejection: correlation of the 'rejected rating' dummy with 'toxic'
rejected_corr = np.corrcoef(jigsaw['rejected rating'], jigsaw['toxic'])
rejected_corr[0, 1]

0.20962787669403726

In [10]:
#Severely Toxic: correlation of 'severe_toxicity' with 'toxic'
severe_toxicity_corr = np.corrcoef(jigsaw['severe_toxicity'], jigsaw['toxic'])
severe_toxicity_corr[0, 1]

0.3935944017368648

In [11]:
#Insult: correlation of 'insult' with 'toxic'
insult_corr = np.corrcoef(jigsaw['insult'], jigsaw['toxic'])
insult_corr[0, 1]

0.9282705306721125

In [12]:
#Threat: correlation of 'threat' with 'toxic'
threat_corr = np.corrcoef(jigsaw['threat'], jigsaw['toxic'])
threat_corr[0, 1]

0.28767953592511214

In [13]:
#Funny: correlation of the 'funny' reaction count with 'toxic'
funny_corr = np.corrcoef(jigsaw['funny'], jigsaw['toxic'])
funny_corr[0, 1]

-0.007334212915921778

In [14]:
#Wow: correlation of the 'wow' reaction count with 'toxic'
wow_corr = np.corrcoef(jigsaw['wow'], jigsaw['toxic'])
wow_corr[0, 1]

0.012597318186389404

In [15]:
#Sad: correlation of the 'sad' reaction count with 'toxic'
sad_corr = np.corrcoef(jigsaw['sad'], jigsaw['toxic'])
sad_corr[0, 1]

0.018103512534035682

In [16]:
#Sexually Explicit: correlation of 'sexual_explicit' with 'toxic'
sexual_explicit_corr = np.corrcoef(jigsaw['sexual_explicit'], jigsaw['toxic'])
sexual_explicit_corr[0, 1]

0.25257186656792546

In [17]:
#identity: correlation of 'identity_annotator_count' with 'toxic'
identity_annotator_corr = np.corrcoef(jigsaw['identity_annotator_count'], jigsaw['toxic'])
identity_annotator_corr[0, 1]

0.024504930952232582

In [19]:
#toxicity: correlation of 'toxicity_annotator_count' with 'toxic'
toxicity_annotator_corr = np.corrcoef(jigsaw['toxicity_annotator_count'], jigsaw['toxic'])
toxicity_annotator_corr[0, 1]

0.23710836454058706

# funny, wow, sad, and identity_annotator_count have correlations below 0.2 so I will be omitting them from here on out.

In [20]:
#list of the columns I consider significant to toxicity prediction
#(the comment text itself, the 'toxic' label and the retained features)
corr_cols = [
    'comment_text', 'severe_toxicity', 'toxic', 'obscene', 'threat',
    'insult', 'approved rating', 'rejected rating', 'sexual_explicit',
]
data = jigsaw.loc[:, corr_cols]
data.head()

Unnamed: 0,comment_text,severe_toxicity,toxic,obscene,threat,insult,approved rating,rejected rating,sexual_explicit
0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,0,1,0.0
1,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,0,1,0.0
2,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,0,1,0.0
3,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,0,1,0.0
4,haha you guys are a bunch of losers.,0.021277,0.893617,0.0,0.0,0.87234,0,1,0.0


In [21]:
# Summary statistics for the numeric columns of the reduced frame
numeric_summary = data.describe()
numeric_summary

Unnamed: 0,severe_toxicity,toxic,obscene,threat,insult,approved rating,rejected rating,sexual_explicit
count,1902194.0,1902194.0,1902194.0,1902194.0,1902194.0,1902194.0,1902194.0,1902194.0
mean,0.004585531,0.1030068,0.01388516,0.009298498,0.08117227,0.9336372,0.06636284,0.006595598
std,0.02286902,0.1970813,0.06465998,0.04939469,0.1760987,0.2489153,0.2489153,0.04525432
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.0,0.1666667,0.0,0.0,0.09090909,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
#From here I split the jigsaw training set into 3 different datasets:
#20% is held out for test, then 20% of the remainder for validation.
#A fixed random_state makes the split (and everything trained on it)
#reproducible across kernel restarts.
SPLIT_SEED = 42
train, test = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

1217404 train examples
304351 validation examples
380439 test examples


In [23]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
#Very much needed when I introduce the validation csv
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_col='toxic'):
  """Convert a pandas DataFrame into a batched ``tf.data.Dataset``.

  Args:
    dataframe: Frame holding the feature columns plus the label column.
      It is copied first, so the caller's frame is left untouched.
    shuffle: When True, shuffle using a buffer as large as the frame.
    batch_size: Number of rows per batch.
    label_col: Name of the column split off as the label. Defaults to
      'toxic', the label used throughout this notebook.

  Returns:
    A dataset yielding ``(features, labels)`` batches, where ``features``
    is a dict keyed by the remaining column names.

  Raises:
    KeyError: If ``label_col`` is not a column of ``dataframe``.
  """
  dataframe = dataframe.copy()
  # pop() removes the label from the features so it cannot leak into them.
  labels = dataframe.pop(label_col)
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [24]:
# Tiny batches keep the printed examples short while testing the pipeline
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)
# The held-out splits are built without shuffling
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

In [25]:
#batch test looks good: print the feature names, the comments and the labels
#of a single batch
for batch in train_ds.take(1):
  feature_batch, label_batch = batch
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of comments:', feature_batch['comment_text'])
  print('A batch of toxic:', label_batch )

Every feature: ['comment_text', 'severe_toxicity', 'obscene', 'threat', 'insult', 'approved rating', 'rejected rating', 'sexual_explicit']
A batch of comments: tf.Tensor(
[b"Trump's reaction is as predictable as it is sad.  He will try to claim a victory regardless of what is revealed and will never admit being wrong.  And nunes, despite some early tough talk about appropriate investigating, is proving to be a trump boot-licker and apologist.  It's not surprising that he didn't bother to confer with adam schiff before making his announcements."
 b'Looks like you win and we\'re all going to ride the Metro!  Amazing how no one knows final costs or O & M costs (you admitted it) and the "powers that be" just don\'t care and keep moving forward.  Perhaps they all should learn the definition of "fiduciary responsibility" since by their actions they surely don\'t know.'
 b'A pledge drive.   Hey NCR, how about posting the salaries of your top employees?\n\nI would really like to see how much M

In [26]:
# Features of the first training batch (labels discarded); reused below
# to demonstrate several types of feature columns
first_features, first_labels = next(iter(train_ds))
example_batch = first_features

In [27]:
# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
  """Print `example_batch` after a DenseFeatures layer built from `feature_column`.

  Reads the module-level `example_batch`; returns nothing.
  """
  transformed = layers.DenseFeatures(feature_column)(example_batch)
  print(transformed.numpy())

# The data I received had varying float types for a few of its features. With a new layer being created, I needed to set the default float type to float32, since there was an abundance of float64 values.

In [32]:
#A tool is needed to enforce uniformity among the float types
#within the data
tf.keras.backend.set_floatx('float32')

#comment_text is then categorized into 3 different features since a string
#cannot be administered to a model
#NOTE(review): the vocabulary below ('fixed', 'normal', 'reversible') does not
#look like values that occur in comment_text. The demo output of this cell is
#all zeros, i.e. no row in the batch matched any vocabulary entry. Confirm
#whether a vocabulary derived from the comments was intended.
comment_text = feature_column.categorical_column_with_vocabulary_list(
       'comment_text', ['fixed', 'normal', 'reversible'])

#One-hot encode the categorical column and show it on the example batch
comment_text_one_hot = feature_column.indicator_column(comment_text)
demo(comment_text_one_hot)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [33]:
#quick look at the first five rows of the reduced frame
data.head(n=5)

Unnamed: 0,comment_text,severe_toxicity,toxic,obscene,threat,insult,approved rating,rejected rating,sexual_explicit
0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,0,1,0.0
1,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,0,1,0.0
2,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,0,1,0.0
3,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,0,1,0.0
4,haha you guys are a bunch of losers.,0.021277,0.893617,0.0,0.0,0.87234,0,1,0.0


In [34]:
#Feed the categorical column into an embedding column, which gives a dense
#8-dimensional vector per row
comment_text_embedding = feature_column.embedding_column(
    comment_text, dimension=8)
#The embedding is the only feature column handed to the model
feature_columns = [comment_text_embedding]

In [35]:
#input layer that applies the embedded comment text column
#to batches from the jigsaw dataset
feature_layer = layers.DenseFeatures(feature_columns)

In [36]:
#The small test batch behaved as expected, so rebuild the three datasets
#with the batch size used for training
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds, test_ds = [
    df_to_dataset(held_out, shuffle=False, batch_size=batch_size)
    for held_out in (val, test)
]

# Let's Create, Compile and Fit the model!

In [37]:
# Two hidden ReLU layers on top of the feature layer, ending in one sigmoid unit.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  # Emit a probability rather than a raw logit: the string metric 'accuracy'
  # thresholds predictions at 0.5, which is only meaningful for probabilities.
  layers.Dense(1, activation='sigmoid')
])

# The loss now receives probabilities, so from_logits is left at its default (False).
# NOTE(review): the describe() output shows 'toxic' holding fractional values
# between 0 and 1 in the training frame; accuracy against non-binary labels is
# hard to interpret - consider thresholding the label first.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

# Keep the History object instead of echoing its repr as the cell output.
history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=5)

Epoch 1/5


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1a7cc88d848>

In [38]:
#evaluate on the held-out test dataset and report its accuracy
test_metrics = model.evaluate(test_ds)
loss, accuracy = test_metrics
print("Accuracy", accuracy)

Accuracy 0.7010085582733154


# The validation csv has a language feature, which I will omit since it was not a part of my core features.

In [39]:
# Drop the language column without mutating in place; errors='ignore' keeps
# the cell re-runnable once 'lang' has already been removed.
validation = validation.drop(columns=['lang'], errors='ignore')
validation.head()

Unnamed: 0,id,comment_text,toxic
0,0,Este usuario ni siquiera llega al rango de ...,0
1,1,Il testo di questa voce pare esser scopiazzato...,0
2,2,Vale. Sólo expongo mi pasado. Todo tiempo pasa...,1
3,3,Bu maddenin alt başlığı olarak uluslararası i...,0
4,4,Belçika nın şehirlerinin yanında ilçe ve belde...,0


In [40]:
#Determine batch size
batch_size = 32
#This dataset is only evaluated, so row order is irrelevant: skip the shuffle
#instead of filling a shuffle buffer as large as the frame.
#NOTE: this rebinds `val` (until now the validation split DataFrame) to a
#tf.data dataset built from the validation csv.
val = df_to_dataset(validation, shuffle=False, batch_size=batch_size)

In [41]:
#Then we score the model I have created on the validation csv dataset!
validation_metrics = model.evaluate(val)
loss, accuracy = validation_metrics
print("Accuracy", accuracy)

Accuracy 0.8462499976158142


# The validation data gives an accuracy of 84.62%, which I believe to be a desirable accuracy rating!