In [23]:
import pandas as pd
import zipfile
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

### Reading and Exploring the Dataset

In [24]:
def load_and_unzip_csv(zip_file_path):
    file_name = os.path.splitext(os.path.basename(zip_file_path))[0]
    
    output_directory = f"./{file_name}"
    os.makedirs(output_directory, exist_ok=True)

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(output_directory)

    files = os.listdir(output_directory)

    if len(files) == 1 and files[0].endswith('.csv'):
        csv_file_path = os.path.join(output_directory, files[0])
        df = pd.read_csv(csv_file_path)
        return df
    else:
        print("Error: The unzipped directory does not contain a single CSV file.")
        return None

In [25]:
zip_file_path = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
dataset = load_and_unzip_csv(zip_file_path)

In [26]:
dataset.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [27]:
dataset.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [28]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [29]:
dataset.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [30]:
dataset["comment_text"][3]

'"\nMore\nI can\'t make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It\'s listed in the relevant form eg Wikipedia:Good_article_nominations#Transport  "'


### Clean up the comment text

In [31]:
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [32]:
dataset.drop('id', inplace=True, axis=1)

In [33]:
dataset['comment_text'] = dataset['comment_text'].map(lambda com : clean_text(com))

In [34]:
dataset["comment_text"][3]

'more i cannot make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if no one else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up it listed in the relevant form eg wikipedia good_article_nominations transport'

In [35]:
label_column= list(dataset.columns[1:])

In [36]:
features = dataset.comment_text
labels = dataset[label_column].values
labels

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [37]:
import numpy as np
from sklearn.model_selection import train_test_split

features = dataset.comment_text
labels = dataset[label_column].values

train_features, temp_features, train_labels, temp_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42)

val_features, test_features, val_labels, test_labels = train_test_split(
    temp_features, temp_labels, test_size=0.5, random_state=42)

In [38]:
train_labels

array([[1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [39]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_features.values, train_labels))
train_dataset = train_dataset.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(8)

val_dataset = tf.data.Dataset.from_tensor_slices((val_features.values, val_labels))
val_dataset = val_dataset.cache().batch(BATCH_SIZE).prefetch(8)

test_dataset = tf.data.Dataset.from_tensor_slices((test_features.values, test_labels))
test_dataset = test_dataset.cache().batch(BATCH_SIZE).prefetch(8)

In [40]:
BUFFER_SIZE = 160000
BATCH_SIZE = 16

### Preprocess for training

In [41]:
MAX_FEATURES = 200000 # number of words in the vocab

In [42]:
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=MAX_FEATURES, output_sequence_length=1800,
                               output_mode='int')

In [43]:
encoder.adapt(train_features.values)

In [44]:
#def vectorize_text(text):
 #   text = tf.expand_dims(text, -1)
  #  return encoder(text)

In [45]:
#train_encod = vectorize_text(train_features.values)
#valid_encod = vectorize_text(val_features.values)
#test_encod = vectorize_text(test_features.values)

### Create Sequential Model

In [46]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(6, activation="sigmoid")
])

In [47]:
model.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=[tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.Recall(), tf.keras.metrics.Precision()]
)

In [48]:
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

In [49]:
history = model.fit(train_dataset, epochs=10, validation_data=val_dataset, verbose=1, callbacks=[earlystop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [50]:
model.evaluate(test_dataset)



[0.045918211340904236,
 0.9951748251914978,
 0.667140007019043,
 0.8268215656280518]

In [52]:
model.save("comment_toxic_saved_model")

In [53]:
model = tf.keras.models.load_model('comment_toxic_saved_model')

In [64]:
input_text = 'You are foolish'
input_text_ =  np.expand_dims(input_text, axis=0)
input_text_

array(['You are foolish'], dtype='<U15')

In [65]:
res = model.predict(input_text_)
res>0.5



array([[ True, False, False, False, False, False]])

In [66]:
import shutil

# Sauvegarder le modèle SavedModel dans le répertoire de travail de Kaggle
model.save("/kaggle/working/temp_model", save_format='tf')

# Compresser le dossier
shutil.make_archive("/kaggle/working/temp_model", 'zip', "/kaggle/working/temp_model")

'/kaggle/working/temp_model.zip'