TODO : 
- Prétraiter mes textes pour éliminer les caractères parasites avant d'utiliser le tokenizer de Keras
- Utiliser des embeddings pré-entraînés, notamment fastText, pour initialiser ma couche d'embeddings
    https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
    https://www.kaggle.com/sbongo/do-pretrained-embeddings-give-you-the-extra-edge
    https://www.kaggle.com/mschumacher/using-fasttext-models-for-robust-embeddings
    https://www.kaggle.com/sterby/fasttext-like-baseline-with-keras-lb-0-053
    
- Utiliser un lstm bidirectionnel / GRU
    https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-069
    https://machinelearningmastery.com/develop-bidirectional-lstm-sequence-classification-python-keras/
- Augmenter la profondeur de mon NN
- Rajouter des features (taille du texte, nombre de caractères spéciaux tels que @, -, +, .)
- Essayer avec un CNN 
- Utiliser des méthodes de bagging et voir ce que ça donne

In [2]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [3]:
import string
import re

import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Dropout, Activation, Add
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [4]:
# Relative locations of the train and test CSV files.
train_data_path, test_data_path = "data/train.csv", "data/test.csv"

# Only the training split is read in this cell.
train_set = pd.read_csv(filepath_or_buffer=train_data_path)

In [5]:
# Names of the six binary target columns; the model emits one output per entry.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Select the targets by name rather than by position, so the label order is
# guaranteed to follow list_classes whatever the column layout of the CSV.
Y_train = train_set[list_classes]

# Raw comment texts, used for both tokenisation and feature extraction.
list_sentences_train = train_set["comment_text"]

In [35]:
class FeatureExtractor:
    """Compute simple hand-crafted statistics for a collection of comments."""

    def __init__(self, punctuation_set):
        """Remember which characters count as punctuation.

        punctuation_set: iterable of strings (typically single characters)
            whose occurrences are counted in every comment.
        """
        self.punctuation_set = punctuation_set

    @staticmethod
    def _ratio(count, total):
        """Return count / total, or 0.0 when total is zero."""
        return count / total if total else 0.0

    def extract_features(self, comments_list):
        """Extract four per-comment features from an iterable of comments.

        For every comment (coerced to ``str`` first):
        1. number of lines, i.e. number of line breaks + 1;
        2. punctuation occurrences divided by the number of non-space
           characters (line breaks are treated as spaces);
        3. ASCII capital letters divided by the number of non-space
           characters;
        4. number of distinct whitespace-separated words divided by the
           total number of words.

        Any ratio whose denominator is zero (empty or whitespace-only
        comment) is reported as 0.0 instead of raising ZeroDivisionError.

        Returns a tuple of four lists, aligned with ``comments_list``:
        (sentences_count, punctuation_list, capital_letters_list,
        unique_words_list).
        """
        sentences_count = []
        punctuation_list = []
        capital_letters_list = []
        unique_words_list = []
        for comment in comments_list:
            # Coerce once so that non-string entries are handled the same
            # way by every feature instead of crashing on `.replace`.
            text = str(comment)
            sentences_count.append(text.count("\n") + 1)

            clean_comment = text.replace('\n', ' ')
            nb_letters = len(clean_comment.replace(' ', ''))

            nb_punctuation = sum(map(clean_comment.count, self.punctuation_set))
            punctuation_list.append(self._ratio(nb_punctuation, nb_letters))

            nb_capitals = len(re.findall(r'[A-Z]', clean_comment))
            capital_letters_list.append(self._ratio(nb_capitals, nb_letters))

            words = clean_comment.split()
            unique_words_list.append(self._ratio(len(set(words)), len(words)))
        return sentences_count, punctuation_list, capital_letters_list, unique_words_list

In [7]:
# Materialise the training comments and compute the hand-crafted features.
comment_list = list(list_sentences_train)
feature_extractor = FeatureExtractor(set(string.punctuation))
(sentences_count,
 punctuation_list,
 capital_letters_list,
 unique_words_list) = feature_extractor.extract_features(comment_list)

In [8]:
# Put the comment text and its four features side by side in one frame.
df_list_sentences_train = list_sentences_train.to_frame()
for column_name, column_values in (
    ('nb_sentences', sentences_count),
    ('nb_punctuation', punctuation_list),
    ('nb_capital', capital_letters_list),
    ('nb_unique_words', unique_words_list),
):
    df_list_sentences_train[column_name] = column_values

In [9]:
# Keep every column except the first one (the raw comment text).
fixed_features = df_list_sentences_train.iloc[:, 1:]

In [10]:
# Vocabulary cap handed to the tokenizer; also reused as the embedding input_dim.
max_features = 50000

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train.tolist())

# Convert every training comment into its sequence of token ids.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)

In [11]:
# Fixed sequence length fed to the network.
maxlen = 300
X_train = pad_sequences(sequences=list_tokenized_train, maxlen=maxlen)

In [21]:
# Append the hand-crafted features to the right of the padded token ids.
# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and no
# longer exists in recent pandas releases.
X_final = np.concatenate((X_train, fixed_features.values), axis=1)
X_final.shape

(159571, 304)

In [28]:
class DataGenerator:
    """Endless mini-batch generator over the notebook-level training data.

    Reads the module-level names ``X_final`` (token ids followed by the
    hand-crafted features), ``Y_train`` (labels) and ``maxlen`` (number of
    token-id columns at the left of ``X_final``).
    """

    def __init__(self):
        # Row at which the next batch starts. It lives on the instance, so
        # every generator created from the same instance shares it.
        self.index_train = 0

    def generate_data(self, batch_size, train):
        """Yield ``([comments, fixed_features], [labels])`` batches forever.

        batch_size: number of rows per batch.
        train: currently unused; kept so existing callers keep working.

        Batches are consecutive rows starting at ``self.index_train``; when
        the end of the data is reached the batch is completed with rows from
        the beginning. ``comments`` holds the first ``maxlen`` columns of
        ``X_final`` and ``fixed_features`` all remaining columns, so the
        number of hand-crafted features is no longer hard-coded. Labels are
        always yielded as a NumPy array wrapped in a one-element list.

        The data arrays are captured when iteration starts.
        """
        features = np.asarray(X_final)
        targets = np.asarray(Y_train)
        n_samples = len(features)
        while True:
            # Modular row indices: a plain consecutive range while the batch
            # fits, wrapping around to row 0 once it runs past the end.
            rows = (self.index_train + np.arange(batch_size)) % n_samples
            batch = features[rows]
            self.index_train = (self.index_train + batch_size) % n_samples
            yield [batch[:, :maxlen], batch[:, maxlen:]], [targets[rows]]

In [14]:
# Text branch: token ids -> embeddings -> GRU over the sequence -> max over time.
embed_size = 128
comment_layer_input = Input(shape=(maxlen,))
comment_layer_embed = Embedding(
    input_dim=max_features,
    output_dim=embed_size,
)(comment_layer_input)
comment_layer_gru = GRU(
    units=64,
    return_sequences=True,  # keep every timestep so the pooling layer can reduce over them
    name='gru_layer',
)(comment_layer_embed)
comment_layer = GlobalMaxPool1D()(comment_layer_gru)
comment_layer = Dropout(rate=0.2)(comment_layer)

In [15]:
# Hand-crafted feature branch: the 4 per-comment statistics go through a
# 64-unit ReLU layer.
fixed_features_input = Input(shape=(4,))
fixed_features_dense_layer = Dense(64, activation='relu')(fixed_features_input)

In [16]:
# Fuse the two branches with an Add layer, then classify.
merge_layer = Add()([comment_layer, fixed_features_dense_layer])
global_dense_layer = Dense(units=64, activation="relu")(merge_layer)
global_dense_layer = Dropout(rate=0.2)(global_dense_layer)
# One sigmoid unit per entry of list_classes.
output = Dense(units=len(list_classes), activation="sigmoid")(global_dense_layer)

In [17]:
# Two-input model: padded token ids and the four hand-crafted features.
model = Model(
    inputs=[comment_layer_input, fixed_features_input],
    outputs=output,
)
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 128)     6400000     input_1[0][0]                    
__________________________________________________________________________________________________
gru_layer (GRU)                 (None, 300, 64)      37056       embedding_1[0][0]                
__________________________________________________________________________________________________
global_max_pooling1d_1 (GlobalM (None, 64)           0           gru_layer[0][0]                  
__________________________________________________________________________________________________
input_2 (I

In [18]:
# define early stopping callback
earlystop = EarlyStopping(monitor='val_acc', min_delta=0.0001, patience=5)
callbacks_list = [earlystop]

In [29]:
# Fresh generator state: batches start again from row 0 of the training data.
data_generator = DataGenerator()

In [30]:
batch_size = 64
epochs = 50
# Keras 2 argument names: `steps_per_epoch` is the number of batches drawn
# from the generator per epoch and `epochs` replaces the legacy `nb_epoch`.
# The legacy `samples_per_epoch=200` was already interpreted as 200 batches,
# so the training schedule is unchanged; only the deprecation warnings go away.
# NOTE(review): callbacks_list (EarlyStopping on 'val_acc') is defined in an
# earlier cell but is not passed here, and no validation data is supplied -
# confirm whether early stopping was intended.
model_info = model.fit_generator(
    data_generator.generate_data(batch_size, True),
    steps_per_epoch=200,
    epochs=epochs,
)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
 25/200 [==>...........................] - ETA: 50s - loss: 0.0364 - acc: 0.9865

KeyboardInterrupt: 

In [31]:
# Save model
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='gru_toxic_comments_classifier.h5')

In [33]:
# Read the test split and keep its comment texts.
test_set = pd.read_csv(filepath_or_buffer=test_data_path)
list_sentences_test = test_set.loc[:, "comment_text"]

In [37]:
# Compute the same hand-crafted features for the test comments.
feature_extractor = FeatureExtractor(set(string.punctuation))
test_comment_list = list(list_sentences_test)
(test_sentences_count,
 test_punctuation_list,
 test_capital_letters_list,
 test_unique_words_list) = feature_extractor.extract_features(test_comment_list)

In [38]:
# Put the test comment text and its four features side by side in one frame.
df_list_sentences_test = list_sentences_test.to_frame()
for column_name, column_values in (
    ('nb_sentences', test_sentences_count),
    ('nb_punctuation', test_punctuation_list),
    ('nb_capital', test_capital_letters_list),
    ('nb_unique_words', test_unique_words_list),
):
    df_list_sentences_test[column_name] = column_values

In [39]:
# Keep every column except the first one (the raw comment text).
test_fixed_features = df_list_sentences_test.iloc[:, 1:]

In [41]:
# Reuse the tokenizer fitted on the training comments.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Pad to the same `maxlen` that was used for training; it is deliberately not
# redefined here, so the test sequences cannot drift from the length the
# model's input layer was built with.
X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [42]:
# Append the hand-crafted features to the right of the padded token ids.
# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and no
# longer exists in recent pandas releases.
X_test_final = np.concatenate((X_test, test_fixed_features.values), axis=1)
X_test_final.shape

(153164, 304)

In [45]:
# Split the combined matrix back into the model's two inputs and predict.
Y_test = model.predict(
    [
        X_test_final[:, :maxlen],  # padded token ids
        X_test_final[:, maxlen:],  # hand-crafted features
    ]
)

In [46]:
# Display the dimensions of the prediction array.
Y_test.shape

(153164, 6)

In [47]:
# Start from an empty frame; the prediction columns are added to it afterwards.
final_df = pd.DataFrame()

In [48]:
# Copy each prediction column into the frame under its label name.
for column_position, label_name in enumerate(
    ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
):
    final_df[label_name] = Y_test[:, column_position]

In [49]:
# Preview the first rows of the prediction frame.
final_df.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.998957,0.4338047,0.989188,0.07132328,0.942847,0.149793
1,0.000104,2.341396e-09,1.4e-05,6.256165e-07,5e-06,2e-06
2,0.000788,3.134857e-08,7e-05,4.045759e-06,3.9e-05,1e-05
3,0.000175,3.237983e-09,1.8e-05,6.864863e-07,1e-05,2e-06
4,0.000821,1.43125e-07,0.000131,1.46779e-05,7.3e-05,3.3e-05


In [50]:
# Take the ids from the sample submission file and place them in the first column.
submissions = pd.read_csv(filepath_or_buffer="data/sample_submission.csv")
final_df.insert(loc=0, column='id', value=submissions['id'])

In [51]:
# Preview the frame again now that the id column has been inserted.
final_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998957,0.4338047,0.989188,0.07132328,0.942847,0.149793
1,0000247867823ef7,0.000104,2.341396e-09,1.4e-05,6.256165e-07,5e-06,2e-06
2,00013b17ad220c46,0.000788,3.134857e-08,7e-05,4.045759e-06,3.9e-05,1e-05
3,00017563c3f7919a,0.000175,3.237983e-09,1.8e-05,6.864863e-07,1e-05,2e-06
4,00017695ad8997eb,0.000821,1.43125e-07,0.000131,1.46779e-05,7.3e-05,3.3e-05


In [52]:
# Write the predictions to CSV without the pandas row index.
final_df.to_csv(path_or_buf='gru_fixed_features_df.csv', index=False)