# Predictive NLP Classifier: LASER by Facebook

In [1]:
#workflow and statistics
import pandas as pd
import numpy as np
import matplotlib as matplotlib
import matplotlib.pyplot as plt

#for showing missing values
import missingno as msno

#visualisation
import seaborn as sns

#natural language processing toolkit
import nltk
import string
from nltk.corpus import stopwords  # removes useless words
from nltk.stem.lancaster import LancasterStemmer  #converts the words to base form; aggressive
from nltk.stem import porter
from nltk.stem.util import suffix_replace, prefix_replace
from nltk.stem.api import StemmerI
from nltk.stem import SnowballStemmer

#create a wordcloud of often used words
import wordcloud
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
nltk.download('stopwords')

#accessing google cloud storage
#from google.cloud import storage
#from io import BytesIO
#client = storage.Client()
#bucket = "bilderkennung_nf_2020"

#building baseline classifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline

# Grid search cross validation
from sklearn.model_selection import GridSearchCV

# LSTM 
import re
from tqdm import tqdm_notebook

from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from keras.callbacks import EarlyStopping


import os
import glob

# LASER Embeddings
from laserembeddings import Laser

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [None]:
#train_data = pd.read_csv("gs://bilderkennung_nf_2020/data /jigsaw-toxic-comment-train.csv")

In [2]:
train_data = pd.read_csv('./jigsaw-toxic-comment-train.csv')

In [3]:
train_data = train_data.drop('id', axis=1)

In [4]:
train_data.head(2)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


In [65]:
train_data.shape

(223549, 7)

In [5]:
#valid_data = pd.read_csv("gs://bilderkennung_nf_2020/data /validation.csv")

In [6]:
val_data = pd.read_csv("./validation.csv")

In [7]:
val_data = val_data.drop('id', axis=1)

In [8]:
val_data.head(2)

Unnamed: 0,comment_text,lang,toxic
0,Este usuario ni siquiera llega al rango de ...,es,0
1,Il testo di questa voce pare esser scopiazzato...,it,0


In [66]:
val_data.shape

(8000, 3)

In [9]:
test_data = pd.read_csv("./test.csv")

In [10]:
test_data = test_data.drop('id', axis=1)

In [11]:
test_data.head(2)

Unnamed: 0,content,lang
0,Doctor Who adlı viki başlığına 12. doctor olar...,tr
1,"Вполне возможно, но я пока не вижу необходимо...",ru


In [67]:
test_data.shape

(63812, 2)

## Preprocessing

remove stopwords, punctuation and make everything lowercase:

In [12]:
def remove_punctuation(text):
    '''a function for removing punctuation'''
    import string
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
train_data['comment_text'] = train_data['comment_text'].apply(remove_punctuation)
val_data['comment_text'] = val_data['comment_text'].apply(remove_punctuation)
test_data['content'] = test_data['content'].apply(remove_punctuation)

In [13]:
sw=stopwords.words('english')
def removesw(text):
    '''a function for removing the stopword'''
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # joining the list of words with space separator
    return " ".join(text)
train_data['comment_text'] = train_data['comment_text'].apply(removesw)
val_data['comment_text'] = val_data['comment_text'].apply(removesw)
test_data['content'] = test_data['content'].apply(removesw)

In [14]:
stemmer = SnowballStemmer("english")

def stemming(text):    
    '''a function which stems each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 
train_data['comment_text'] = train_data['comment_text'].apply(stemming)
val_data['comment_text'] = val_data['comment_text'].apply(stemming)
test_data['content'] = test_data['content'].apply(stemming)

## Prepare Model

In [72]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, LSTM # 1/2
from tensorflow.keras.preprocessing.text import Tokenizer #0/1
from tensorflow.keras.preprocessing.sequence import pad_sequences #0/1
from tensorflow.keras import Sequential # 1/1
from tensorflow.keras.utils import to_categorical #0/1

import re

In [83]:
train = train_data

labels = ['toxic']
y_train = train[labels].values
comments_train = train['comment_text']
comments_train = list(comments_train)

In [86]:
val = val_data

labels = ['toxic']
y_val = val[labels].values
comments_val = val['comment_text']
comments_val = list(comments_val)

In [89]:
#tokenize

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(comments_train)
sequences1 = tokenizer.texts_to_sequences(comments_train)
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))

Vocabulary size: 308594


In [90]:
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(comments_val)
sequences2 = tokenizer.texts_to_sequences(comments_val)
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))

Vocabulary size: 69367


In [91]:
#pad_sentences

data = pad_sequences(sequences1, padding = 'post', maxlen = 200)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

Shape of data tensor: (223549, 200)
Shape of label tensor: (223549, 1)


In [92]:
data1 = pad_sequences(sequences2, padding = 'post', maxlen = 200)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

Shape of data tensor: (223549, 200)
Shape of label tensor: (223549, 1)


In [93]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = y[indices]

In [96]:
num_validation_samples = int(0.2*data.shape[0])
x_train = data[: -num_validation_samples]
y_train = labels[: -num_validation_samples]
x_val = data[-num_validation_samples: ]
y_val = labels[-num_validation_samples: ]
print('Number of entries in each category:')
print('training: ', y_train.sum(axis=0))
print('validation: ', y_val.sum(axis=0))

Number of entries in each category:
training:  [17141]
validation:  [4243]


In [99]:
X1= train_data['comment_text']
X1= np.asarray(X1)
Y1= train_data['toxic']
Y1= np.asarray(Y1)

In [100]:
#validation data
X2= val_data['comment_text']
X2= np.asarray(X2)
Y2= val_data['toxic']
Y2= np.asarray(Y2)

comments = list(train_data['comment_text']) + list(val_data['comment_text'])

print(comments[0])
print(len(comments))

[LASER for NLP](https://www.engati.com/blog/laser-for-nlp-tasks-part-ii)

In [101]:
#insert LASER
laser = Laser()
embeddings = laser.embed_sentences(
    data, # !
    lang= ['en']) # add other datasets, add languages, , 'es', 'it', 'tr', 'ru', 'fr', 'pt', 'de'

In [102]:
Y1 = np.asarray(Y1).astype('float32').reshape((-1,1))
Y2 = np.asarray(Y2).astype('float32').reshape((-1,1))

In [103]:
# for modeling
model = Sequential()
model.add(Dense(2,activation='sigmoid'))
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])
model.fit(X1, Y1, validation_data=(X2, Y2), epochs= 5)

Epoch 1/5


ValueError: in user code:

    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:749 train_step
        y, y_pred, sample_weight, regularization_losses=self.losses)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/keras/engine/compile_utils.py:204 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/keras/losses.py:149 __call__
        losses = ag_call(y_true, y_pred)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/keras/losses.py:253 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/keras/losses.py:1605 binary_crossentropy
        K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/keras/backend.py:4823 binary_crossentropy
        return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/student/opt/anaconda3/envs/nf/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:174 sigmoid_cross_entropy_with_logits
        (logits.get_shape(), labels.get_shape()))

    ValueError: logits and labels must have the same shape ((None, 2) vs (None, 1))


In [104]:
model.summary()

Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 2)                 4         
Total params: 4
Trainable params: 4
Non-trainable params: 0
_________________________________________________________________


In [None]:
#save the model
model.save('lase_model.h5')