## Notebook for testing the model with custom input

### Load model and functions

In [1]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from tensorflow.keras.models import load_model
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

Re-create the helper functions exactly as they were defined during model training. This is required because of issues with `sklearn.pipeline` when pickling custom models (the Keras model in this case): the pickle refers to these objects by name, so they must exist in the current namespace before the pipeline is loaded. The problem is discussed in: https://www.stefaanlippens.net/python-pickling-and-dealing-with-attributeerror-module-object-has-no-attribute-thing.html

In [2]:
# Define model
def create_model(l1_dim=10, lsa_dim=10, dropout=0.):
    """Build and compile the feed-forward Keras network used as the pipeline's classifier.

    Architecture: one hidden ReLU layer, a dropout layer, and a six-unit
    sigmoid output layer (six matches the number of label columns used by
    ``prediction_to_class``).

    Args:
        l1_dim: Number of units in the hidden dense layer.
        lsa_dim: Number of input features. NOTE(review): presumably the
            number of components produced by the 'lsa' pipeline step -
            confirm against the training notebook.
        dropout: Dropout rate applied after the hidden layer.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss, the
        Adam optimizer and an AUC metric named ``'auc'``.
    """
    # Imported here because the notebook's import cell does not bring these
    # names into scope; without them, calling this builder raises NameError.
    from tensorflow.keras.layers import Dense, Dropout
    from tensorflow.keras.metrics import AUC
    from tensorflow.keras.models import Sequential

    model = Sequential()
    model.add(Dense(l1_dim, input_dim=lsa_dim, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(6, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[AUC(name='auc')])
    return model

# Wrap the model builder in a scikit-learn compatible estimator so it can be
# used as a pipeline step.
keras_estimator = KerasClassifier(
    build_fn=create_model,
    verbose=3,
    epochs=20,
)

In [3]:
# Pipeline skeleton: TF-IDF features -> truncated SVD ('lsa') -> Keras classifier
pipeline_estimator = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('lsa', TruncatedSVD()),
    ('nn', keras_estimator),
])

Define functions for cleaning comments and for converting the predicted probabilities into a table of class labels

In [4]:
def clean_comments(df: pd.Series) -> pd.Series:
    """Normalise raw comment text before it is passed to the model.

    Steps: lowercase, turn every whitespace character into a plain space,
    drop every character that is not ``a-z``, a space or an apostrophe, then
    collapse repeated spaces and trim the ends.

    Args:
        df: Series of comment strings (despite the name, a Series is
            expected because the ``.str`` accessor is used).

    Returns:
        Series of cleaned strings with the same index as the input.
    """
    # Lowercase
    df_clean = df.str.lower()

    # Change whitespace characters (tabs, newlines, ...) to plain spaces
    df_clean = df_clean.str.replace(r'\s', ' ', regex=True)

    # Keep only lowercase a-z, spaces and apostrophes
    df_clean = df_clean.str.replace(r"[^a-z ']+", '', regex=True)

    # Normalize spaces (for readability): collapse runs and trim the ends.
    # Done with .str operations so the caller's index is preserved; rebuilding
    # the Series from a Python list would silently reset it to 0..n-1.
    df_clean = df_clean.str.replace(r' +', ' ', regex=True).str.strip()

    return df_clean

def prediction_to_class(pred_list, input_comment):
    """Label model outputs and attach the comments they belong to.

    Args:
        pred_list: Two-dimensional array-like with six values per row, one
            for each label column listed below.
        input_comment: The original comments, one per row of ``pred_list``.

    Returns:
        DataFrame with the six label columns rounded to the nearest whole
        number, plus a ``comment`` column holding ``input_comment``.
    """
    label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # round() only affects the numeric label columns; the text column is
    # carried through unchanged.
    return (
        pd.DataFrame(pred_list, columns=label_names)
        .assign(comment=input_comment)
        .round()
    )

Load the model

In [5]:
# Load the pickled scikit-learn pipeline.
# NOTE(review): joblib.load unpickles arbitrary Python objects - only point it
# at model files you trust.
pipeline_path = 'models/pipeline.pkl'
keras_model_path = 'models/keras_model.h5'
test_model = joblib.load(pipeline_path)

# Attach the separately saved Keras network to the pipeline's 'nn' step
nn_step = test_model.named_steps['nn']
nn_step.model = load_model(keras_model_path)

### Input comments and predict their toxicity classes

In [6]:
# Example comments to classify; edit this list to try your own text
input_comment = [
    'You sir are a fine man :)',
    'This is a second example of non-toxic comment',
]

In [8]:
# Clean the raw comments before scoring
input_comment_df = pd.Series(input_comment)
clean_comment_df = clean_comments(input_comment_df)

# Score with the loaded pipeline and turn the probabilities into a labelled table
pred = prediction_to_class(test_model.predict_proba(clean_comment_df), input_comment)

print('Predicted classes:')
pred.head()

Predicted classes:


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment
0,0.0,0.0,0.0,0.0,0.0,0.0,You sir are a fine man :)
1,0.0,0.0,0.0,0.0,0.0,0.0,This is a second example of non-toxic comment
