In [None]:
import numpy as np
import csv as csv
import matplotlib.pyplot as plt
import pandas as pd
from math import log
%matplotlib inline
# Read the line-delimited JSON message dump with pandas.
print("Loading dataset...")
df = pd.read_json(
    'messages.json',
    lines=True,
    encoding='utf8',
)
print('..done')

In [None]:
from keras.preprocessing import sequence
from sklearn.feature_extraction import FeatureHasher
from keras.preprocessing.text import text_to_word_sequence
import keras.backend as backend
import mmh3 as mmh3
import re
from nltk.stem.porter import PorterStemmer
import unidecode
from nltk.corpus import stopwords


# Keep only the messages written in one of the supported languages.
# The explicit .copy() makes myDf an independent frame: the column assignments
# performed on myDf further down in this notebook then write to myDf itself
# instead of to a slice of df (which triggers pandas' SettingWithCopyWarning).
myDf = df[df['language'].isin(['en','de','it','fr','nl'])].copy()

def replaceUmlauts(string):
    """Return ``string`` after passing it through ``unidecode.unidecode``.

    Despite the name, nothing here is specific to umlauts: the whole string
    is handed to unidecode and its result is returned unchanged.
    """
    transliterated = unidecode.unidecode(string)
    return transliterated

def preprocess(doc):
    """Turn one raw message into a list of normalised word tokens.

    The text is lower-cased, passed through ``replaceUmlauts`` and split with
    Keras' ``text_to_word_sequence``. Each resulting token is then handled as
    follows:

    * tokens with length <= 2 or > 20 are dropped,
    * tokens consisting only of digits become ``'token_number'``,
    * tokens containing at least one digit become ``'token_hashlike'``,
    * all other tokens have their non-word characters removed; tokens that
      are empty afterwards are dropped.

    Args:
        doc: the message text, or None / float NaN for a missing message.

    Returns:
        A list of token strings, or None when the message is missing.
    """
    # A missing message shows up as None or (in pandas) as float NaN; a float
    # has no .lower(), so both are treated as "no message".
    if doc is None or isinstance(doc, float):
        return None

    doc = replaceUmlauts(doc.lower())

    # NOTE(review): unlike Keras' default filter list this one contains no
    # underscore, tab or newline. Presumably tokens separated only by a tab or
    # newline are therefore not split here and get merged by the regex below
    # -- confirm against the installed Keras version.
    words = text_to_word_sequence(doc, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~')

    cleaned = []
    for x in words:
        if len(x) <= 2 or len(x) > 20:
            continue
        if x.isdigit():
            cleaned.append('token_number')
            continue
        if any(char.isdigit() for char in x):
            cleaned.append('token_hashlike')
            continue

        y = re.sub(r'\W+', '', x)
        # A token made only of non-word characters (e.g. three apostrophes)
        # collapses to '' here; do not let the empty string into the vocabulary.
        if y:
            cleaned.append(y)
    return cleaned
        
        
# Replace every raw message by the token list produced by preprocess().
print("Feature cleaning")
myDf['message'] = myDf['message'].map(preprocess)
print('..done')

In [None]:
print("Building dict")
# counts:  word -> number of times the word has been seen so far.
# wordset: word -> integer index, assigned by addToDict.
# index:   next free integer index to hand out in wordset.
counts, wordset = {}, {}
index = 0

def addToDict(wordseq):
    """Fold one tokenised message into the global word statistics.

    Every token increments its entry in ``counts``. A token is given an
    integer index in ``wordset`` the second time it is seen, so ``wordset``
    only ever contains words that occurred at least twice. Indices are handed
    out consecutively from the global ``index`` counter.

    Args:
        wordseq: iterable of tokens, or None for a missing message (ignored).
    """
    # Declared once at function level rather than inside the loop body.
    global index

    # preprocess() yields None for a missing message; there is nothing to count.
    if wordseq is None:
        return

    for w in wordseq:
        if w not in counts:
            counts[w] = 1
            continue

        counts[w] += 1
        # Second or later sighting: make sure the word has a vocabulary index.
        if w not in wordset:
            wordset[w] = index
            index += 1
       
    

# Run addToDict over every tokenised message to fill counts / wordset.
myDf['message'].apply(addToDict)


# Single-argument print(...) calls behave the same on Python 2 and Python 3
# and match the print(...) style used in the other cells.
print("Uniqe words " + str(len(counts)))
print(">= 2 occurancies " + str(len(wordset)))
print('..done')

In [None]:
print("Transform to vector")

def getOrNone(word):
    """Look ``word`` up in ``wordset``; return its index, or None if absent."""
    return wordset.get(word)
    

def extract(sentence):
    """Map a token list to the list of vocabulary indices of its known words.

    Tokens for which ``getOrNone`` returns None (not in ``wordset``) are
    skipped; the order of the remaining indices follows the token order.

    Args:
        sentence: iterable of tokens, or None for a missing message.

    Returns:
        A list of integer indices; empty when ``sentence`` is None or has no
        known words.
    """
    # preprocess() yields None for a missing message; treat it as "no words"
    # instead of failing on iteration.
    if sentence is None:
        return []

    indices = (getOrNone(word) for word in sentence)
    return [idx for idx in indices if idx is not None]
    

# Turn every token list into its list of vocabulary indices (see extract).
myDf['nlp_features'] = myDf['message'].map(extract)
print('..done')

In [None]:
# Number of words that received an index in wordset, i.e. words seen at least twice.
vocabSize = len(wordset)

def toIndex(value, index):
    """Return ``index`` when ``value`` is strictly greater than 0.5, else None."""
    return index if value > 0.5 else None

# Normalise the 'features' column so that every cell is a plain list:
#   * a list keeps only its truthy entries (None, 0 and '' are dropped),
#   * any other value is wrapped into a one-element list.
# A list comprehension is used instead of filter(None, x): it yields the same
# list on Python 2, and on Python 3 it avoids handing a lazy filter object to
# the later np.array(...) call.
# NOTE(review): no visible cell creates a 'features' column -- presumably it is
# loaded from messages.json; confirm it is not meant to be 'nlp_features'.
# NOTE(review): dropping falsy entries also drops a feature value of 0.
myDf['features'] = myDf['features'].map(
    lambda x: [item for item in x if item] if type(x) is list else [x])

# Index reserved for the separator between the messages of one thread. It is
# one past the largest index handed out in wordset, hence the +1 below.
message_seperator = vocabSize
num_features = vocabSize + 1

# NOTE(review): 'nlp_features' is dropped without being read by any visible cell.
myDf = myDf.drop(columns=['nlp_features'])

In [None]:
from sklearn.utils import shuffle

# Collapse the per-message rows into one row per conversation (comId):
#   status   -> mean of the message statuses,
#   features -> the messages' feature arrays joined into one int array, with
#               message_seperator inserted between consecutive messages,
#   msgId    -> number of messages in the conversation.
# NOTE(review): `reduce` is used as a builtin, which only exists on Python 2;
# on Python 3 it would have to be imported from functools.
# NOTE(review): for a conversation with a single message `reduce` returns that
# message's features object unchanged (no int conversion); such rows are
# removed by the msgId filter below.
byComId = myDf.groupby('comId', as_index=False).agg({'status':'mean',
                                        'features': lambda series: reduce(lambda x, y: np.append(np.append(np.array(x).astype(int), message_seperator), np.array(y).astype(int)), series), 
                                        'msgId' : 'count' })  

# Keep only conversations with more than two messages, then shuffle the rows.
# No random_state is passed to shuffle, so this cell does not pin the row order.
byComId = byComId[byComId['msgId'] > 2]
byComId = shuffle(byComId)

In [None]:
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
import sklearn.metrics as sklm
from sklearn.model_selection import train_test_split
from keras.callbacks import Callback,TensorBoard,EarlyStopping,ModelCheckpoint
from keras.models import Sequential, Model
from keras.layers import Input, Conv1D, GlobalMaxPooling1D, Embedding, LSTM, Dropout, MaxPooling1D, Activation, concatenate
from keras.layers import Dense, Dropout, Embedding, LSTM, GRU, Bidirectional, Conv1D, GlobalMaxPooling1D,Bidirectional, Reshape
from keras import regularizers
import math
from sklearn.model_selection import GridSearchCV

# Maximum number of feature indices kept per thread; only the last
# seq_length entries of each thread are used.
seq_length = 500
epochs = 25
batch_size=2048
class_weight = {0:2,1:8}
num_features_all = num_features
features = byComId['features'].map(lambda x: np.array(x)[-seq_length:]).values

# Single-argument print(...) calls behave the same on Python 2 and Python 3
# and match the print(...) style used in the other cells.
print(type(features[1]))

# Bring every thread to exactly seq_length entries (pad_sequences defaults).
X =  pad_sequences(features, maxlen=seq_length)
# NOTE(review): 'status' was aggregated with 'mean' per thread, and astype(int)
# truncates that mean. Presumably only threads whose every message has
# status 1 remain positive -- confirm this is intended.
y = byComId['status'].astype(int)

print(X.shape)

print(np.mean(y))
print(np.min(y))
print(np.max(y))

In [None]:
import keras.backend as K
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, LSTM, Dropout, MaxPooling1D, Activation
from keras.layers import Dense, Dropout, Embedding, LSTM, GRU, Bidirectional, Conv1D, GlobalMaxPooling1D,Bidirectional
from keras import regularizers
import types, copy

def matthews_correlation(y_true, y_pred):
    """Matthews correlation coefficient for binary classification.

    Labels and predictions are clipped to [0, 1] and rounded, the four
    confusion-matrix counts are summed over the tensors passed in, and the
    coefficient is formed from them. Because it works on whatever tensors it
    receives, the value is a batch-wise figure, not a global one.
    """
    # Hard 0/1 decisions for predictions and labels, plus their complements.
    predicted_positive = K.round(K.clip(y_pred, 0, 1))
    actual_positive = K.round(K.clip(y_true, 0, 1))
    predicted_negative = 1 - predicted_positive
    actual_negative = 1 - actual_positive

    # Confusion-matrix counts.
    tp = K.sum(actual_positive * predicted_positive)
    tn = K.sum(actual_negative * predicted_negative)
    fp = K.sum(actual_negative * predicted_positive)
    fn = K.sum(actual_positive * predicted_negative)

    covariance = tp * tn - fp * fn
    scale = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    # epsilon keeps the division defined when one of the marginals is zero.
    return covariance / (scale + K.epsilon())



class KerasBatchClassifier(KerasClassifier):
    """KerasClassifier variant trained with an internal validation split and
    scored by the model's ``matthews_correlation`` metric.

    ``fit`` holds out part of the data it receives for validation and stops
    early on the validation Matthews correlation; ``score`` returns the
    ``matthews_correlation`` value reported by ``model.evaluate``.
    """

    def fit(self, X, y, **kwargs):
        """Build the model via ``build_fn`` and train it.

        20% of ``(X, y)`` is split off (fixed ``random_state=42``) and used as
        validation data. Training stops early when
        ``val_matthews_correlation`` has not improved for 3 epochs.
        ``batch_size``, ``epochs`` and ``class_weight`` are read from the
        module-level variables of the same names; ``**kwargs`` is accepted
        but not used.

        Returns:
            The history object returned by ``model.fit``; it is also
            available afterwards through the ``history`` property.
        """
        # Build the model from build_fn, passing only the sk_params it accepts.
        if self.build_fn is None:
            self.model = self.__call__(**self.filter_sk_params(self.__call__))
        elif not isinstance(self.build_fn, (types.FunctionType, types.MethodType)):
            self.model = self.build_fn(**self.filter_sk_params(self.build_fn.__call__))
        else:
            self.model = self.build_fn(**self.filter_sk_params(self.build_fn))

        # Only early stopping is wired in; the previously constructed but
        # never registered checkpoint callback has been removed.
        early_stopping = EarlyStopping(monitor="val_matthews_correlation", patience=3, verbose=1, mode="max")
        callbacks = [early_stopping]

        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        self.__history = self.model.fit(x_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            # Keras documents validation_data as a tuple (x_val, y_val).
            validation_data=(x_test, y_test),
            class_weight=class_weight,
            callbacks=callbacks,
            verbose=1
                   )

        return self.__history


    def score(self, X, y, **kwargs):
        """Evaluate the fitted model on ``(X, y)`` and return its MCC.

        Returns:
            The value reported under the metric name ``matthews_correlation``.

        Raises:
            Exception: if the model was compiled without that metric.
        """
        outputs = self.model.evaluate(X, y)
        if type(outputs) is not list:
            outputs = [outputs]
        for name, output in zip(self.model.metrics_names, outputs):
            if name == 'matthews_correlation':
                print('MCC: ' + str(output))
                return output

        # The hint names the metric this method actually looks for; advising
        # `metrics=["accuracy"]` would lead straight back to this error.
        raise Exception('The model is not configured to compute mcc. '
                        'You should pass `metrics=[matthews_correlation]` to '
                        'the `model.compile()` method.')



    @property
    def history(self):
        """History object stored by the most recent ``fit`` call."""
        return self.__history


In [None]:
from keras import backend as K
def create_model(embedding_size, recurrent_reg, kernel_reg, lstm_size, dropout):
    """Build and compile the stacked-LSTM binary classifier.

    Architecture: Embedding -> 3 x LSTM -> Dense(1, sigmoid), compiled with
    Adam, binary cross-entropy and the ``matthews_correlation`` and accuracy
    metrics.

    Args:
        embedding_size: output dimension of the embedding layer.
        recurrent_reg: L2 factor applied to the LSTM recurrent kernels.
        kernel_reg: L2 factor applied to the LSTM input kernels.
        lstm_size: number of units in each LSTM layer.
        dropout: rate used for both input and recurrent dropout of each LSTM.

    Returns:
        The compiled Keras model.
    """
    # Drop the previous graph so repeated builds (grid search) do not pile up.
    if K.backend() == 'tensorflow':
        K.clear_session()

    def _lstm(return_sequences):
        # One LSTM layer with fresh regularizer instances.
        # kernel_reg now regularizes the input kernel and recurrent_reg the
        # recurrent kernel. Previously recurrent_reg was passed as the kernel
        # regularizer and kernel_reg was ignored, so searching over kernel_reg
        # had no effect on the model.
        return LSTM(lstm_size,
                    kernel_regularizer=regularizers.l2(kernel_reg),
                    recurrent_regularizer=regularizers.l2(recurrent_reg),
                    dropout=dropout, recurrent_dropout=dropout,
                    return_sequences=return_sequences)

    model = Sequential()
    model.add(Embedding(num_features, output_dim=embedding_size, input_length=seq_length))
    # No input_shape on the LSTMs: the embedding already defines the input,
    # and the former (seq_length, num_features) did not describe what the
    # LSTM receives, which is (seq_length, embedding_size).
    model.add(_lstm(return_sequences=True))
    model.add(_lstm(return_sequences=True))
    model.add(_lstm(return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    model.compile('adam', 'binary_crossentropy', metrics=[matthews_correlation, 'accuracy'])
    return model

# Hyper-parameter grid; every key is an argument of create_model.
parameters = {'recurrent_reg': [0.1,0.2],
              'lstm_size': [32,48,64],
              'dropout': [0.3,0.4],
              'kernel_reg': [0.2,0.3],
              'embedding_size': [32,48,64]
              }


model = KerasBatchClassifier(build_fn=create_model, epochs=epochs, shuffle=True)
# cv=3 is stated explicitly because the results cell reads exactly
# split0_test_score .. split2_test_score. Relying on the library default
# would break that on scikit-learn releases whose default is 5 folds.
grid = GridSearchCV(estimator=model, param_grid=parameters, cv=3, return_train_score=False)
grid_result = grid.fit(X,y)
grid_result

In [None]:
# Best score / parameters, then the mean test score per parameter setting and
# the average test score of each of the three CV splits.
print(grid_result.best_score_)
print(grid_result.best_params_)
cv_results = grid_result.cv_results_
print(cv_results['mean_test_score'])
for split_key in ('split0_test_score', 'split1_test_score', 'split2_test_score'):
    print(cv_results[split_key].mean())