In [1]:
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, Dropout,Flatten,GRU

import keras.layers as layers
from keras.models import Model
from keras import backend as K 

import itertools 

import matplotlib.pyplot as plt

from keras.preprocessing import text, sequence

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping, ModelCheckpoint
np.random.seed(7)

from keras.layers import TimeDistributed


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
weight_decay = 1e-4
class Attention(Layer):
    def __init__(self, step_dim, 
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
      
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

def CNN(maxlen, max_features, embed_size, embedding_matrix,num_filters=5):
  model = Sequential()
  model.add(Embedding(max_features, embed_size, weights=[embedding_matrix],trainable=False))
  model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
  model.add(MaxPooling1D(2))
  model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
  model.add(GlobalMaxPooling1D())
  model.add(Dropout(0.5))
  model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
  model.add(Dense(2, activation='sigmoid'))
  return model;

def BidGRU(maxlen, max_features, embed_size, embedding_matrix):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],
                  trainable=False)(inp)
    x = Bidirectional(GRU(100, return_sequences=True, dropout=0.25,
                           recurrent_dropout=0.25))(x)
    x = Attention(maxlen)(x)
#    x = Flatten(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    return model

def UniGRU(maxlen, max_features, embed_size, embedding_matrix):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],
                  trainable=False)(inp)
    x = GRU(100, return_sequences=True, dropout=0.25,
                           recurrent_dropout=0.25)(x)
    x = Attention(maxlen)(x)
#    x = Flatten(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    return model

def BidGRUNoAtt(maxlen, max_features, embed_size, embedding_matrix):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],
                  trainable=False)(inp)
    x = Bidirectional(GRU(300,  dropout=0.25,
                           recurrent_dropout=0.25))(x)
    #x = Attention(maxlen)(x)
    #x = Flatten(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    return model

def UniGRUNoAtt(maxlen, max_features, embed_size, embedding_matrix):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],
                  trainable=False)(inp)
    x = GRU(300,  dropout=0.25,
                           recurrent_dropout=0.25)(x)
    #x = Attention(maxlen)(x)
    #x = Flatten(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    return model

def UniLSTMNoAtt(maxlen, max_features, embed_size, embedding_matrix):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],
                  trainable=False)(inp)
    x = LSTM(300,  dropout=0.25, return_sequences=True,
                           recurrent_dropout=0.25)(x)
    #x = Attention(maxlen)(x)
    #x = Flatten(x)
    x = TimeDistributed(Dense(256, activation="relu"))(x)
    x = TimeDistributed(Dropout(0.25))(x)
    x = Flatten()(x)
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    return model
  
def BidLSTMNoAtt(maxlen, max_features, embed_size, embedding_matrix):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],
                  trainable=False)(inp)
    x = Bidirectional(LSTM(300,  dropout=0.25, return_sequences=True,
                           recurrent_dropout=0.25))(x)
    #x = Attention(maxlen)(x)
    #x = Flatten(x)
    x = TimeDistributed(Dense(256, activation="relu"))(x)
    x = TimeDistributed(Dropout(0.25))(x)
    x = Flatten()(x)
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    return model

def UniLstm(maxlen, max_features, embed_size, embedding_matrix):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],
                  trainable=False)(inp)
    x = LSTM(300, return_sequences=True, dropout=0.25,
                           recurrent_dropout=0.25)(x)
    x = Attention(maxlen)(x)
#    x = Flatten(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    return model

def BidLstm(maxlen, max_features, embed_size, embedding_matrix):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],
                  trainable=False)(inp)
    x = Bidirectional(LSTM(300, return_sequences=True, dropout=0.25,
                           recurrent_dropout=0.25))(x)
    x = Attention(maxlen)(x)
#    x = Flatten(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    return model


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
    plt.tight_layout()

def print_cm(y_test,y_pred):
    true_test_labels = ['negative','positive']
    cnf_matrix = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)

    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=true_test_labels,
                      title='Confusion matrix, without normalization')

    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=true_test_labels, normalize=True,
                      title='Normalized confusion matrix')

    plt.show()

def LstmCnn(maxlen, max_features, embed_size, embedding_matrix):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],
                  trainable=False)(inp)
    x = Bidirectional(GRU(300, return_sequences=True, dropout=0.25,
                           recurrent_dropout=0.25))(x)
    x = Attention(maxlen)(x)
#    x = Flatten(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    
    inp1 = Input(shape=(512,))
    x1 = Embedding(512, 512, weights=[embedding_matrix],
                  trainable=False)(inp1)
    x1=Conv1D(16, kernel_size=3, activation='elu', padding='same',
                             input_shape=(vector_size, 1))(x1)
    x1=Dense(512, activation='relu',input_shape=(vector_size, 1))(x1)
    x1=Dense(64, activation='relu')(x1)
    x1=Dense(8, activation='relu')(x1)
    x1=Flatten()(x1)
    x1=Dense(2, activation='softmax')(x1)
    
    model_cnn = Model(inputs=inp1, outputs=x1)

    model_cnn.add(Conv1D(16, kernel_size=3, activation='elu', padding='same',
                             input_shape=(vector_size, 1)))
    model_cnn.add(Dense(512, activation='relu',input_shape=(vector_size, 1)))
    #model.add(Dropout(0.2))
    model_cnn.add(Dense(64, activation='relu'))
    #model.add(Dropout(0.25))
    model_cnn.add(Dense(8, activation='relu'))
    #model.add(Dropout(0.25))
    model_cnn.add(Flatten())
    model_cnn.add(Dense(2, activation='softmax'))
    
    combined_model = Sequential()
    combined_model.add(Merge([model, model_cnn], mode='concat', concat_axis=1))

    return combined_model

def make_df(train_path, test_path, max_features, maxlen, list_classes, word_index):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    train = train.sample(frac=1)

    list_sentences_train = train["message"].fillna("unknown").values
    y = train[list_classes].values
    
    y_test = test[list_classes].values
    
    y=np.where(y == 'Bullish', 1.0, 0.0)
    y_test=np.where(y_test == 'Bullish', 1.0, 0.0)
    list_sentences_test = test["spans"].fillna("unknown").values

    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.word_index = word_index
    #tokenizer.fit_on_texts(list(list_sentences_train))
    list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
    #print(list_tokenized_train[0])
    list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
    X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
    X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

    #word_index = tokenizer.word_index
    
    return X_t, X_te, y, y_test

def create_sequence(word_index, sent, maxlen):
    token = text.Tokenizer()
    token.word_index=word_index
    tokenized_text = token.texts_to_sequences(sent)
    X_text = sequence.pad_sequences(tokenized_text, maxlen=maxlen)
    return X_text

def make_glovevec(glovepath, max_features, embed_size):
    embedding_matrix = np.zeros((max_features+1, embed_size))
    f = open(glovepath, encoding="utf8")
    word_index = {}
    count=0
    for line in f:
        count+=1
        if count > max_features:
            break
        else:
            values = line.split()
            word_index[values[0]]=count
            #print(values)
            #word = ' '.join(values[:-embed_size])
            coefs = np.asarray(values[-embed_size:], dtype='float32')
            embedding_matrix[count]=coefs.reshape(-1)
        #print(embeddings_index[word])
    f.close()
    
    return embedding_matrix, word_index




In [3]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-04-08 07:41:34--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-04-08 07:41:35--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-04-08 07:41:36--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [4]:
! unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
!mkdir finsent_survey

In [41]:
import keras
import pickle
from keras.utils import plot_model

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 

x_train = np.fromfile("D:/phd/github/datascience/datasets/LASER_emb/X_train.raw", dtype=np.float32, count=-1)
dim = 1024
din = 1024
x_train.resize(x_train.shape[0] // dim, din) 
X_test = np.fromfile("D:/phd/github/datascience/datasets/LASER_emb/X_test.raw", dtype=np.float32, count=-1)
X_test.resize(X_test.shape[0] // dim, din)  


model_lstm = UniGRUNoAtt(maxlen, len(word_index)+1, embed_size, embedding_vector)

file_path = "./finsent_survey/model_CNN_extended.hdf5"
ckpt = ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
                       save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=50)

history = model_full.fit(x_train, y, batch_size=32, epochs=40, validation_split=0.1,callbacks=[ckpt, early])


FileNotFoundError: [Errno 2] No such file or directory: 'D:/phd/github/datascience/datasets/LASER_emb/X_train.raw'

In [0]:
print(accuracy_score(y_test, y_pred_labels))
print(precision_score(y_test, y_pred_labels))
print(recall_score(y_test, y_pred_labels))
print(f1_score(y_test, y_pred_labels))

0.620442319187089
0.7372793354101765
0.6501831501831502
0.6909975669099757


In [0]:
print(accuracy_score(y_test, y_pred_labels))
print(precision_score(y_test, y_pred_labels))
print(recall_score(y_test, y_pred_labels))
print(f1_score(y_test, y_pred_labels))

0.6401673640167364
0.7179715302491103
0.739010989010989
0.7283393501805054


In [3]:
max_features = 100000
maxlen = 64
embed_size = 300
list_classes = ["sentiment"]
embedding_vector, word_index = make_glovevec("/home/users/kostadin.mishev/datasets/glove/glove.6B.300d.txt",
                                 max_features, embed_size)

In [4]:
def make_df(train_path, test_path, max_features, maxlen, list_classes, word_index):
    train = pd.read_csv(train_path,names=["id","sentiment","a","message"],sep='\t')
    test = pd.read_csv(test_path,names=["id","sentiment","a","spans"],sep='\t')
    #train = train.sample(frac=1)

    list_sentences_train = train["message"].fillna("unknown").values
    y = train[list_classes].values
    
    y_test = test[list_classes].values
    
    #y=np.where(y == 'Bullish', 1.0, 0.0)
    #y_test=np.where(y_test == 'Bullish', 1.0, 0.0)
    list_sentences_test = test["spans"].fillna("unknown").values

    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.word_index = word_index
    #tokenizer.fit_on_texts(list(list_sentences_train))
    list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
    #print(list_tokenized_train[0])
    list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
    X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
    X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

    #word_index = tokenizer.word_index
    
    return X_t, X_te, y, y_test

In [5]:
import keras
xtr, xte, y, y_test= make_df(r"/home/users/kostadin.mishev/phd/dataset/train/train.tsv",
                                  r"/home/users/kostadin.mishev/phd/dataset/dev/dev.tsv",
                                  max_features, maxlen, list_classes, word_index)
y = keras.utils.to_categorical(y, 2)
y_test=keras.utils.to_categorical(y_test, 2)

In [23]:
xtrain = []
for sentence in xtr:
    sen = [embedding_vector[w] for w in sentence]
    xtrain.append(sen)
xtrain_mean = np.average(xtrain,axis=1)
xtrain = np.asarray(xtrain)

In [9]:
xtrain = xtrain.reshape(xtrain.shape[0],xtrain.shape[1]*xtrain.shape[2])
print(xtrain.shape)

(1748, 19200)


In [24]:
xtest = []
for sentence in xte:
    sen = [embedding_vector[w] for w in sentence]
    xtest.append(sen)
xtest = np.asarray(xtest)
xtest_mean = np.average(xtest,axis=1)
xtest = xtest.reshape(xtest.shape[0],xtest.shape[1]*xtest.shape[2])

In [12]:
y = np.argmax(y,axis=1)

In [17]:
y_test = np.argmax(y_test,axis=1)

In [18]:
y_test

array([1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,

In [25]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC # "Support vector classifier"
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

for c in [0.0025,0.025,0.1,0.25,1,10,50,100,150,200,1000,2000,5000,10000,20000]:
    print(c)
    model = SVC(kernel='linear', C=c, gamma=0.001)
    model.fit(xtrain_mean, y)
    Y_pred = model.predict(xtest_mean)
    tn, fp, fn, tp = confusion_matrix(y_test,Y_pred).ravel() 
    mcc = matthews_corrcoef(y_test, Y_pred)

    print(tp)
    print(tn)
    print(fp)
    print(fn)
    print(mcc)
    print()

0.0025
52
192
27
167
0.14844967076245041

0.025
66
186
33
153
0.18013433204450768

0.1
72
195
24
147
0.26490647141300877

0.25
106
203
16
113
0.45837302613186937

1
148
183
36
71
0.5180745433377699

10
175
178
41
44
0.6119295638588299

50
177
179
40
42
0.6255968645319715

100
177
176
43
42
0.611878525069479

150
177
177
42
42
0.6164383561643836

200
176
177
42
43
0.611878525069479

1000
176
178
41
43
0.6164640635898989

2000
174
179
40
45
0.6120316797650658

5000
174
179
40
45
0.6120316797650658

10000
172
179
40
47
0.6030478607243883

20000
170
181
38
49
0.6035014886665504



In [27]:
import xgboost as xgb

#Train the XGboost Model for Classification
model1 = xgb.XGBClassifier()
#model2 = xgb.XGBClassifier(n_estimators=10000, max_depth=256, learning_rate=0.01)

xgb_model = model1.fit(xtrain_mean, y)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

Y_pred = xgb_model.predict(xtest_mean)

tn, fp, fn, tp = confusion_matrix(y_test,Y_pred).ravel() 
mcc = matthews_corrcoef(y_test, Y_pred)

print(tp)
print(tn)
print(fp)
print(fn)
print(mcc)

151
174
45
68
0.4867098623751561


In [33]:
import sklearn
#from sklearn.cross_validation import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import *

xgb_model = xgb.XGBClassifier()

parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}


clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=StratifiedKFold(y, n_splits=5, shuffle=True), 
                   scoring='roc_auc',
                   verbose=2, refit=True)

clf.fit(xtrain_mean, y)

TypeError: __init__() got multiple values for argument 'n_splits'

In [53]:
model_lstm = CNN(maxlen, len(word_index)+1, embed_size, embedding_vector)

model_lstm.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
file_path = "/home/users/kostadin.mishev/phd/finsent/glove/model_BidGRU_NoAtt.hdf5"
ckpt = ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
                       save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)

from keras.utils import plot_model
plot_model(model_lstm, show_shapes=True, to_file='model.png')

history = model_lstm.fit(xtr, y, batch_size=120, epochs=50, validation_data=(xte,y_test), callbacks=[ckpt, early])
y_pred=model_lstm.predict(xte)
y_pred_labels=np.argmax(y_pred,axis=1)

Train on 1748 samples, validate on 438 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.69361, saving model to /home/users/kostadin.mishev/phd/finsent/glove/model_BidGRU_NoAtt.hdf5
Epoch 2/50

Epoch 00002: val_loss did not improve from 0.69361
Epoch 3/50

Epoch 00003: val_loss improved from 0.69361 to 0.69212, saving model to /home/users/kostadin.mishev/phd/finsent/glove/model_BidGRU_NoAtt.hdf5
Epoch 4/50

Epoch 00004: val_loss improved from 0.69212 to 0.68562, saving model to /home/users/kostadin.mishev/phd/finsent/glove/model_BidGRU_NoAtt.hdf5
Epoch 5/50

Epoch 00005: val_loss improved from 0.68562 to 0.67425, saving model to /home/users/kostadin.mishev/phd/finsent/glove/model_BidGRU_NoAtt.hdf5
Epoch 6/50

Epoch 00006: val_loss improved from 0.67425 to 0.64588, saving model to /home/users/kostadin.mishev/phd/finsent/glove/model_BidGRU_NoAtt.hdf5
Epoch 7/50

Epoch 00007: val_loss improved from 0.64588 to 0.61822, saving model to /home/users/kostadin.mishev/phd/finsent/

In [0]:
import pickle
with open("./results_BidGRU+GLOVE300.pickle","wb") as f:
    pickle.dump(y_pred_labels,f)

In [56]:
model_lstm.load_weights(filepath=file_path)

y_pred=model_lstm.predict(xte)
y_pred_labels=np.argmax(y_pred,axis=1)

In [57]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

y_test_labels = np.argmax(y_test,axis=1)

print(accuracy_score(y_test_labels, y_pred_labels))
print(precision_score(y_test_labels, y_pred_labels))
print(recall_score(y_test_labels, y_pred_labels))
print(f1_score(y_test_labels, y_pred_labels))
print(matthews_corrcoef(y_test_labels, y_pred_labels))

#FP = confusion_matrix(y_test, y_pred_labels).sum(axis=0) - np.diag(confusion_matrix)  
#FN = confusion_matrix(y_test, y_pred_labels).sum(axis=1) - np.diag(confusion_matrix)
#TP = np.diag(confusion_matrix(y_test, y_pred_labels))
#TN = confusion_matrix(y_test, y_pred_labels).values.sum() - (FP + FN + TP)

print(confusion_matrix(y_test_labels, y_pred_labels))
#print(FN)
#print(TP)
#print(TN)

0.769406392694064
0.8172043010752689
0.6940639269406392
0.7506172839506172
0.5450360955491317
[[185  34]
 [ 67 152]]


In [16]:
y_test

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.

In [0]:
inp = Input(shape=(maxlen, ))
x = Embedding(len(word_index)+1, embed_size, weights=[embedding_vector],
              trainable=False)(inp)
x = Bidirectional(GRU(100,weights=model_lstm.layers[2].get_weights(), return_sequences=True, dropout=0.25,
                        recurrent_dropout=0.25))(x)
x = Attention(maxlen,weights=model_lstm.layers[3].get_weights())(x)
#    x = Flatten(x)
x = Dense(256, activation="relu",weights=model_lstm.layers[4].get_weights())(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

x_train_emb = model.predict(xte)

#x = Attention(maxlen)(x)

#x = Dense(256, activation="relu",weights=model_lstm.layers[4].get_weights())(x)
#x = Dropout(0.25)(x)
#x = Dense(2, activation="sigmoid")(x)
#model = Model(inputs=inp, outputs=x)


#def BidGRU(maxlen, max_features, embed_size, embedding_matrix)
#model = BidGRU(maxlen, len(word_index)+1, embed_size, embedding_vector)

#model.compile(loss='binary_crossentropy', optimizer='adam',
#              metrics=['accuracy'])

#model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
#activations = model.predict(X_train)

In [12]:
x_train_emb.shape

NameError: name 'x_train_emb' is not defined

In [0]:
import pickle
with open("test.pickle","wb") as f:
    pickle.dump(x_train_emb,f)