In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive/ so that the
# data files referenced through `project_path` are reachable as ordinary paths.
drive.mount('/content/drive/')

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the current Colab user, then hand the application-default
# credentials to PyDrive so a GoogleDrive client can be created.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds the name `drive`, shadowing the `google.colab.drive`
# module imported in the first cell; after this line `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

In [None]:
# Base folder under the /content/drive/ mount point; the train/test pickles and
# the GloVe vectors read later in the notebook are resolved relative to it.
project_path = '/content/drive/My Drive/Technical Debt/Codes/BiLSTM/'

In [None]:
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk import re, SnowballStemmer

In [None]:
##building bi-lstm architecture

from keras import callbacks
from keras.models import Sequential
from keras.layers import Activation,Flatten,Dense,Dropout,Embedding,Bidirectional,LSTM



def create_model(vocabulary_size, embedding_size, embedding_matrix):
    """Assemble and compile the Bi-LSTM binary classifier.

    The network is: a frozen embedding layer initialised from
    ``embedding_matrix`` -> a bidirectional LSTM with 100 units per
    direction -> a single sigmoid output unit.

    Args:
        vocabulary_size: Number of rows of the embedding table.
        embedding_size: Dimensionality of each embedding vector.
        embedding_matrix: Initial embedding weights; they are not trained.

    Returns:
        The compiled ``Sequential`` model (its summary is printed first).
    """
    network = Sequential()

    # Layers are created in the same order in which they are stacked.
    embedding_layer = Embedding(
        vocabulary_size,
        embedding_size,
        weights=[embedding_matrix],
        trainable=False,
    )
    network.add(embedding_layer)

    recurrent_layer = Bidirectional(LSTM(100))
    network.add(recurrent_layer)

    output_layer = Dense(1, activation='sigmoid')
    network.add(output_layer)

    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.summary()
    return network


def callback(model_name, tf_log_dir_name='./tf-log/', patience_lr=10):
    """Build the list of Keras callbacks used for training.

    Args:
        model_name: File path handed to ``ModelCheckpoint`` for saving the model.
        tf_log_dir_name: Directory handed to ``TensorBoard`` for its logs.
        patience_lr: ``patience`` value for ``ReduceLROnPlateau``.

    Returns:
        A list with four callbacks, in this order: TensorBoard,
        ModelCheckpoint, ReduceLROnPlateau, EarlyStopping.
    """
    # TensorBoard logging.
    tensorboard_cb = callbacks.TensorBoard(log_dir=tf_log_dir_name, histogram_freq=0)

    # Checkpointing: only the best model according to 'val_loss' is kept.
    checkpoint_cb = callbacks.ModelCheckpoint(
        filepath=model_name,
        monitor='val_loss',
        mode='auto',
        save_best_only=True,
    )

    # Learning-rate reduction (factor 0.1) when the training 'loss' stops improving.
    # NOTE(review): `epsilon` appears to be the older spelling of this threshold
    # argument (`min_delta` in later Keras releases) - confirm against the
    # installed Keras version.
    plateau_cb = callbacks.ReduceLROnPlateau(
        monitor='loss',
        factor=0.1,
        patience=patience_lr,
        verbose=1,
        epsilon=1e-4,
        mode='min',
    )

    # Early stopping on 'val_acc' with a patience of 5; this callback is active
    # and is always included in the returned list.
    early_stop_cb = callbacks.EarlyStopping(
        monitor='val_acc',
        min_delta=0,
        patience=5,
        verbose=1,
        mode='auto',
    )

    return [tensorboard_cb, checkpoint_cb, plateau_cb, early_stop_cb]

######### Show Train Val History Graph ###############
def plot_loss_accu(history,lossLoc='Train_Val_Loss',accLoc='Train_Val_acc'):
    """Save the loss and accuracy curves of a training run as two figures.

    Args:
        history: Object exposing a ``history`` mapping of metric name to a
            per-epoch sequence of values (e.g. what ``model.fit`` returns).
        lossLoc: Destination passed to ``plt.savefig`` for the loss figure.
        accLoc: Destination passed to ``plt.savefig`` for the accuracy figure.

    Returns:
        None. The figures are written to ``lossLoc`` and ``accLoc``.

    Raises:
        KeyError: If ``'loss'`` is missing, or if neither ``'acc'`` nor
            ``'accuracy'`` is present in ``history.history``.
    """
    import matplotlib.pyplot as plt

    recorded = history.history

    def _lookup(*names):
        # Return the series stored under the first present name, else None.
        for name in names:
            if name in recorded:
                return recorded[name]
        return None

    def _save_curve(train_series, val_series, title, legend_loc, destination):
        # Draw the training curve (red) and, when available, the validation
        # curve (blue) on a cleared figure, then write it to `destination`.
        plt.clf()
        epochs = range(len(train_series))
        plt.plot(epochs, train_series, 'r')
        labels = ['train']
        if val_series is not None:
            plt.plot(epochs, val_series, 'b')
            labels.append('val')
        plt.title(title)
        plt.legend(labels, loc=legend_loc)
        #plt.show()
        plt.savefig(destination)

    # Validation series only exist when fit() was given validation data, so
    # they are optional here instead of raising KeyError.
    _save_curve(recorded['loss'], _lookup('val_loss'),
                'Training and validation loss', 'upper right', lossLoc)

    # The accuracy metric may be recorded as 'acc' or 'accuracy' depending on
    # the Keras version; accept both spellings.
    acc = _lookup('acc', 'accuracy')
    if acc is None:
        raise KeyError('acc')
    _save_curve(acc, _lookup('val_acc', 'val_accuracy'),
                'Training and validation accuracy', 'lower right', accLoc)

    # The original ended with `return model_glove`, a name that is not defined
    # in this scope and therefore raised NameError on every call. There is
    # nothing meaningful to return, so the function now returns None.

In [None]:
from keras.models import Sequential,load_model
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import gc
import keras.backend as K

###########################################################################
# Passed to the tokenizer as `num_words` (cap on the vocabulary it uses).
vocabulary_size = 400000

# Every encoded document is padded (or cut) to this many tokens.
time_step = 300

# Load the pickled training frame. The file is opened in a `with` block so the
# handle is closed even if unpickling fails.
# NOTE: pickle can execute arbitrary code while loading - only open trusted files.
with open(project_path + "Train_Test_Data/train_df.pkl", "rb") as train_file:
    dataset = pickle.load(train_file)

# Raw text column; no cleaning step is applied before tokenization.
texts = dataset['text']

# Integer labels, encoded and reshaped into a single-column array.
label = dataset['labels'].astype(int).values.tolist()
labelEncoder = LabelEncoder()
encoded_label = labelEncoder.fit_transform(label)
y = np.reshape(encoded_label, (-1, 1))

# Fit the tokenizer on the training texts only and encode them as id sequences.
tokenizer_train = Tokenizer(num_words=vocabulary_size)
tokenizer_train.fit_on_texts(texts)
encoded_train = tokenizer_train.texts_to_sequences(texts=texts)
# +1 leaves room for index 0 (word_index ids are used as row numbers below).
vocab_size_train = len(tokenizer_train.word_index) + 1
print(vocab_size_train)

X = sequence.pad_sequences(encoded_train, maxlen=time_step, padding='post')


# Parse the GloVe text file: each line is a token followed by its coefficients.
# The `with` block guarantees the file is closed even if a line fails to parse.
embeddings_train = {}
with open(project_path + 'glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_train[word] = coefs

print('Total %s word vectors.' % len(embeddings_train))

# Must match the dimensionality of the vectors in the file loaded above.
embedding_size = 100

# Create a weight matrix for the words seen in the training documents; rows
# of words without a pre-trained vector stay all-zero.
embedding_matrix = np.zeros((vocab_size_train, embedding_size))
for word, i in tokenizer_train.word_index.items():
    embedding_vector_train = embeddings_train.get(word)
    if embedding_vector_train is not None:
        embedding_matrix[i] = embedding_vector_train


# Load the pickled test frame; the `with` block closes the handle reliably.
# NOTE: pickle can execute arbitrary code while loading - only open trusted files.
with open(project_path + "Train_Test_Data/test_df.pkl", "rb") as test_file:
    dataset = pickle.load(test_file)

# NOTE(review): `statement` is not used by any visible cell; it is kept only
# in case a later cell relies on it.
statement = dataset.iloc[:, 2:3].values

texts = dataset['text']

# Encode the test labels with the encoder that was fitted on the training
# labels. Fitting a fresh LabelEncoder on the test set (as before) can assign
# different integer codes whenever the two sets do not contain exactly the
# same label values, which silently corrupts the evaluation.
label = dataset['labels'].astype(int).values.tolist()
encoded_label = labelEncoder.transform(label)
y_test = np.reshape(encoded_label, (-1, 1))

# Reuse the tokenizer fitted on the training texts so word ids line up with
# the rows of the embedding matrix.
encoded_test = tokenizer_train.texts_to_sequences(texts=texts)
X_test = sequence.pad_sequences(encoded_test, maxlen=time_step, padding='post')

vocab_size = embedding_matrix.shape[0]
##########################################################################################



Fold = 1

# Free memory and reset the Keras backend state before building a new model.
gc.collect()
K.clear_session()
print('Fold: ', Fold)

# The whole training set is used for fitting (no hold-out split here).
X_train = X
y_train = y

# create model
print("Creating and Fitting Model...")
model = create_model(vocabulary_size=vocab_size,embedding_size=embedding_size,embedding_matrix=embedding_matrix)

# No validation data is passed, so `history` holds training metrics only.
history = model.fit(X_train, y_train, epochs=10, batch_size=128, shuffle=True)

# evaluate the model
print("Evaluating Model...")
scores = model.evaluate(X_test, y_test, verbose=0)
# scores[1] is a fraction in [0, 1]; scale it so the printed '%' is correct
# (previously 0.93 was reported as "0.93%").
print("Eval %s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))





In [None]:
from sklearn.metrics import precision_recall_fscore_support, classification_report

# Round the model's sigmoid outputs to hard integer predictions and print the
# per-class precision / recall / F1 report against the test labels.
predicted_probabilities = model.predict(X_test)
y_pred = np.round(predicted_probabilities).astype(int)
print(classification_report(y_test, y_pred))