# Setup

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import pandas as pd
import numpy as np
import zipfile
import re

# Load dev data set

In [None]:
# Read the development set from dev.csv and keep its 'label' column as a
# separate NumPy array.
dev_corpus = pd.read_csv("dev.csv", encoding='utf-8')
dev_labels = np.array(dev_corpus['label'])

# Load Models

In [None]:
# Unpack the zipped model into the local LSTM_MODEL directory.
with zipfile.ZipFile('LSTM_MODEL.zip', 'r') as zip_ref:
    zip_ref.extractall('LSTM_MODEL')

# Load the saved Keras model from its path inside the extracted archive.
LSTM_MODEL = tf.keras.models.load_model("LSTM_MODEL/content/AV_LSTM_MODEL")

# Prepare data

In [None]:
def preprocess(string):
  """Return the input as a lowercase string.

  Args:
    string: Value to normalise. Non-string values (e.g. NaN from a CSV
      column) are converted with str() first.

  Returns:
    The lowercased string form of the input.
  """
  # NOTE(review): this function previously also built a punctuation-separated
  # copy of the text with re.sub(r'([^\w\s])', r' \1 ', ...) and then threw it
  # away. That dead computation is removed; the returned value is unchanged.
  # If punctuation splitting is actually wanted, confirm first that it matches
  # the preprocessing the model was trained with.
  return str(string).lower()

def tokenise(data, column_1, column_2, max_sequence_length, tokeniser=None):
  """Convert two text columns into padded integer sequences.

  Args:
    data: Table-like object whose `data[column]` supports `.tolist()`
      (a pandas DataFrame in this file).
    column_1: Name of the column holding the first text of each pair.
    column_2: Name of the column holding the second text of each pair.
    max_sequence_length: Length passed to `pad_sequences` as `maxlen`.
    tokeniser: Optional, already-fitted Keras `Tokenizer`. When omitted, a new
      tokenizer is created and fitted on the texts of both columns, which is
      the original behaviour.

  Returns:
    A tuple `(first_sequences, second_sequences, vocab)` where the first two
    items are the padded sequences for `column_1` and `column_2`, and `vocab`
    is the tokenizer's `word_index` mapping.
  """
  first_pairs = data[column_1].tolist()
  second_pairs = data[column_2].tolist()

  if tokeniser is None:
    # NOTE(review): fitting on the data being scored assigns word indices from
    # this data alone. Presumably the model expects the indices produced by the
    # tokenizer used at training time -- pass that tokenizer in if available.
    tokeniser = Tokenizer(oov_token='UNK', lower=True)
    tokeniser.fit_on_texts(first_pairs + second_pairs)

  def encode(texts):
    # Map texts to integer ids, then pad at the front to a fixed length.
    sequences = tokeniser.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')

  return encode(first_pairs), encode(second_pairs), tokeniser.word_index

def combine_pairwise_data(sequence_1, sequence_2) :
  """Pair up the items of two sequences position by position.

  Every item of `sequence_1` is paired with the item of `sequence_2` at the
  same position, and the list of pairs is returned as a NumPy array.
  """
  pairs = [(left, sequence_2[pos]) for pos, left in enumerate(sequence_1)]
  return np.array(pairs)

def prepare_test_data(dev_data, sequence_size=150):
  """Preprocess and tokenise the text pairs in `dev_data` for the model.

  Args:
    dev_data: DataFrame with "text_1" and "text_2" columns. Both columns are
      overwritten in place with their preprocessed text.
    sequence_size: Length each tokenised sequence is padded to. Defaults to
      150, the value that was previously hard-coded.

  Returns:
    A two-element list `[sequences_1, sequences_2]` holding the padded
    sequences for "text_1" and "text_2" respectively.
  """
  # The columns are replaced on the caller's DataFrame (unchanged behaviour),
  # so the caller sees the preprocessed text after this call.
  dev_data["text_1"] = dev_data["text_1"].apply(preprocess)
  dev_data["text_2"] = dev_data["text_2"].apply(preprocess)
  # The vocabulary returned by tokenise is not needed here.
  sequences_1, sequences_2, _vocab = tokenise(dev_data, "text_1", "text_2", sequence_size)
  return [sequences_1, sequences_2]

# Build the pair of padded model inputs from the dev set. Note that this call
# also overwrites dev_corpus's "text_1"/"text_2" columns with preprocessed text.
input_data = prepare_test_data(dev_corpus)



# Test Models

In [None]:
# Run the loaded model on the two padded inputs.
predictions = LSTM_MODEL.predict(input_data)
# Threshold at 0.5 to get integer 0/1 labels.
# NOTE(review): presumably the model outputs one probability per pair -- confirm.
binary_predictions = (predictions >= 0.5).astype(int)



# Save predictions

In [None]:
# Write the thresholded labels to a single-column CSV without the row index.
# NOTE(review): assumes binary_predictions has exactly one column -- confirm.
predictions_DF = pd.DataFrame(binary_predictions, columns=['prediction'])
predictions_DF.to_csv('Group_26_B.csv', index=False)

# Generate Metrics