In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
cd drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


Import libraries

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk
from nltk.corpus import stopwords
import pickle

Load tokenizer

In [4]:
# Restore the pickled tokenizer object from the working directory.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('tokenizer.pickle', mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

Test data

In [5]:
# Read the test CSV into a DataFrame and show it as the cell output
test_data = pd.read_csv(filepath_or_buffer='./NLU/test.csv')
test_data

Unnamed: 0,text_1,text_2
0,We have received the executed Confidentiality ...,Per our earlier agreements with the Online Tea...
1,FYI Vince ---------------------- Forwarded by ...,http://www.math.ethz.ch/~baltes/ftp/papers.html
2,"that was funny, ben.","ita vero, new topic, 'Why our third greatest p..."
3,"Back in Thatcher's Britain , rent-a-quote Tory...","Below-par Warner's B movie . True , this is a ..."
4,"i guess i didn't mind this movie . i mean , it...",hmmm . here we have another example of classic...
...,...,...
5995,"After reading about Pele, i clicked on the 'Ed...","oh man, check out todays APOD picture. Almost ..."
5996,"I still want to read it, lamentations or not. ...",Are you asking me?
5997,"Perhaps this is how to do it. On Wednesday, ur...","'It's not so much how busy you are, but why yo..."
5998,urlLink Al Arabiya TV denies U.S. charges over...,urlLink Custom Posters grab yourself a bush ch...


Data Pre-processing

In [6]:
# Fetch the NLTK stopword corpus, then keep the English list as a set
# so membership checks during cleaning are fast
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# Data Cleaning
def clean_text(text):
    """Normalise one raw text value before tokenisation.

    The value is coerced to ``str`` and lowercased, every character other
    than ASCII letters, digits and the space character is deleted (not
    replaced, so newlines and tabs vanish too), tokens found in the
    module-level ``stop_words`` set are dropped, and the remaining tokens
    are joined with single spaces.

    Args:
        text: Any value; non-strings are converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    lowered = str(text).lower()
    # Delete punctuation and any other character outside letters/digits/space
    alnum_only = re.sub(r'[^A-Za-z0-9 ]', '', lowered)
    # Keep only the tokens that are not stopwords
    kept_tokens = []
    for token in alnum_only.split():
        if token not in stop_words:
            kept_tokens.append(token)
    joined = ' '.join(kept_tokens)
    # Collapse any whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', joined).strip()

# Clean both text columns in place with the same routine
for text_column in ('text_1', 'text_2'):
    test_data[text_column] = test_data[text_column].apply(clean_text)

# Join each cleaned pair into one string separated by a single space
test_data['combined_text'] = test_data['text_1'] + " " + test_data['text_2']

# Map the combined texts to integer sequences with the loaded tokenizer
sequences = tokenizer.texts_to_sequences(test_data['combined_text'])

# Bring every sequence to a fixed length of 209
# (presumably the input length the model was trained with; confirm)
padded_sequences = pad_sequences(sequences=sequences, maxlen=209)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Testing model on test data

In [7]:
# Restore the saved model from ./NLU/gru.h5
model = load_model(filepath='./NLU/gru.h5')

In [8]:
# Decision threshold applied to the model's predicted values
best_threshold = 0.5

# Run the loaded model over every padded test sequence
test_predictions = model.predict(padded_sequences)

# Values strictly above the threshold become 1, everything else 0
test_predicted_labels = np.greater(test_predictions, best_threshold).astype(int)



In [9]:
# Wrap the binary labels in a single 'prediction' column and save them
# to CSV without the row index
result_df = pd.DataFrame(data=test_predicted_labels, columns=['prediction'])
result_df.to_csv("./NLU/Group_21_B.csv", index=False)