In [1]:
# Colab-only: mount Google Drive at /content/drive so that later cells can
# reach the project files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


Import libraries

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import re
import nltk
from nltk.corpus import stopwords

Train data

In [4]:
# Load train data (path is relative to the working directory chosen by the
# earlier `cd` cell). The displayed frame has the columns text_1, text_2 and
# label, plus an unnamed index column carried over from the CSV.
train_data = pd.read_csv('./NLU/train.csv')
# Last expression of the cell: renders a preview of the frame.
train_data

Unnamed: 0,text_1,text_2,label
0,Nick ( Kevin Anderson ) goes back to his homet...,Bank clerk Miles Cullen ( Elliott Gould ) is r...,1
1,"Kate Nelligan , always a forthright and grippi...",Anthony Perkins reportedly felt threatened as ...,1
2,"Patrick, Please, contact Zimin Lu, 713 853 638...","Corey, Paula West Trilium Court 107 The Woodla...",1
3,"wow, ok so my mom was saying how when she gets...","dude, i'm cold",1
4,"Flat broke again , Stan borrows from ' Basher ...",I had been underwhelmed by my first viewing of...,0
...,...,...,...
29995,"John and Krishna, I am sending you an outline ...",FYI Vince,1
29996,You want to know about rabies ? Try this : One...,Wow what a great idea for a movie getting a bu...,1
29997,Plz see email. Sent you info for u & Rick. Bes...,Here's Paul's memo with Mona's edits and a cou...,1
29998,Pretty blonde Paris Hilton ( as Victoria Engli...,Crippled violinist David Miles ( as Filippo ) ...,1


Data Pre-processing

In [5]:
# Download stopwords from nltk library (the recorded cell output shows the
# package is skipped when it is already up to date)
nltk.download('stopwords')
# Keep the English stop words in a set; clean_text() checks each word against it.
stop_words = set(stopwords.words('english'))

# Data Cleaning
def clean_text(text, stop_word_set=None):
    """Normalise one raw text value before tokenisation.

    Steps: lowercase, delete every character that is not an ASCII letter,
    digit or whitespace, drop stop words, and collapse whitespace runs into
    single spaces.

    Args:
        text: Raw value from a text column. Non-string values are converted
            with ``str()``. ``None`` and float NaN (the missing-value marker
            pandas uses in object columns) produce an empty string instead of
            the literal words "none" / "nan".
        stop_word_set: Optional container of lowercase words to remove.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        The cleaned, single-space-separated text (possibly empty).
    """
    # Missing values carry no text; do not let str() turn them into a word.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stop_word_set is None:
        stop_word_set = stop_words

    # Convert to string and lowercase the text
    text = str(text).lower()
    # Remove punctuation. Whitespace (newlines, tabs, ...) is kept so that
    # words separated only by a line break are not glued together.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove stopwords; split()/join also collapses extra whitespace.
    return ' '.join(word for word in text.split() if word not in stop_word_set)

# Drop rows with any na values in train data. This has to happen BEFORE
# cleaning: once the columns have been passed through clean_text() they hold
# only strings, so a dropna() placed after cleaning can never remove a row.
# .copy() gives an independent frame so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
train_data = train_data.dropna(subset=['text_1', 'text_2']).copy()

train_data['text_1'] = train_data['text_1'].apply(clean_text)
train_data['text_2'] = train_data['text_2'].apply(clean_text)

# Combine texts for a single authorship verification task
train_data['combined_text'] = train_data['text_1'] + " " + train_data['text_2']

# Fit the tokenizer on the combined training texts and encode them as
# integer sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['combined_text'])
sequences = tokenizer.texts_to_sequences(train_data['combined_text'])

# Padding length: mean + 2 standard deviations of the sequence lengths,
# capped at the longest sequence actually observed.
seq_lengths = np.array([len(seq) for seq in sequences])
length_cap = seq_lengths.mean() + 2 * seq_lengths.std()
max_len = int(min(length_cap, seq_lengths.max()))

# Bring every sequence to exactly max_len entries.
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Target vector taken from the 'label' column.
labels = train_data['label'].values

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Word Embedding

In [6]:
# Load GloVe Embeddings
def load_glove_embeddings(path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-empty line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        path: Path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each token to a 1-D ``float32`` numpy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            # Skip blank / whitespace-only lines (e.g. a trailing newline at
            # the end of the file) instead of failing on values[0].
            if not values:
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

embeddings_index = load_glove_embeddings('./NLU/glove.6B.100d.txt')

# Build the embedding matrix: one row per tokenizer index (plus row 0, which
# no word uses); words missing from the GloVe vocabulary keep an all-zero row.
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

GRU Model

In [7]:
# Building the model
# One integer token id per position, max_len positions per example.
input_layer = Input(shape=(max_len,))
# Embedding initialised from the pre-computed GloVe matrix and frozen
# (trainable=False), so these weights are not updated during training.
# NOTE(review): mask_zero is not set, so padded positions presumably take part
# in the GRU and in the average below - confirm this is intended.
embedding_layer = Embedding(len(tokenizer.word_index) + 1, embedding_dim, weights=[embedding_matrix], trainable=False)(input_layer)
# return_sequences=True keeps the GRU output for every time step so that it
# can be averaged over the sequence axis by the pooling layer.
gru_layer = GRU(256, return_sequences=True)(embedding_layer)
pooling_layer = GlobalAveragePooling1D()(gru_layer)
# Small classification head with dropout before the output unit.
dense_layer = Dense(64, activation='relu')(pooling_layer)
dropout_layer = Dropout(0.5)(dense_layer)
# Single sigmoid unit: the output is a score in (0, 1) for the binary label.
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

# Compile model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 209)]             0         
                                                                 
 embedding (Embedding)       (None, 209, 100)          13117900  
                                                                 
 gru (GRU)                   (None, 209, 256)          274944    
                                                                 
 global_average_pooling1d (  (None, 256)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                             

In [8]:
# Train model: stop once val_loss has not improved for 10 epochs and roll the
# weights back to the best epoch seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
model.fit(
    padded_sequences,
    labels,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


<keras.src.callbacks.History at 0x7fed75737340>

In [9]:
# Save model to the project folder in HDF5 (.h5) format; a later cell reloads
# it from this same path.
model.save('./NLU/gru.h5')

  saving_api.save_model(


Development data

In [10]:
# Load development data (same column layout as the training CSV according to
# the displayed output: text_1, text_2, label).
dev_data = pd.read_csv('./NLU/dev.csv')
# Last expression of the cell: renders a preview of the frame.
dev_data

Unnamed: 0,text_1,text_2,label
0,"Carol, Congratulations. Vince Carol Coats 10/1...","Andrew, I shall be glad to meet and discuss th...",1
1,This film has only ever been shown once in my ...,I’d been interested in watching this ever sinc...,1
2,"I can help with euro, spanish, english, and ch...",Some Oratorian information (with emphasis on o...,0
3,"CNN's website stopped the red scroll today, an...",Folks! JOE was here! Some may remember JOE fro...,0
4,---------------------- Forwarded by Tana Jones...,----- Forwarded by Tana Jones/HOU/ECT on 07/19...,1
...,...,...,...
5995,Never thought a film about a gay cannibal coul...,This takes place in 1934 Chicago . Sheet music...,1
5996,urlLink US-TX-Houston-Korean Translators manwh...,Man o man do I hate fridays at work. I want to...,0
5997,"I know a little about "" Sweeny Todd "" , but I ...",Death Knocks Twice is a somewhat interesting t...,1
5998,A Walt Disney SILLY SYMPHONY Cartoon Short . A...,What a Country was shown on syndication rather...,0


Data Pre-processing

In [11]:
# Apply the same cleaning used for the training data to both text columns,
# then join each pair into one string.
for column in ('text_1', 'text_2'):
    dev_data[column] = dev_data[column].apply(clean_text)
dev_data['combined_text'] = dev_data['text_1'] + " " + dev_data['text_2']

# Encode with the tokenizer fitted on the training data and pad to the
# training-time sequence length.
dev_sequences = tokenizer.texts_to_sequences(dev_data['combined_text'])
dev_padded_sequences = pad_sequences(dev_sequences, maxlen=max_len)

# Ground-truth labels used for evaluation.
dev_labels = dev_data['label'].values

In [12]:
# Load model from disk; this rebinds `model` to the saved copy, replacing the
# in-memory model trained above.
model = load_model('./NLU/gru.h5')

Testing model on development data

In [13]:
# Decision threshold on the sigmoid output (0.5 here).
# Experiment with various settings (highest accuracy, highest F1 score ...)
best_threshold = 0.5

# Model scores for the development set.
dev_predictions = model.predict(dev_padded_sequences)

# Scores strictly above the threshold become 1, everything else 0.
dev_predicted_labels = np.greater(dev_predictions, best_threshold).astype(int)



In [14]:
# Store the binary predictions in a one-column frame and write them to CSV.
# NOTE(review): to_csv() is called without index=False, so the row index is
# written as an extra first column - confirm the consumer of this file expects it.
result_df = pd.DataFrame(dev_predicted_labels, columns=['prediction'])
result_df.to_csv("./NLU/gru_dev_result.csv")

Evaluation

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [16]:
# Report each metric for the thresholded predictions against the dev labels.
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(dev_labels, dev_predicted_labels))

Accuracy: 0.5795
Precision: 0.5724465558194775
Recall: 0.6403188309531717
F1 Score: 0.6044834613575796
