In [None]:
import numpy as np
import string
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from datasets import load_dataset
from tensorflow.keras.optimizers.legacy import Adam

In [None]:
# Load the PiC phrase-similarity dataset; later cells index the result by
# its 'train', 'validation' and 'test' splits.
dataset = load_dataset(path="PiC/phrase_similarity")

In [None]:
# Location of the GloVe vectors on disk. The file name suggests they were
# already converted to word2vec text format (presumably with glove2word2vec;
# verify if the file is regenerated).
word2vec_output_file = 'embeddings/glove.6B.100d.word2vec.txt'

# Read the text-format (binary=False) vectors into a gensim KeyedVectors
# object. NOTE: `model` here is the word-embedding lookup that phrase_to_vec()
# relies on, not a neural network.
model = KeyedVectors.load_word2vec_format(
    fname=word2vec_output_file,
    binary=False,
)


In [None]:
def preprocess(phrase):
    """Normalise a phrase into a list of lowercase, punctuation-free tokens.

    Every character listed in ``string.punctuation`` is deleted outright (not
    replaced by a space), so ``"well-known"`` becomes ``"wellknown"``. The
    remaining text is split on runs of whitespace.

    Args:
        phrase: The raw text to tokenise.

    Returns:
        A list of token strings; empty if nothing is left after stripping.
    """
    # Translation table whose third argument lists the characters to delete.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    lowered = phrase.lower()
    stripped = lowered.translate(punctuation_remover)
    return stripped.split()


def phrase_to_vec(phrase):
    """Embed a phrase as the element-wise mean of its known word vectors.

    The phrase is tokenised with ``preprocess``; tokens that are not present
    in the global embedding lookup ``model`` are skipped.

    Args:
        phrase: The raw text to embed.

    Returns:
        The mean of the found word vectors, or a zero vector of length
        ``model.vector_size`` when none of the tokens is in the vocabulary.
    """
    known_vectors = []
    for token in preprocess(phrase):
        if token in model:
            known_vectors.append(model[token])

    # Nothing to average: fall back to an all-zero embedding.
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

def cosine_sim(vec1, vec2):
    """Return the cosine similarity between two vectors as a single number.

    Args:
        vec1: First array; it is reshaped to a single row.
        vec2: Second array; it is reshaped to a single row.

    Returns:
        The sole entry of the 1x1 similarity matrix computed by
        ``cosine_similarity``.
    """
    # cosine_similarity operates on 2-D inputs, so present each vector as a
    # one-row matrix.
    row1 = vec1.reshape(1, -1)
    row2 = vec2.reshape(1, -1)
    pairwise = cosine_similarity(row1, row2)
    return pairwise[0][0]

In [None]:
def create_features(data):
    """Turn phrase pairs into a cosine-similarity feature and a label array.

    Args:
        data: An iterable of mappings, each providing the keys ``'phrase1'``,
            ``'phrase2'`` and ``'label'``.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays. ``features[i]`` is the
        cosine similarity between the averaged embeddings of the i-th pair and
        ``labels[i]`` is that pair's ``'label'`` value.
    """
    similarities, targets = [], []
    for row in data:
        # Embed both phrases of the pair, in order.
        first_vec, second_vec = (
            phrase_to_vec(row[key]) for key in ('phrase1', 'phrase2')
        )
        similarities.append(cosine_sim(first_vec, second_vec))
        targets.append(row['label'])
    return np.array(similarities), np.array(targets)



In [None]:
# Take shard 0 of 1 from every split. With a single shard this selects the
# whole split; raise num_shards to work on a subset.
new_training_dataset, new_val_dataset, new_test_dataset = (
    dataset[split].shard(num_shards=1, index=0)
    for split in ('train', 'validation', 'test')
)

# Show a summary of each split, one per line.
print(new_training_dataset, new_val_dataset, new_test_dataset, sep='\n')

# One cosine-similarity feature plus its label for every phrase pair.
train_features, train_labels = create_features(new_training_dataset)
val_features, val_labels = create_features(new_val_dataset)
test_features, test_labels = create_features(new_test_dataset)


In [None]:
# Define the model
def create_model(input_shape):
    """Build and compile a feed-forward binary classifier.

    The network stacks three hidden stages of 256, 128 and 64 units. Each
    stage is a ReLU ``Dense`` layer followed by ``BatchNormalization`` and
    ``Dropout(0.3)``. A single sigmoid unit produces the output.

    Args:
        input_shape: Shape of one input sample (without the batch axis),
            passed straight to ``Input``.

    Returns:
        A compiled ``Model`` using Adam (learning rate 0.001), binary
        cross-entropy loss, and accuracy as the reported metric.
    """
    inputs = Input(shape=input_shape)

    # Hidden stack: the same Dense -> BatchNorm -> Dropout pattern per width.
    hidden = inputs
    for units in (256, 128, 64):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(0.3)(hidden)

    outputs = Dense(1, activation='sigmoid')(hidden)

    network = Model(inputs=inputs, outputs=outputs)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# The network expects 2-D (samples, features) input. Each sample has a single
# feature (the cosine similarity), so add a trailing axis of length 1.
# reshape(-1, 1) is idempotent, so re-running this cell is harmless.
train_features = train_features.reshape(-1, 1)
val_features = val_features.reshape(-1, 1)
test_features = test_features.reshape(-1, 1)

# Bind the network to its own name. The global `model` holds the word vectors
# that phrase_to_vec() reads; rebinding `model` to the Keras network here
# would break any later re-run of the feature-extraction cells.
classifier = create_model(train_features.shape[1:])

# Train the model, reporting validation loss/accuracy after every epoch
classifier.fit(train_features, train_labels, epochs=100, batch_size=32, validation_data=(val_features, val_labels))

# Evaluate the model on the held-out test split; evaluate() returns the loss
# followed by the compiled metrics (accuracy).
test_loss, test_accuracy = classifier.evaluate(test_features, test_labels)
print(f'Test Accuracy: {test_accuracy}')