In [35]:
# Imports
import re
import csv
import nltk
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.metrics.pairwise import cosine_similarity
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

# Download the NLTK resources this notebook relies on. nltk.download reports
# packages that are already up to date instead of fetching them again, so
# re-running this cell is cheap.
nltk.download('stopwords')  # word lists behind stopwords.words('english')
nltk.download('wordnet')  # lexical database used by WordNetLemmatizer
nltk.download('punkt')  # tokenizer models used by word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [36]:
# Shared preprocessing resources, created once and reused by the helpers below.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token stopword check is a constant-time lookup.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise raw text into a space-separated string of content words.

    Steps, in order:
      1. strip '<unk>' placeholders;
      2. delete digit runs and every character that is neither a word
         character nor whitespace (punctuation is removed, not replaced by
         a space);
      3. lowercase and tokenize with gensim's simple_preprocess;
      4. drop tokens found in the module-level ``stop_words`` set.

    Returns the surviving tokens joined by single spaces.
    """
    without_placeholders = re.sub(r'<unk>', '', text)
    stripped = re.sub(r'\d+|[^\w\s]', '', without_placeholders)
    tokens = simple_preprocess(stripped)
    return ' '.join(token for token in tokens if token not in stop_words)

def lemmatize_text(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Every word is passed to the module-level ``lemmatizer`` and the results
    are re-joined with single spaces, so runs of whitespace in the input
    collapse to one space in the output.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

In [37]:
# Read the raw corpus. The encoding is stated explicitly so that decoding does
# not depend on the platform's default locale encoding.
with open('./data/WikiText-103.txt', 'r', encoding='utf-8') as file:
    data = file.read()

# Clean the text
clean_data = clean_text(data)

# Lemmatize the cleaned text
lemmatized_data = lemmatize_text(clean_data)

# Tokenize the lemmatized text
tokenized_data = word_tokenize(lemmatized_data)

# Cut the token stream into fixed-size chunks; each chunk is later passed to
# Word2Vec as one "sentence". `chunks` holds every chunk as a space-joined string.
chunk_size = 1000
chunks = [
    ' '.join(tokenized_data[start:start + chunk_size])
    for start in range(0, len(tokenized_data), chunk_size)
]

# Tokenize the chunk strings back into lists of words (the Word2Vec input)
tokenized_chunks = [word_tokenize(chunk) for chunk in chunks]

In [38]:
# Fit the Word2Vec embedding on the token chunks.
word2vec_model = Word2Vec(
    sentences=tokenized_chunks,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    sg=1,             # 1 = Skip-gram, 0 = CBOW
    min_count=1,      # keep every word, even those seen only once
    workers=4,        # number of training threads
)

In [39]:
# Read (id, word1, word2) rows from the test CSV
word_pairs = []
# newline='' is what the csv module documents for files handed to csv.reader;
# the explicit encoding avoids depending on the platform default.
with open('./data/CW-1-testdata.csv', 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        if not row:
            # csv.reader yields an empty list for a blank line; indexing it
            # would raise IndexError, so skip it.
            continue
        word_pairs.append((row[0], row[1], row[2]))

In [40]:
# Function to get the vector for a word or phrase, handling OOV and multi-word terms
def get_word_vector(word, model):
    """Return an embedding for ``word`` from ``model.wv``.

    Lookup order:
      1. ``word`` exactly as given;
      2. ``word`` lowercased -- the training text is lowercased during
         preprocessing, so a capitalised query would otherwise never match;
      3. the mean of the vectors of its whitespace-separated parts that are
         in the vocabulary (each part tried as given, then lowercased);
      4. the mean of all vocabulary vectors, as a last-resort fallback.

    Parameters
    ----------
    word : str
        A single word or a whitespace-separated phrase.
    model : object
        Anything exposing ``.wv`` that supports ``in``, ``[]`` and
        ``.vectors`` (e.g. a trained gensim Word2Vec model).

    Returns
    -------
    numpy.ndarray
        A one-dimensional vector.
    """
    keyed_vectors = model.wv

    if word in keyed_vectors:
        return keyed_vectors[word]

    lowered = word.lower()
    if lowered in keyed_vectors:
        return keyed_vectors[lowered]

    # Multi-word term: average whichever parts are known.
    part_vectors = []
    for part in word.split():
        for candidate in (part, part.lower()):
            if candidate in keyed_vectors:
                part_vectors.append(keyed_vectors[candidate])
                break
    if part_vectors:
        return np.mean(part_vectors, axis=0)

    # Nothing matched: fall back to the mean embedding of the whole vocabulary.
    return np.mean(keyed_vectors.vectors, axis=0)

In [41]:
# Prepare the output list for results
results = []

# Loop through each word pair and calculate cosine similarity.
# The loop variable is named pair_id (not id) so the builtin id() is not
# shadowed for the rest of the kernel session.
for pair_id, word1, word2 in word_pairs:
    try:
        # Lowercase before lemmatizing: the model vocabulary comes from
        # lowercased text, so a capitalised query would miss it.
        # reshape(1, -1) turns each vector into the single-row 2-D array
        # that cosine_similarity is called with.
        vector1 = get_word_vector(lemmatize_text(word1.lower()), word2vec_model).reshape(1, -1)
        vector2 = get_word_vector(lemmatize_text(word2.lower()), word2vec_model).reshape(1, -1)

        # Calculate cosine similarity
        cosine_sim = cosine_similarity(vector1, vector2)[0][0]
    except ValueError:
        # Best-effort: a pair that cannot be scored is recorded with similarity 0.0
        cosine_sim = 0.0

    # Record the original (unnormalised) words alongside the score
    results.append((pair_id, word1, word2, cosine_sim))

# Write the results to a new CSV file (one row per pair, no header)
with open('10868226_task2_results.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows(results)