In [6]:
# Imports
import csv
import re
import nltk
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess

# Download necessary NLTK data (only needs to be done once; re-running just
# reports "already up-to-date").
#   stopwords -> word list behind stopwords.words('english')
#   wordnet   -> lexical database used by WordNetLemmatizer
#   punkt     -> tokenizer models used by word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# English stopwords, held in a set so the membership test in clean_text is cheap.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Single shared lemmatizer instance, reused by lemmatize_text.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Reduce raw text to a space-separated string of lowercase content words.

    Steps, in order:
      1. delete every '<unk>' placeholder;
      2. delete digit runs and any character that is neither a word
         character nor whitespace;
      3. lowercase and tokenize with gensim's simple_preprocess;
      4. drop tokens found in the module-level ``stop_words`` set.

    Args:
        text: the raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    without_placeholders = re.sub(r'<unk>', '', text)
    letters_only = re.sub(r'\d+|[^\w\s]', '', without_placeholders)
    return ' '.join(
        token
        for token in simple_preprocess(letters_only)
        if token not in stop_words
    )

def lemmatize_text(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Args:
        text: a string of words separated by whitespace.

    Returns:
        The lemmatized words joined by single spaces (so any run of
        whitespace in the input collapses to one space).
    """
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

In [8]:
# Read the whole corpus into memory. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
with open('./data/WikiText-103.txt', 'r', encoding='utf-8') as file:
    data = file.read()

# Clean the text (placeholders, digits, punctuation and stopwords removed)
clean_data = clean_text(data)

# Lemmatize the cleaned text
lemmatized_data = lemmatize_text(clean_data)

# Tokenize the lemmatized text
tokenized_data = word_tokenize(lemmatized_data)

# Break data into fixed-size pseudo-documents of `chunk_size` tokens; the last
# chunk may be shorter (prior to tokenizing would be more memory efficient)
chunk_size = 1000
chunks = [tokenized_data[i:i + chunk_size] for i in range(0, len(tokenized_data), chunk_size)]

# Join tokens back into one string per chunk, the input format TfidfVectorizer expects
processed_chunks = [' '.join(chunk) for chunk in chunks]

In [9]:
# Build the TF-IDF matrix over the chunks (one row per chunk, one column per term).
# Terms are unigrams and bigrams; min_df=1 keeps terms that occur in a single
# chunk, max_df=0.85 drops terms present in more than 85% of chunks.
tfidf_options = {'ngram_range': (1, 2), 'min_df': 1, 'max_df': 0.85}
vectorizer = TfidfVectorizer(**tfidf_options)
X_tfidf = vectorizer.fit_transform(processed_chunks)

# Array of terms, position i naming column i of X_tfidf
vocab = vectorizer.get_feature_names_out()

7804243


In [10]:
# Read word pairs from csv as (id, word1, word2) tuples of strings.
# newline='' lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to csv.reader.
word_pairs = []
with open('./data/CW-1-testdata.csv', 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        # csv.reader yields [] for blank lines (e.g. a trailing empty line);
        # skip those instead of failing on row[0].
        if not row:
            continue
        word_pairs.append((row[0], row[1], row[2]))

In [11]:
# Prepare the output list for results
results = []

# Map each vocabulary term to its column index once. The previous per-word
# `word in vocab` scan plus `vocab.tolist().index(word)` walked (and copied)
# the whole vocabulary for every lookup.
vocab_index = {term: column for column, term in enumerate(vocab)}

# Every lookup extracts a column, so convert to CSC once; the values are the
# same as in X_tfidf, only the storage layout differs.
tfidf_by_column = X_tfidf.tocsc()
num_chunks = tfidf_by_column.shape[0]


def _column_vector(column):
    """Return the dense vector of TF-IDF weights (one per chunk) for a column."""
    return tfidf_by_column[:, column].toarray().flatten()


def _term_vector(term):
    """Return the per-chunk TF-IDF vector representing ``term``.

    If ``term`` is itself in the vocabulary (a unigram or bigram), its column
    is returned. Otherwise the term is split on whitespace and the vectors of
    the parts that are in the vocabulary are averaged. If no part is known,
    a zero vector with one entry per chunk is returned.
    """
    column = vocab_index.get(term)
    if column is not None:
        return _column_vector(column)

    known_vectors = [
        _column_vector(vocab_index[part])
        for part in term.split()
        if part in vocab_index
    ]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(num_chunks)


# Loop through each word pair and calculate cosine similarity
for pair_id, word1, word2 in word_pairs:
    # The corpus was lowercased before vectorizing, so lowercase the query
    # words too; otherwise a capitalised query could never match.
    vector1 = _term_vector(lemmatize_text(word1.lower()))
    vector2 = _term_vector(lemmatize_text(word2.lower()))

    try:
        similarity = cosine_similarity([vector1], [vector2])[0][0]
    except ValueError:
        # Best-effort fallback kept from the original cell: if the similarity
        # cannot be computed, record 0.0 instead of aborting the whole run.
        similarity = 0.0

    # Keep the words exactly as they appeared in the input file
    results.append((pair_id, word1, word2, similarity))

# Write the results to a CSV file (rows of id, word1, word2, similarity; no header)
with open('10868226_task1_results.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Write each result row
    csvwriter.writerows(results)

816 accept acknowledge 0.062051891863678654
957 accept recommend 0.019380229261442335
809 agree argue 0.08917866495338614
911 agree please 0.020579494983059166
242 alcohol cocktail 0.008543916513812444
697 alcohol wine 0.10794172846931928
2066 announcement news 0.09369093375434416
2164 announcement effort 0.0776404298824904
14 terrible bad 0.03932987898571876
51 great bad 0.08770056065268685
176 beach seashore 0.09829106383446484
402 beach sea 0.1327959277518781
278 beer alcohol 0.40855667142557633
279 beer beverage 0.24866635238347168
883 begin quit 0.06224793899697052
966 begin go 0.22932599244268592
633 car elevator 0.04033338164034535
2026 car automobile 0.20277859633817047
2030 coast shore 0.18637658097106297
2154 coast forest 0.049823588557094275
726 doctor temper 0.007309042601859112
2008 doctor nurse 0.0426377449063066
772 dollar people 0.04815854812223645
2081 dollar buck 0.009014474145485857
189 door doorway 0.14400596922199374
496 door floor 0.179039543932689
24 dumb foolish