In [None]:
# Install gensim (shell escape); the import cell below uses gensim.downloader and gensim.models.
! pip install gensim

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import pickle
from google.colab import files
import pandas as pd
import gensim.downloader as api
from gensim.models import KeyedVectors

In [None]:
# Upload an Excel workbook from your computer (Colab upload widget).
uploaded = files.upload()
if not uploaded:
    # Guard against an empty result (e.g. the upload was cancelled) so the
    # failure is explicit rather than an IndexError on the next line.
    raise ValueError("No file was uploaded; re-run this cell and choose an .xlsx file.")
filename = list(uploaded.keys())[0]
df = pd.read_excel(filename, header=None)

# Save as CSV (without headers, since func_morpheme_vector_pipeline expects no headers).
# Swap only the final extension: str.replace('.xlsx', '.csv') is case-sensitive and
# substring-based, so names like 'words.XLSX' or 'words.xls' were left unchanged and
# the CSV text overwrote the uploaded workbook.
stem, _dot, _ext = filename.rpartition('.')
csv_filename = (stem or filename) + '.csv'
df.to_csv(csv_filename, index=False, header=False)

print(f"Converted {filename} to {csv_filename}")
print("First few rows:")
print(df.head())

In [None]:
# Load the pre-trained English fastText vectors through gensim's downloader API.
# The result is bound to `emb` for use by later cells.
_fasttext_model_id = 'fasttext-wiki-news-subwords-300'
print("Downloading fastText embeddings...")
emb = api.load(_fasttext_model_id)
print("Done!")

In [6]:

def func_morpheme_vector_pipeline(filename, myRs, emb, seed=None,
                                  numSamples=100, sampleSize=32, topn=10):
    """
    Evaluate a morpheme transformation (e.g. a plural suffix) as a vector
    offset in a static word-embedding space.

    For each of `numSamples` random draws, up to `sampleSize` word pairs are
    sampled without replacement: the first is held out as the test pair and
    the rest are used for training. The mean (affixed - stem) offset of the
    training pairs, multiplied by `myRs`, is added to the test stem's vector,
    and the `topn` nearest neighbours of the result are compared with the
    expected affixed form (case-insensitively).

    INPUT:
        - filename: path to a header-less .csv/.txt file with 2 columns
          (stem, affixed)
        - myRs: scalar multiplier applied to the mean offset vector
        - emb: embedding object providing get_vector(word) and
          most_similar(vector, topn=...) returning (word, score) tuples
        - seed: optional int; when given, the random splits are reproducible.
          When None (default) the global NumPy random state is used.
        - numSamples: number of random train/test draws (default 100)
        - sampleSize: pairs per draw, 1 test + the rest training (default 32)
        - topn: number of nearest neighbours retrieved (default 10); a target
          that is not among them is recorded with rank topn + 1
    OUTPUT:
        - accuracy: proportion of draws whose top-1 neighbour is correct
        - sums: total number of correct top-1 predictions
    SIDE EFFECT:
        Pickles a dict with keys 'myAcc', 'myRanks', 'predictedNeighborsAll'
        and 'accuracy' to 'morphoEmbVecRs_<myRs>_<basename>_<mmdd>.pkl' in the
        current working directory.
    RAISES:
        - ValueError: if numSamples < 1, sampleSize < 2, topn < 1, or the file
          has fewer than 2 columns or fewer than 2 rows.
        - Whatever emb.get_vector raises for a word it does not contain
          (presumably KeyError for gensim KeyedVectors -- confirm).
    """
    if numSamples < 1:
        raise ValueError("numSamples must be at least 1")
    if sampleSize < 2:
        raise ValueError("sampleSize must be at least 2 (1 test pair + >=1 training pair)")
    if topn < 1:
        raise ValueError("topn must be at least 1")

    # Read word pairs as plain strings. Without keep_default_na=False pandas
    # parses entries such as "null", "nan" or "NA" as missing values (float
    # NaN), which then breaks the .lower() calls below.
    wordList = pd.read_csv(filename, header=None, dtype=str, keep_default_na=False)
    if wordList.shape[1] < 2 or len(wordList) < 2:
        raise ValueError(
            f"{filename!r} must contain 2 columns (stem, affixed) and at least "
            f"2 rows; got shape {wordList.shape}"
        )

    # Lower-case once up front instead of on every draw.
    stemKeys = [word.lower() for word in wordList.iloc[:, 0]]
    affixedKeys = [word.lower() for word in wordList.iloc[:, 1]]
    affixedRaw = wordList.iloc[:, 1].tolist()

    # np.random (global state) keeps the previous behaviour when no seed is given.
    rng = np.random if seed is None else np.random.RandomState(seed)

    myAcc = np.zeros(numSamples)    # 1 where the top-1 neighbour is correct, else 0
    myRanks = np.zeros(numSamples)  # rank of the correct form (topn + 1 if absent)
    predictedNeighborsAll = []      # neighbours per draw, kept for later analysis

    for s in range(numSamples):
        # Sample without replacement; if the file has fewer than sampleSize
        # rows, every row is used.
        idx = rng.permutation(len(wordList))[:sampleSize]
        testIdx, trainIdx = idx[0], idx[1:]

        # Embedding vectors of the training stems and their affixed forms
        embStem = np.array([emb.get_vector(stemKeys[i]) for i in trainIdx])
        embAffixed = np.array([emb.get_vector(affixedKeys[i]) for i in trainIdx])

        # Mean offset vector, scaled by myRs
        offsetVector = np.mean(embAffixed - embStem, axis=0) * myRs

        # Apply the offset to the held-out stem
        predictedVec = emb.get_vector(stemKeys[testIdx]) + offsetVector

        # NOTE(review): the query is a raw vector, so the test stem itself is
        # presumably not excluded from the candidates -- confirm this is intended.
        neighbors = [word for word, _score in emb.most_similar(predictedVec, topn=topn)]

        predictedNeighborsAll.append({
            'testPlural': affixedRaw[testIdx],
            'neighbors': neighbors
        })

        # 1-based rank of the expected form among the neighbours
        neighbors_lower = [n.lower() for n in neighbors]
        target = affixedKeys[testIdx]
        if target in neighbors_lower:
            rank = neighbors_lower.index(target) + 1
        else:
            rank = topn + 1  # sentinel: not found in the top-n

        myRanks[s] = rank
        myAcc[s] = (rank == 1)  # Top-1 accuracy

    # Overall accuracy and correct-prediction count
    sums = np.sum(myAcc)
    accuracy = sums / numSamples

    # Build the output file name from the scale factor, input name and date
    structName = f'Rs_{myRs:.1f}'.replace('.', '_')
    baseName = filename.split('/')[-1].split('.')[0]
    myDate = datetime.now().strftime('%m%d')

    saveDict = {
        'myAcc': myAcc,
        'myRanks': myRanks,
        'predictedNeighborsAll': predictedNeighborsAll,
        'accuracy': accuracy
    }

    saveName = f'morphoEmbVec{structName}_{baseName}_{myDate}.pkl'
    with open(saveName, 'wb') as f:
        pickle.dump(saveDict, f)

    return accuracy, sums