In [None]:
! pip install gensim

In [None]:
import glob
import pickle
from datetime import datetime

import gensim.downloader as api
import numpy as np
import pandas as pd

In [None]:
# Upload an .xlsx file from your computer (Google Colab only).
# The import lives in this cell because the module only exists on Colab, and so that
# re-running the cell still works if the name `files` has been rebound elsewhere.
from google.colab import files

uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded.")
filename = next(iter(uploaded))  # name of the first uploaded file
df = pd.read_excel(filename, header=None)

# Save as CSV (without headers, since func_morpheme_vector_pipeline expects no headers).
# Swap only the final extension, whatever its spelling ('.xlsx', '.XLSX', '.xls'), so the
# CSV can never end up with the same name as the uploaded workbook.
csv_filename = filename.rsplit('.', 1)[0] + '.csv'
df.to_csv(csv_filename, index=False, header=False)

print(f"Converted {filename} to {csv_filename}")
print("First few rows:")
print(df.head())

In [None]:
# Download pre-trained fastText (English)
# `api` must be gensim's downloader module (`gensim.downloader` imported as `api`).
# The resulting `emb` object is what func_morpheme_vector_pipeline receives as its `emb`
# argument; it is used there through get_vector() and most_similar().
# NOTE(review): this model is presumably returned as plain KeyedVectors, in which case
# out-of-vocabulary words raise KeyError in get_vector() — confirm before running on
# word lists that may contain rare forms.
print("Downloading fastText embeddings...")
emb = api.load('fasttext-wiki-news-subwords-300')
print("Done!")

In [6]:
def func_morpheme_vector_pipeline(filename, myRs, emb, numSamples=100, sampleSize=32,
                                  topn=10, seed=None):
    """
    Evaluate a morpheme transformation (like a plural suffix) using static word embeddings.

    For every sample, `sampleSize` word pairs are drawn at random without replacement.
    The first pair is held out as the test item and the rest are used for training. The
    morpheme vector is the mean of (affixed - stem) over the training pairs, multiplied
    by `myRs`. It is added to the test stem's vector, and the `topn` nearest neighbours
    of the result are compared (case-insensitively) with the expected affixed form.

    Side effect: the per-sample results are pickled into the current working directory as
    'morphoEmbVecRs_<myRs>_<file base name>_<MMDD>.pkl' (keys: 'myAcc', 'myRanks',
    'predictedNeighborsAll', 'accuracy').

    INPUT:
        - filename:   path to a header-less .csv/.txt file with 2 columns (stem, affixed)
        - myRs:       scalar multiplier applied to the mean offset vector
        - emb:        embedding object providing get_vector(word) and
                      most_similar(vector, topn=...)
        - numSamples: number of random train/test draws (default 100)
        - sampleSize: pairs per draw, 1 test + (sampleSize - 1) training (default 32);
                      if the file has fewer pairs, all of them are used
        - topn:       number of nearest neighbours retrieved per test item (default 10)
        - seed:       optional integer seed for reproducible draws; when None the global
                      NumPy random state is used, exactly as before
    OUTPUT:
        - accuracy: proportion of samples whose expected form was the top-1 neighbour
        - sums:     number of samples whose expected form was the top-1 neighbour
    RAISES:
        - ValueError: invalid numSamples/sampleSize/topn, fewer than 2 columns, or fewer
                      than 2 complete word pairs
        - whatever emb.get_vector raises for a word it does not know (not caught here)
    """
    if numSamples < 1:
        raise ValueError("numSamples must be at least 1")
    if sampleSize < 2:
        raise ValueError("sampleSize must be at least 2 (1 test pair + 1 training pair)")
    if topn < 1:
        raise ValueError("topn must be at least 1")

    # Read word pairs from file: expects 2 columns [stem, affixed], no header row.
    wordList = pd.read_csv(filename, header=None)
    if wordList.shape[1] < 2:
        raise ValueError(f"{filename} must contain two columns (stem, affixed)")

    # Incomplete rows cannot be embedded, so drop them; force text so .lower() is safe.
    pairs = wordList.iloc[:, :2].dropna().astype(str)
    stems = pairs.iloc[:, 0].to_numpy()
    affixedForms = pairs.iloc[:, 1].to_numpy()
    numPairs = len(pairs)
    if numPairs < 2:
        raise ValueError("need at least 2 complete word pairs (1 test + 1 training)")

    # A private generator when a seed is given; otherwise the global NumPy state so that
    # existing callers (including ones using np.random.seed) behave as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    myAcc = np.zeros(numSamples)    # 1 for a correct top-1 prediction, 0 otherwise
    myRanks = np.zeros(numSamples)  # rank of the expected form, topn + 1 if absent
    predictedNeighborsAll = []      # neighbours per sample, kept for later analysis
    notFoundRank = topn + 1

    for s in range(numSamples):
        idx = rng.permutation(numPairs)[:sampleSize]
        testIdx, trainIdx = idx[0], idx[1:]  # first pair tests, the rest train

        # Embedding vectors of the training stems and their affixed forms.
        embStem = np.array([emb.get_vector(word.lower()) for word in stems[trainIdx]])
        embPlural = np.array([emb.get_vector(word.lower()) for word in affixedForms[trainIdx]])

        # Mean offset from stem to affixed form, scaled by myRs.
        pluralVector = np.mean(embPlural - embStem, axis=0) * myRs

        # Apply the offset to the held-out stem.
        testStem = stems[testIdx]
        testPlural = affixedForms[testIdx]
        predictedVec = emb.get_vector(testStem.lower()) + pluralVector

        # Nearest words to the predicted vector; keep the words, drop the scores.
        neighbors = [word for word, _score in emb.most_similar(predictedVec, topn=topn)]

        predictedNeighborsAll.append({
            'testPlural': testPlural,
            'neighbors': neighbors
        })

        # Case-insensitive rank of the expected form among the neighbours.
        neighbors_lower = [candidate.lower() for candidate in neighbors]
        target = testPlural.lower()
        if target in neighbors_lower:
            rank = neighbors_lower.index(target) + 1
        else:
            rank = notFoundRank

        myRanks[s] = rank
        myAcc[s] = (rank == 1)  # top-1 accuracy

    # Overall accuracy and number of correct predictions.
    sums = np.sum(myAcc)
    accuracy = sums / numSamples

    # Build the output file name: scaling factor, input base name and month/day stamp.
    structName = f'Rs_{myRs:.1f}'.replace('.', '_')
    baseName = filename.split('/')[-1].split('.')[0]
    myDate = datetime.now().strftime('%m%d')

    saveDict = {
        'myAcc': myAcc,
        'myRanks': myRanks,
        'predictedNeighborsAll': predictedNeighborsAll,
        'accuracy': accuracy
    }

    saveName = f'morphoEmbVec{structName}_{baseName}_{myDate}.pkl'
    with open(saveName, 'wb') as f:
        pickle.dump(saveDict, f)

    return accuracy, sums

In [13]:
# Create func_emb_metrics.py
def func_emb_metrics(xlsxFile):
    """
    FUNC_EMB_METRICS calculates Top-1, Top-10, and Mean Rank from prediction results.

    The first column of the sheet is taken as the expected form and every remaining
    column as a neighbour, in rank order. Matching is case-insensitive. Empty or
    non-text neighbour cells are dropped before ranking.

    Input:
        xlsxFile - Excel file containing 'Expected' column and 10 nearest neighbors

    Output:
        top1     - proportion of rows whose expected word is ranked at position 1
        top10    - proportion of rows whose expected word appears among the neighbours
        meanRank - average rank of the expected word over the rows where it was found
                   (NaN when it was never found)

    Rows whose expected cell is empty or not text count as "not found".
    For a sheet with no data rows all three outputs are NaN.
    """

    T = pd.read_excel(xlsxFile)

    # Print columns to debug
    print(f"Columns in file: {T.columns.tolist()}")

    # Positional access, so the result does not depend on the frame's index labels.
    expected = T.iloc[:, 0].tolist()
    myNeighbors = T.iloc[:, 1:].values

    numRows = len(expected)
    if numRows == 0:
        # Nothing to score: avoid dividing by zero.
        return np.nan, np.nan, np.nan

    top1Count = 0
    top10Count = 0
    ranks = np.full(numRows, np.nan)  # NaN marks "expected form not found"

    for i, currentExpected in enumerate(expected):
        if not isinstance(currentExpected, str):
            continue  # empty / non-text expected cell cannot match anything

        target = currentExpected.lower()
        # Drop empty entries (blank cells come back from Excel as NaN).
        candidates = [cell.lower() for cell in myNeighbors[i]
                      if isinstance(cell, str) and cell != '']

        if target not in candidates:
            continue

        rank = candidates.index(target) + 1  # first occurrence, 1-based
        ranks[i] = rank
        top10Count += 1
        if rank == 1:
            top1Count += 1

    # Compute metrics
    top1Acc = top1Count / numRows
    top10Acc = top10Count / numRows
    foundRanks = ranks[~np.isnan(ranks)]
    # Guard the all-missing case so no "mean of empty slice" warning is emitted.
    meanRank = foundRanks.mean() if foundRanks.size else np.nan

    return top1Acc, top10Acc, meanRank

Writing func_emb_metrics.py


In [None]:
# Compute embedding metrics
# This script processes the output from the morpheme vector pipeline:
# 1) extracts the predicted neighbours
# 2) saves them as an Excel file
# 3) evaluates the neighbours using the func_emb_metrics function

# Named `resultFiles` (not `files`) so the Colab upload helper is not shadowed.
# Sorted so that "the first file" is the same file on every run.
resultFiles = sorted(glob.glob('morphoEmbVec*.pkl'))
if not resultFiles:
    raise FileNotFoundError(
        "No 'morphoEmbVec*.pkl' file found; run func_morpheme_vector_pipeline first."
    )
fileName = resultFiles[0]  # CHOOSE WHICH ONE YOU WANT TO EVALUATE
print(f"Processing: {fileName}")

# NOTE: unpickling can execute arbitrary code; only load pickles written by this notebook.
with open(fileName, 'rb') as f:
    data = pickle.load(f)

predictedNeighborsAll = data['predictedNeighborsAll']

# One row per sample: the expected form plus exactly 10 neighbour cells
# (padded with '' when fewer neighbours were stored).
numNeighbors = 10
expectedForms = [entry['testPlural'] for entry in predictedNeighborsAll]
neighborsTop10 = [
    (list(entry['neighbors'][:numNeighbors]) + [''] * numNeighbors)[:numNeighbors]
    for entry in predictedNeighborsAll
]

neighborHeaders = [f'Neighbor_{x+1}' for x in range(numNeighbors)]

# Create table
data_dict = {'Expected': expectedForms}
for col, header in enumerate(neighborHeaders):
    data_dict[header] = [row[col] for row in neighborsTop10]

T = pd.DataFrame(data_dict)

# Save as excel
baseName = fileName.replace('morphoEmbVec', '').replace('.pkl', '')
tableName = f'predNeighbors{baseName}.xlsx'
print(f"Saving table: {tableName}")
T.to_excel(tableName, index=False)

# Call the metrics function (Turkish as an example)
t1_tr, t10_tr, rank_tr = func_emb_metrics(tableName)
print(f'Turkish Top-1: {t1_tr*100:.2f}%, Top-10: {t10_tr*100:.2f}%, Mean Rank: {rank_tr:.2f}')