In [28]:
#Load packeges:

import spacy
import pandas as pd
from sentence_transformers import SentenceTransformer 
import kagglehub
from gensim.models import KeyedVectors
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


#Model to extract class of words
nlp = spacy.load("en_core_web_sm")

#Model for embedding
ST_model = SentenceTransformer('all-MiniLM-L6-v2')
#ST_model = SentenceTransformer('all-mpnet-base-v2')


In [29]:
# Word embedding:

def embedding_function(word):
    embedded_word = ST_model.encode(word)
    return embedded_word


In [30]:
# load list of words
with open("data/large_word_list.txt", 'r') as file:
    word_list = [line.strip() for line in file]  # Remove trailing newlines and spaces



In [31]:
print(len(word_list))

10000


In [32]:
#Extracts the nouns and reduces words to base form
nouns = []

for word in word_list:
    doc = nlp(word)  
    for token in doc:  # Iterate through tokens in the processed word
        word_base = token.lemma_  # Extractz the base form 
        pos_tag = token.pos_  # Extract word class tag
        if pos_tag == "NOUN":
            nouns.append(word_base)  # Append to nouns if it's a noun

# Removes dublicates from list:
nouns = list(set(nouns))

In [33]:

print(len(nouns))

3595


In [34]:
#Embedding all the nouns and adding both embeddings and word to lists

embeddings = []
embedded_words = []

for word in nouns:
    try:
        current_embedded_word = ST_model.encode(word)
        embeddings.append(current_embedded_word)
        embedded_words.append(word)
    except Exception as e:
        print(f"Skipping word: {word}")

In [35]:

# Step 2: Embed the target words
word1 = "groundwater"
word2 = "earth"

combined_embedding = word1 + " and " + word2 + " combinned"
combined_embedding = embedding_function(combined_embedding)

#embedding1 = embedding_function(word1)
#embedding2 = embedding_function(word2)

# Normalize individual embeddings
#embedding1 = embedding1 / np.linalg.norm(embedding1)
#embedding2 = embedding2 / np.linalg.norm(embedding2)

# Combine embeddings with normalization
#combined_embedding = embedding1 + embedding2
#combined_embedding = combined_embedding / np.linalg.norm(combined_embedding)

# Compute cosine similarities with filtered embeddings
similarities = cosine_similarity([combined_embedding], embeddings)

# Sort indices to get top 10 matches
sorted_indices = np.argsort(similarities[0])[::-1][:10]  # Top 10 matches

# Retrieve the top words and their corresponding similarity scores
top_words = [embedded_words[i] for i in sorted_indices]
top_similarities = [similarities[0][i] for i in sorted_indices]

# Print results with cosine similarity
print("Top 10 closest words with cosine similarity:")
for word, similarity in zip(top_words, top_similarities):
    print(f"{word}: {similarity:.4f}")

if top_words[0] not in [word1, word2]:
    top_word = top_words[0]
elif top_words[1] not in [word1, word2]:
    top_word = top_words[1]
else:
    top_word = top_words[2]
    



print(f"Top word is: {top_word}") 

Top 10 closest words with cosine similarity:
groundwater: 0.8155
soil: 0.4909
geology: 0.4371
drainage: 0.4343
irrigation: 0.4282
earth: 0.4258
ground: 0.4239
precipitation: 0.4133
fountain: 0.3675
basin: 0.3658
Top word is: soil


In [12]:
#save word
np.save("embedded_words_ST.npy", embedded_words)
np.save("embeddings_ST.npy", embeddings)

In [8]:
print(len(embeddings[2]))

384


In [13]:
emoji_df = pd.read_csv("data/full_emoji.csv")

In [15]:
emoji_list = emoji_df['emoji'].tolist()

In [18]:
print(emoji_list[3:1000])

['😁', '😆', '😅', '🤣', '😂', '🙂', '🙃', '😉', '😊', '😇', '🥰', '😍', '🤩', '😘', '😗', '☺', '😚', '😙', '🥲', '😋', '😛', '😜', '🤪', '😝', '🤑', '🤗', '🤭', '🤫', '🤔', '🤐', '🤨', '😐', '😑', '😶', '😶\u200d🌫️', '😏', '😒', '🙄', '😬', '😮\u200d💨', '🤥', '😌', '😔', '😪', '🤤', '😴', '😷', '🤒', '🤕', '🤢', '🤮', '🤧', '🥵', '🥶', '🥴', '😵', '😵\u200d💫', '🤯', '🤠', '🥳', '🥸', '😎', '🤓', '🧐', '😕', '😟', '🙁', '☹', '😮', '😯', '😲', '😳', '🥺', '😦', '😧', '😨', '😰', '😥', '😢', '😭', '😱', '😖', '😣', '😞', '😓', '😩', '😫', '🥱', '😤', '😡', '😠', '🤬', '😈', '👿', '💀', '☠', '💩', '🤡', '👹', '👺', '👻', '👽', '👾', '🤖', '😺', '😸', '😹', '😻', '😼', '😽', '🙀', '😿', '😾', '🙈', '🙉', '🙊', '💋', '💌', '💘', '💝', '💖', '💗', '💓', '💞', '💕', '💟', '❣', '💔', '❤️\u200d🔥', '❤️\u200d🩹', '❤', '🧡', '💛', '💚', '💙', '💜', '🤎', '🖤', '🤍', '💯', '💢', '💥', '💫', '💦', '💨', '🕳', '💣', '💬', '👁️\u200d🗨️', '🗨', '🗯', '💭', '💤', '👋', '🤚', '🖐', '✋', '🖖', '👌', '🤌', '🤏', '✌', '🤞', '🤟', '🤘', '🤙', '👈', '👉', '👆', '🖕', '👇', '☝', '👍', '👎', '✊', '👊', '🤛', '🤜', '👏', '🙌', '👐', '🤲', '🤝', '🙏', '✍', '💅', '🤳', '💪', '🦾', '🦿', '🦵

In [51]:
df_ST = pd.read_csv("ST_score.csv", delimiter=";")

In [52]:
df_ST = df.drop_duplicates()

In [59]:
df_ST

Unnamed: 0,element1,element2,new_element,score,notes
0,water,water,liquid,2,
1,water,wind,rain,2,
2,water,earth,groundwater,2,
3,water,fire,flame,1,Water can be extinguished by fire
4,water,liquid,mixture,2,
...,...,...,...,...,...
200,precipitation,irrigation,rain,2,
201,precipitation,agriculture,irrigation,2,
202,precipitation,air,weather,2,
203,precipitation,atmosphere,climate,2,


In [63]:
unique_values_count = df_ST['new_element'].nunique()
print(unique_values_count)

37


In [68]:
df_W2V = pd.read_csv("W2V_score.csv", delimiter=";")

In [69]:
df_W2V = df.drop_duplicates()

In [70]:
df_W2V

Unnamed: 0,element1,element2,new_element,score,notes
0,water,water,liquid,2,
1,water,wind,rain,2,
2,water,earth,groundwater,2,
3,water,fire,flame,1,Water can be extinguished by fire
4,water,liquid,mixture,2,
...,...,...,...,...,...
200,precipitation,irrigation,rain,2,
201,precipitation,agriculture,irrigation,2,
202,precipitation,air,weather,2,
203,precipitation,atmosphere,climate,2,


In [71]:
unique_values_count = df_W2V['new_element'].nunique()
print(unique_values_count)

37
