<a href="https://colab.research.google.com/github/enesbol/Lexicon_Extension/blob/main/Lexicon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import gensim.downloader

In [2]:
model = gensim.downloader.load('glove-twitter-100')

In [3]:
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [4]:
# nlp = spacy.load("en_core_web_md")  # Load medium-sized English model
nlp = spacy.load("en_core_web_lg")

In [23]:
lexicon_df = pd.read_excel("/content/Consumer_Adjective_Clean.xlsx")

# trailing whitespace in column names in lexicon_df
lexicon_df.columns = lexicon_df.columns.str.strip()

In [24]:
base_vector_dict = {}

#Finally, list(dict.fromkeys(...)) is used to remove duplicates from the resulting list, ensuring only unique values are present in string_list.
for col in lexicon_df:
    # Create a list of unique lowercase, stripped, and asterisk-free values from the column
    string_list = list(dict.fromkeys([re.sub(r'[^a-zA-Z]', '', str(x).lower().strip().replace('*', '')) for x in lexicon_df[col].dropna().tolist()]))

    # Convert the string list into a NumPy array of spaCy word vectors
    string_list_vectors = np.array([nlp(s).vector for s in string_list])

    # Add the column name and its corresponding array of word vectors to the base_vector_dict
    base_vector_dict[col + "_base_vector"] = string_list_vectors

In [38]:
base_vector_dict

{'Spatial_base_vector': array([[-0.068492,  5.7971  , -0.71815 , ..., -1.8815  , -2.7432  ,
          2.6032  ],
        [-4.5671  , -0.72469 ,  0.35116 , ..., -1.8325  , -0.47618 ,
          2.9645  ],
        [-0.79739 , -2.5883  , -1.9291  , ...,  1.206   ,  0.89389 ,
         -0.142   ],
        ...,
        [ 1.5942  ,  2.4367  ,  2.4225  , ..., -1.5043  , -1.0966  ,
          1.6429  ],
        [-4.3635  ,  0.42556 , -2.5431  , ..., -5.0893  , -0.29509 ,
          1.7555  ],
        [-3.4736  ,  1.2572  ,  0.037179, ...,  0.12902 , -2.187   ,
          0.64316 ]], dtype=float32),
 'Normative_base_vector': array([[-0.41948 , -0.056428, -1.8163  , ...,  0.78343 , -4.5137  ,
          0.66713 ],
        [ 0.73066 ,  0.58313 ,  2.6591  , ..., -0.1572  , -4.7668  ,
         -0.59006 ],
        [ 1.1006  , -1.0492  , -2.8242  , ...,  1.4446  , -2.2498  ,
          0.48874 ],
        ...,
        [ 0.21383 ,  1.1599  , -3.1271  , ..., -1.4491  , -6.6107  ,
          1.4956  ],
        [

In [40]:
from tqdm import tqdm
tqdm.pandas()

# Create empty lists for potential words and out-of-vocabulary words
potential_words = []
out_words = []

# Create an empty DataFrame to store potential words
potential_words_df = pd.DataFrame()

# Iterate over each column in the lexicon_df
for col in tqdm(lexicon_df.columns):
    # Clear the potential_words list for each column
    potential_words = []

    # Clean and preprocess the words in the column
    sp = list(dict.fromkeys([re.sub(r'[^a-zA-Z]', '', str(x).lower().strip().replace('*', '')) for x in lexicon_df[col].dropna().tolist()]))

    # Iterate over each word in the cleaned list
    for word in sp:
        try:
            # Retrieve similar words for the current word using the pre-trained model.
            # Set topn parameter to 100 for producing 100 words for each word at lexicon.
            similar_words = model.most_similar(word, topn = 250)
            potential_words.append(similar_words)
        except KeyError:
            # If the word is not found in the model's vocabulary, add it to the out_words list
            out_words.append(word)

    # Extract only the words from the potential_words list and filter out non-Latin characters
    words_only = [word for sublist in potential_words for word, _ in sublist]
    latin_words = [word for word in words_only if re.match(r'^[a-zA-Z]+$', word)]
    latin_words = list(set(latin_words))

    # Add the potential words as a new column in the potential_words_df DataFrame
    potential_words_df["potential_"+col] = pd.Series(latin_words)


100%|██████████| 3/3 [00:15<00:00,  5.14s/it]


In [41]:
potential_words_df

Unnamed: 0,potential_Spatial,potential_Normative,potential_Intensity
0,societies,societies,minutes
1,anything,minutes,anything
2,kombat,campaig,seriousness
3,argosy,anything,unstructured
4,cave,bratz,nursingschoolproblems
...,...,...,...
11572,axed,,
11573,waynesville,,
11574,tended,,
11575,ago,,


In [42]:
# Iterate through each column in potential_words_df
for col in potential_words_df:
    potential_words_df[col+"_vectorized"] = pd.Series(potential_words_df[col].fillna('').progress_apply(lambda x: nlp(x).vector))

# Create a new column in potential_words_df with "_vectorized" suffix and populate it with the vectorized representation
# of each element in the corresponding column

100%|██████████| 11577/11577 [01:13<00:00, 157.33it/s]
100%|██████████| 11577/11577 [01:08<00:00, 168.62it/s]
100%|██████████| 11577/11577 [00:51<00:00, 224.31it/s] 


In [43]:
potential_words_df.head()

Unnamed: 0,potential_Spatial,potential_Normative,potential_Intensity,potential_Spatial_vectorized,potential_Normative_vectorized,potential_Intensity_vectorized
0,societies,societies,minutes,"[-0.88241, -1.4953, -0.46298, 0.54697, 5.5311,...","[-0.88241, -1.4953, -0.46298, 0.54697, 5.5311,...","[-1.6567, 3.6617, -0.31617, -0.80445, 2.2406, ..."
1,anything,minutes,anything,"[0.3198, 2.4962, -4.2347, -2.8765, -1.6926, -0...","[-1.6567, 3.6617, -0.31617, -0.80445, 2.2406, ...","[0.3198, 2.4962, -4.2347, -2.8765, -1.6926, -0..."
2,kombat,campaig,seriousness,"[1.4838, -0.83254, -0.25512, 3.2036, 1.2825, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.0254, 0.4329, -1.843, 0.43026, 0.97198, 0.8..."
3,argosy,anything,unstructured,"[-0.17297, 0.21296, -0.18709, -0.15855, 0.1219...","[0.3198, 2.4962, -4.2347, -2.8765, -1.6926, -0...","[-0.9907, -0.39896, 0.63055, -0.48094, 2.8306,..."
4,cave,bratz,nursingschoolproblems,"[3.9985, 0.55137, 0.050358, -3.3203, -1.3749, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [30]:
"""
The variable base_vector_dict is a dictionary that stores the base vectors for each lexicon.
In this case, base_vector_dict['Spatial_base_vector'] contains the vector representations of
all the words from the "Spatial" lexicon. These vector representations are obtained by vectorizing
the words using a pre-trained word embedding model.

To access a specific word vector, we can use indexing. In the given code, [90] is used to access the
91st word's vector representation from the "Spatial" lexicon. By calling .shape on this vector, we obtain
its shape, which in this case is (300,).

The shape (300,) indicates that the vector representation of the 91st word in the "Spatial" lexicon has a length of 300.
This means that the word is represented by a 300-dimensional vector, where each dimension captures different aspects of
the word's meaning or context.
"""

'\nThe variable base_vector_dict is a dictionary that stores the base vectors for each lexicon.\nIn this case, base_vector_dict[\'Spatial_base_vector\'] contains the vector representations of\nall the words from the "Spatial" lexicon. These vector representations are obtained by vectorizing\nthe words using a pre-trained word embedding model.\n\nTo access a specific word vector, we can use indexing. In the given code, [90] is used to access the\n91st word\'s vector representation from the "Spatial" lexicon. By calling .shape on this vector, we obtain\nits shape, which in this case is (300,).\n\nThe shape (300,) indicates that the vector representation of the 91st word in the "Spatial" lexicon has a length of 300.\nThis means that the word is represented by a 300-dimensional vector, where each dimension captures different aspects of\nthe word\'s meaning or context.\n'

In [31]:
base_vector_dict['Spatial_base_vector'][90].shape

(300,)

In [32]:
base_vector_dict['Spatial_base_vector'][90]

array([-3.4736   ,  1.2572   ,  0.037179 , -3.447    ,  2.3293   ,
        0.78382  ,  3.0362   ,  2.8004   ,  1.176    ,  0.77362  ,
       -0.13223  ,  0.65523  , -2.3919   ,  1.7062   , -0.11605  ,
       -0.62067  ,  0.090005 ,  0.41134  ,  1.1019   , -2.9679   ,
       -0.21581  , -1.6597   , -0.20205  ,  2.0588   , -0.30373  ,
       -1.2938   , -3.6779   , -1.1779   , -0.58174  ,  2.5519   ,
        0.03169  , -0.27004  , -0.086378 , -1.6005   , -4.739    ,
       -2.6656   , -1.8855   ,  2.0184   ,  0.51353  ,  0.048929 ,
       -1.3999   , -0.87185  , -0.76266  ,  2.0184   , -4.1461   ,
        2.3547   , -1.2481   , -0.89289  ,  0.17742  ,  1.9663   ,
        1.7749   ,  0.44471  , -0.79376  , -3.9409   , -0.1602   ,
        0.18716  ,  1.0368   , -2.145    ,  3.8091   ,  3.0528   ,
        0.084441 , -3.8565   ,  2.8445   , -0.89202  ,  0.10439  ,
       -0.16368  ,  1.7791   ,  0.11798  ,  2.8873   , -0.77619  ,
       -0.56284  , -2.1953   , -0.48176  ,  1.1212   ,  2.5832

In [46]:
def compute_mean_cosine_sim(row):
    # Compute the mean cosine similarity between a potential word vector and a set of base vectors

    potential_vector = row  # Store the potential word vector
    base_vectors = base_vector_dict[col.split('_')[1] + '_base_vector']  # Retrieve the corresponding base vectors

    if isinstance(base_vectors, list):
        base_vectors = np.array(base_vectors)  # Convert base vectors to NumPy array if it's a list
    if len(base_vectors) == 0:
        return 0  # Return 0 if there are no base vectors for comparison

    try:
        cos_sim = cosine_similarity(potential_vector.reshape(1, -1), base_vectors)  # Compute cosine similarity
    except:
        return 0  # Return 0 if an exception occurs during computation

    cos_sim = np.abs(cos_sim)  # Take the absolute value of cosine similarity

    return np.sum(cos_sim)  # Return the sum of cosine similarities


x = ['potential_Spatial_vectorized', 'potential_Normative_vectorized', 'potential_Intensity_vectorized']

for col in x:
    colname = col.split('_')[1] + '_score'  # Generate the score column name based on the lexicon category
    potential_words_df[colname] = potential_words_df[col].progress_apply(compute_mean_cosine_sim)

100%|██████████| 11577/11577 [00:06<00:00, 1763.15it/s]
100%|██████████| 11577/11577 [00:08<00:00, 1406.74it/s]
100%|██████████| 11577/11577 [00:06<00:00, 1876.85it/s]


In [47]:
potential_words_df

Unnamed: 0,potential_Spatial,potential_Normative,potential_Intensity,potential_Spatial_vectorized,potential_Normative_vectorized,potential_Intensity_vectorized,Spatial_score,Normative_score,Intensity_score
0,societies,societies,minutes,"[-0.88241, -1.4953, -0.46298, 0.54697, 5.5311,...","[-0.88241, -1.4953, -0.46298, 0.54697, 5.5311,...","[-1.6567, 3.6617, -0.31617, -0.80445, 2.2406, ...",23.240526,23.444916,10.505219
1,anything,minutes,anything,"[0.3198, 2.4962, -4.2347, -2.8765, -1.6926, -0...","[-1.6567, 3.6617, -0.31617, -0.80445, 2.2406, ...","[0.3198, 2.4962, -4.2347, -2.8765, -1.6926, -0...",17.658447,6.652695,25.108273
2,kombat,campaig,seriousness,"[1.4838, -0.83254, -0.25512, 3.2036, 1.2825, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.0254, 0.4329, -1.843, 0.43026, 0.97198, 0.8...",12.606293,0.000000,28.736338
3,argosy,anything,unstructured,"[-0.17297, 0.21296, -0.18709, -0.15855, 0.1219...","[0.3198, 2.4962, -4.2347, -2.8765, -1.6926, -0...","[-0.9907, -0.39896, 0.63055, -0.48094, 2.8306,...",9.011262,20.438667,24.148090
4,cave,bratz,nursingschoolproblems,"[3.9985, 0.55137, 0.050358, -3.3203, -1.3749, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",24.486513,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
11572,axed,,,"[-1.6742, -1.7183, -0.87842, 1.2215, -0.16315,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.395191,0.000000,0.000000
11573,waynesville,,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000,0.000000,0.000000
11574,tended,,,"[-2.8258, 0.64953, -1.2183, 1.2276, 4.2964, -2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",21.751247,0.000000,0.000000
11575,ago,,,"[-5.1593, -5.4345, -7.9252, -5.1833, 5.0262, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.887825,0.000000,0.000000


In [49]:
def clean_list(col):
    # Clean and preprocess the values in a column of the lexicon dataframe
    sp_list = list(dict.fromkeys([re.sub(r'[^a-zA-Z]', '', str(x).lower().strip().replace('*', '')) for x in lexicon_df[col].dropna().tolist()]))
    # Remove non-alphabetic characters, convert to lowercase, strip leading/trailing whitespace, and remove duplicates
    return sp_list

columns = ['Spatial', 'Normative', 'Intensity']
extended_lexicons = []  # List to store extended lexicons for each column

for col in columns:
    sp = clean_list(col)  # Clean the values in the column
    potential_col = 'potential_' + col  # Generate the name of the potential column
    extended_lexicon = potential_words_df[~potential_words_df[potential_col].isin(sp)].sort_values(col + '_score', ascending=False)[[potential_col, col + '_score']]
    # Filter potential words not present in the lexicon, sort by score in descending order, and select relevant columns
    extended_lexicons.append(extended_lexicon)  # Append the extended lexicon to the list

In [50]:
extended_lexicons[0].head()

Unnamed: 0,potential_Spatial,Spatial_score
10966,sheltered,33.443504
10726,underground,33.051483
6330,confined,31.331417
4102,somwhere,31.297007
6562,surrounding,31.217823


In [51]:
sorted_extended_lexicons = []  # List to store sorted extended lexicons

for lexicon in extended_lexicons:
    sorted_lexicon = lexicon.sort_values(by=[lexicon.columns[1]], ascending=False).reset_index(drop=True)
    # Sort the lexicon DataFrame by the score column in descending order and reset the index
    sorted_extended_lexicons.append(sorted_lexicon)  # Append the sorted lexicon to the list

combined_extended_lexicon = pd.concat(sorted_extended_lexicons, axis=1)
# Concatenate the sorted extended lexicons along the columns axis to create the combined extended lexicon DataFrame

In [54]:
combined_extended_lexicon.to_excel("Result_Lexicons_with_Scores.xlsx") # save to excel.

In [55]:
combined_extended_lexicon

Unnamed: 0,potential_Spatial,Spatial_score,potential_Normative,Normative_score,potential_Intensity,Intensity_score
0,sheltered,33.443504,fascinating,34.617973,fascinating,40.046127
1,underground,33.051483,fantastical,34.385860,forgettable,39.513187
2,confined,31.331417,mystical,33.874863,unexciting,39.440746
3,somwhere,31.297007,imaginary,33.469307,exciting,38.967720
4,surrounding,31.217823,magical,33.351353,unenjoyable,38.967373
...,...,...,...,...,...,...
11511,galapagos,0.000000,,0.000000,,
11512,,,,0.000000,,
11513,,,,0.000000,,
11514,,,,0.000000,,
