This Notebook shows the approach that was chosen for the practical examples in the study "Comprehensive Validation of Word Embeddings for Social Science Research"

In [2]:
### the following packages are needed
import os
import gensim
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from collections import Counter
from collections import defaultdict


In [3]:
#### needed functions

def load_model(file_path):
    """Load word embeddings from ``file_path``, choosing the reader by extension.

    ``.bin`` files are read with gensim's ``load_facebook_vectors``; ``.vec``
    files are read as non-binary word2vec text with
    ``KeyedVectors.load_word2vec_format``.

    Raises:
        ValueError: if ``file_path`` ends with neither ``.bin`` nor ``.vec``.
    """
    if file_path.endswith('.bin'):
        return gensim.models.fasttext.load_facebook_vectors(file_path)
    if file_path.endswith('.vec'):
        return gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=False)
    raise ValueError(f"Unsupported file format: {file_path}")

def get_nearest_neighbors(model, keyword, top_n=100):
    """Return the ``top_n`` entries most similar to ``keyword`` in ``model``.

    Delegates to ``model.most_similar`` and hands its result back unchanged.
    """
    nearest = model.most_similar(keyword, topn=top_n)
    return nearest

# Function to calculate percentage overlap
def calculate_percentage_overlap(df):
    """Return the pairwise percentage overlap between the columns of ``df``.

    Cell ``(row, col)`` of the result holds the share (0-100) of the distinct,
    non-missing values of column ``row`` that also occur in column ``col``.
    Because the denominator is the size of the row column's value set, the
    matrix is asymmetric whenever two columns hold a different number of
    distinct values.

    Missing values (``None``/``NaN``) are ignored so that padding cells are
    neither counted as a word nor as a shared word. A column without any
    values yields 0.0 for its whole row instead of raising
    ``ZeroDivisionError``.

    Args:
        df: DataFrame with one column per word list.

    Returns:
        A float DataFrame indexed and labelled by ``df.columns``.
    """
    # Build each column's value set once rather than once per column pair.
    column_sets = {col: set(df[col].dropna()) for col in df.columns}
    overlap_matrix = pd.DataFrame(index=df.columns, columns=df.columns, dtype=float)
    for col1, set1 in column_sets.items():
        for col2, set2 in column_sets.items():
            if set1:
                percentage_overlap = (len(set1 & set2) / len(set1)) * 100
            else:
                # Nothing to compare: define the overlap of an empty column as 0.
                percentage_overlap = 0.0
            overlap_matrix.loc[col1, col2] = percentage_overlap
    return overlap_matrix

# Function to count word presence
def count_word_presence(df):
    """Count in how many columns of ``df`` each word occurs.

    Missing values (``None``/``NaN``) are skipped, and a word that is repeated
    inside a single column is counted once for that column, so the count is
    the number of columns containing the word.

    Args:
        df: DataFrame with one column per word list.

    Returns:
        A ``(word_presence, word_models)`` tuple. ``word_presence`` maps each
        word to the number of columns containing it; ``word_models`` maps each
        word to the list of those column names, in column order.
    """
    word_models = {}
    for col in df.columns:
        # unique() keeps first-appearance order, so dict insertion order is stable.
        for word in df[col].dropna().unique():
            word_models.setdefault(word, []).append(col)
    word_presence = {word: len(cols) for word, cols in word_models.items()}
    return word_presence, word_models

# Function to filter word presence
def filter_word_presence(word_presence, min_models=3):
    """Keep only the entries of ``word_presence`` whose count is at least ``min_models``.

    Returns a new dict; the input mapping is left untouched.
    """
    kept = {}
    for word, count in word_presence.items():
        if count >= min_models:
            kept[word] = count
    return kept

# Function to plot heatmap with modified labels and custom color scheme
def plot_heatmap(overlap_matrix, filename):
    """Draw ``overlap_matrix`` as an annotated heatmap, save it as PDF and show it.

    Tick labels are the row/column labels shortened by ``short_labels``; cell
    annotations are printed without decimals. The figure is written to
    ``filename`` in PDF format before being displayed.
    """
    # Two-stop gradient running from white to the blue #0063A6.
    blue_scale = mcolors.LinearSegmentedColormap.from_list("custom_blue", ["#ffffff", "#0063A6"])
    column_labels = short_labels(overlap_matrix.columns)
    row_labels = short_labels(overlap_matrix.index)

    plt.figure(figsize=(20, 16))
    sns.heatmap(
        overlap_matrix,
        annot=True,
        fmt=".0f",
        cmap=blue_scale,
        xticklabels=column_labels,
        yticklabels=row_labels,
    )
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.savefig(filename, format='pdf')
    plt.show()

# Function to plot word presence with modified labels and filter
def plot_word_presence(word_presence, min_models, filename):
    """Bar-plot the counts of all words present in at least ``min_models`` models.

    The words are filtered with ``filter_word_presence``, ordered by descending
    count, plotted with the words on the x axis, saved to ``filename`` as PDF
    and then shown.
    """
    frequent_words = filter_word_presence(word_presence, min_models)
    counts = pd.DataFrame.from_dict(frequent_words, orient='index', columns=['count'])
    counts = counts.sort_values(by='count', ascending=False)

    plt.figure(figsize=(10, 8))
    sns.barplot(x=counts.index, y=counts['count'])
    plt.xticks(rotation=90)
    plt.savefig(filename, format='pdf')
    plt.show()

# Helper function to get first two words of column names
def short_labels(labels):
    """Shorten every label to its first two underscore-separated parts.

    Labels with fewer than two parts are returned unchanged.
    """
    shortened = []
    for label in labels:
        leading_parts = label.split('_')[:2]
        shortened.append('_'.join(leading_parts))
    return shortened

# Function to calculate percentage overlap and average overlap
def calculate_percentage_overlap_and_average(df):
    """Return the pairwise percentage overlap of ``df``'s columns and print its mean.

    Cell ``(row, col)`` of the result holds the share (0-100) of the distinct,
    non-missing values of column ``row`` that also occur in column ``col``.
    The average over all ordered pairs of *different* columns is printed as
    ``Average overlap: <value>%``; it is not returned.

    Missing values (``None``/``NaN``) are ignored so that padding cells are
    neither counted as a word nor as a shared word. A column without any
    values contributes 0.0 instead of raising ``ZeroDivisionError``.

    Args:
        df: DataFrame with one column per word list.

    Returns:
        A float DataFrame indexed and labelled by ``df.columns``.
    """
    # Build each column's value set once rather than once per column pair.
    column_sets = {col: set(df[col].dropna()) for col in df.columns}
    overlap_matrix = pd.DataFrame(index=df.columns, columns=df.columns, dtype=float)
    total_overlap = 0
    num_pairs = 0

    for col1, set1 in column_sets.items():
        for col2, set2 in column_sets.items():
            if set1:
                percentage_overlap = (len(set1 & set2) / len(set1)) * 100
            else:
                # Nothing to compare: define the overlap of an empty column as 0.
                percentage_overlap = 0.0
            overlap_matrix.loc[col1, col2] = percentage_overlap

            # The diagonal (a column against itself) is excluded from the mean.
            if col1 != col2:
                total_overlap += percentage_overlap
                num_pairs += 1

    average_overlap = total_overlap / num_pairs if num_pairs > 0 else 0
    print(f"Average overlap: {average_overlap:.2f}%")

    return overlap_matrix

def find_words_in_all_columns(words_table):
    """Group the words of ``words_table`` by the number of columns they occur in.

    Missing values are ignored. The result is a ``defaultdict(list)`` mapping
    a column count to a list of ``(word, columns)`` tuples, where ``columns``
    is the set of column names that contain the word. Looking up a count that
    no word has yields an empty list.
    """
    # First pass: for every word, remember the set of columns it was seen in.
    columns_per_word = {}
    for column_name in words_table.columns:
        for word in words_table[column_name].dropna():
            columns_per_word.setdefault(word, set()).add(column_name)

    # Second pass: bucket the words by how many columns they were seen in.
    words_by_column_count = defaultdict(list)
    for word, column_set in columns_per_word.items():
        words_by_column_count[len(column_set)].append((word, column_set))
    return words_by_column_count


In [None]:
# Define keywords and their variants (lowercase form, cased form)
keywords = {
    'frau': ('frau', 'Frau'),
    'femizid': ('femizid', 'Femizid')
}

model_dir = 'path_to_models'
file_type = '.vec'

# Loop through each keyword
# NOTE(review): every model file is loaded again for each keyword; if loading
# dominates the runtime, consider iterating over models in the outer loop.
for keyword, (keyword_lower, keyword_cased) in keywords.items():
    # One record per (model, neighbor) pair. The table is built once after the
    # loop: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    neighbor_rows = []

    # Frequency of each (lowercased) neighbor across all models
    neighbor_counter = Counter()

    # Load each model and get nearest neighbors.
    # sorted() makes the processing order independent of the file system.
    for model_file in sorted(os.listdir(model_dir)):
        if model_file.endswith(file_type):
            model_path = os.path.join(model_dir, model_file)
            model_name = os.path.splitext(model_file)[0]
            print(f"Processing model: {model_name} for keyword: {keyword}")

            # Models with 'lower' in their file name are queried with the
            # lowercase keyword, all others with the cased keyword.
            if 'lower' in model_name:
                selected_keyword = keyword_lower
            else:
                selected_keyword = keyword_cased

            model = load_model(model_path)
            neighbors = get_nearest_neighbors(model, selected_keyword)

            for neighbor, similarity in neighbors:
                # Lowercase so neighbors from cased and lowercased models compare equal
                neighbor_lower = neighbor.lower()
                neighbor_rows.append({'Model': model_name, 'Neighbor': neighbor_lower, 'Similarity': similarity})
                neighbor_counter[neighbor_lower] += 1

    all_neighbors = pd.DataFrame(neighbor_rows, columns=['Model', 'Neighbor', 'Similarity'])

    # Save the neighbors to a CSV file
    all_neighbors.to_csv(f'table_allneighbours_{keyword}.csv', index=False)

    # Save the frequency count of neighbors (explicit UTF-8: the words are
    # German and the platform default encoding may not cover them)
    with open(f'neighbor_counts_{keyword}.txt', 'w', encoding='utf-8') as f:
        for neighbor, count in neighbor_counter.most_common():
            f.write(f"{neighbor}: {count}\n")

    ### Prepare Table of Nearest Neighbors for Analysis

    df = pd.read_csv(f'table_allneighbours_{keyword}.csv')

    # Count of all words in the Neighbor column
    word_count = df['Neighbor'].nunique()

    # Creating a new table with each model as a column
    grouped_df = df.groupby('Model')['Neighbor'].apply(list).reset_index()

    # Convert the lists to columns, padding shorter lists with None
    max_neighbors = grouped_df['Neighbor'].apply(len).max()
    new_df = pd.DataFrame({model: model_neighbors + [None] * (max_neighbors - len(model_neighbors))
                        for model, model_neighbors in zip(grouped_df['Model'], grouped_df['Neighbor'])})

    # Double transpose with a reset in between: replaces the column labels by
    # positions before the model names are assigned below
    new_df = new_df.T.reset_index(drop=True).T

    # Assign columns names
    new_df.columns = grouped_df['Model']

    print(f"Count of all unique words: {word_count}")
    new_df.to_csv(f'table_allneighbours_{keyword}_pivot.csv')

    # At this point the 'neighbor_counts_{keyword}.txt' file was used to
    # manually create a dictionary, which serves to combine words that are the
    # same except for their grammatical structure (e.g. "woman" and "women").

    # Load data (f-string: the pivot table written above for this keyword)
    df = pd.read_csv(f'table_allneighbours_{keyword}_pivot.csv', sep=',', index_col='Unnamed: 0')

    # Load the manual lemmatization dictionary
    wf = pd.read_csv('path_to_dict', sep=';')
    lemma_dict = dict(zip(wf['original_word'], wf['lemma_word']))

    def translate_word(word):
        return lemma_dict.get(word, word)  # Return the original word if not found in dictionary

    # Apply the translation function to every cell of the table.
    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated in recent pandas versions.
    lemmatized_df = df.apply(lambda column: column.map(translate_word))

    # Calculate percentage overlap for original data
    original_percentage_overlap = calculate_percentage_overlap(df)
    plot_heatmap(original_percentage_overlap, f'original_overlap_heatmap_{keyword}.pdf')

    # Calculate percentage overlap for the lemmatized neighbors
    lemmatized_percentage_overlap = calculate_percentage_overlap(lemmatized_df)
    plot_heatmap(lemmatized_percentage_overlap, f'lemmatized_overlap_heatmap_{keyword}.pdf')

    # Calculate word presence for original data
    original_word_presence, original_word_models = count_word_presence(df)
    plot_word_presence(original_word_presence, min_models=16, filename=f'original_word_presence_{keyword}.pdf')

    ###### some code parts to get descriptives
    words_in_all_columns = find_words_in_all_columns(lemmatized_df)

    total_words = 0
    for column_count in sorted(words_in_all_columns.keys(), reverse=True):
        words_list = words_in_all_columns[column_count]
        total_words += len(words_list)
        print(f"The following words are in {column_count} columns (Total: {len(words_list)}):")
        for word, columns in words_list:
            print(f"  Word: {word} appears in columns: {', '.join(columns)}")
        print()

    print(f"Total number of words for {keyword}: {total_words}")

    # words in half the columns or fewer
    # (words_in_all_columns is a defaultdict, so counts without words give [])
    half_columns = len(lemmatized_df.columns) // 2
    words_in_half_or_less = sum(len(words_in_all_columns[count]) for count in range(1, half_columns + 1))

    print(f"Number of words that are in half the columns or fewer for {keyword}: {words_in_half_or_less}")

    # words in more than 75% of the columns
    seventy_five_percent_columns = len(lemmatized_df.columns) * 0.75
    words_in_seventy_five_percent_or_more = sum(len(words_in_all_columns[count]) for count in range(int(seventy_five_percent_columns) + 1, len(lemmatized_df.columns) + 1))

    print(f"Number of words that are in more than 75% of the columns for {keyword}: {words_in_seventy_five_percent_or_more}")

    # average overlap (printed by the function; the returned matrix is not needed here)
    calculate_percentage_overlap_and_average(lemmatized_df)
