In [None]:
import pandas as pd
import os
import numpy as np
import statistics
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
import ast

# Run from the project root so that the relative "data/..." paths used below resolve.
# The root can be overridden with the ADA_PROJECT_ROOT environment variable; the
# original checkout location stays the default so existing setups keep working.
os.chdir(os.environ.get("ADA_PROJECT_ROOT", "/Users/rapha/EPFL/ADA/ada-2024-project-theadacuates"))

# Load data

In [None]:
# Load the processed navigation paths.
df_paths = pd.read_csv("data/output/base_data/all_articles_processed.csv")

# "path" is stored as a ';'-separated string; split it into a list of entries.
df_paths['path'] = df_paths['path'].str.split(';')

# These columns are parsed from text back into Python objects
# (presumably lists serialised by an earlier processing step).
cols_to_convert = ["path_list", "path_list_id", "resolved_path_list_id", "resolved_path_list_name"]

for col in cols_to_convert:
    df_paths[col] = df_paths[col].apply(ast.literal_eval)

# Split into finished / unfinished paths only AFTER the columns above are parsed:
# boolean-mask .loc returns a copy, so subsets taken earlier would still carry the
# raw, unparsed strings.
df_pf = df_paths.loc[df_paths["finished"] == True]
df_uf = df_paths.loc[df_paths["finished"] == False]

# Article table; its "article_name" column labels the similarity matrix built later.
df_names = pd.read_csv("data/output/base_data/articles_processed.csv")

df_paths.head(2)

# Semantic similarity with transformers

We will now analyse the paths using semantic similarity computed with SentenceTransformer, which converts text into multidimensional vectors that can then be compared by taking the angle between them (cosine similarity).

In [None]:
# Sentence-embedding model used for every similarity computation in this notebook.
from sentence_transformers import SentenceTransformer

# Load the pretrained "all-MiniLM-L6-v2" checkpoint by name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Similarity function applied by model.similarity(); cosine compares the angle between embeddings.
model.similarity_fn_name = "cosine" # valid options are “cosine”, “dot”, “euclidean”, and "manhattan"

In [None]:
# Sanity check of the model on a handful of hand-picked phrases.
examples1 = [
    "William Shakespeare",
    "playwright",
    "Romeo and Juliet",
    "17th century",
    "love",
    "Data analysis",
]
embeddings1 = model.encode(examples1)

# Pairwise similarity of every phrase with every other, labelled on both axes.
similarities = pd.DataFrame(
    model.similarity(embeddings1, embeddings1),
    index=examples1,
    columns=examples1,
)

sns.heatmap(similarities)
plt.show()

This example shows semantic similarity at work. The results are about what we would expect: William Shakespeare is semantically close to playwright and to Romeo and Juliet, but has nothing to do with Data analysis. We can start to see how this could be used to analyse similarity between articles. Obviously this isn't perfect, as it doesn't capture more complex relationships, such as the one between "Romeo and Juliet" and "love", which one could expect to be closely related.

In [None]:
# On-disk cache of the article-by-article similarity matrix (relative to the working directory).
similarity_matrix_path = "data/output/semantic_similarity_data/similarity_matrix.csv"

def clamp(n, min, max):
    """Restrict ``n`` to the closed interval [``min``, ``max``].

    Returns ``min`` when ``n`` is below it, ``max`` when ``n`` is above it,
    and ``n`` itself otherwise. The parameter names shadow the built-ins
    inside this function only.
    """
    return min if n < min else (max if n > max else n)

# Build the article-by-article similarity matrix once and cache it as CSV;
# later runs only read the cached file.
if not os.path.exists(similarity_matrix_path):
    # Hand the model a plain list of strings rather than a pandas Series.
    embeddings = model.encode(df_names["article_name"].tolist())
    num_articles = len(embeddings)
    matrix = np.zeros(shape=(num_articles, num_articles))

    for i in range(num_articles):
        # Similarity of article i to articles 0..i (the lower triangle of row i).
        row = np.asarray(model.similarity([embeddings[i]], embeddings[0:(i+1)])[0])
        # Clip to [0, 1] so that "1 - similarity" can be used as a distance later on.
        row = np.clip(row, 0, 1)
        # Fill row i and mirror it into column i so the matrix is exactly symmetric.
        matrix[i, :i + 1] = row
        matrix[:i + 1, i] = row

    matrix = pd.DataFrame(matrix, index = df_names["article_name"], columns=df_names["article_name"])
    # Make sure the cache directory exists before writing (no-op if it already does).
    os.makedirs(os.path.dirname(similarity_matrix_path), exist_ok=True)
    matrix.to_csv(similarity_matrix_path)

# No comment character is configured on purpose: to_csv never writes comment lines,
# and treating '#' as one could truncate a row or header at an unquoted article name
# that contains '#'.
similarity_matrix = pd.read_csv(similarity_matrix_path, sep=',', header=0, index_col="article_name")
display(similarity_matrix)

def SemanticSimilarity(article_from, article_to):
    """Look up the precomputed similarity between two articles.

    Reads ``similarity_matrix`` (column ``article_from``, row ``article_to``).

    Returns:
        The stored similarity, or ``np.nan`` (after printing a notice) when
        either article cannot be looked up in the matrix.
    """
    try:
        return similarity_matrix[article_from][article_to]
    # Only lookup failures are treated as "unknown pair": KeyError for a missing
    # label, TypeError for an unhashable one. Anything else (including
    # KeyboardInterrupt) propagates instead of being silently turned into NaN.
    except (KeyError, TypeError):
        print("Cannot find simlarity between", article_from, "and", article_to)
        return np.nan

## Semantic shift

Semantic shift denotes the semantic distance between successive articles. Large semantic shifts between articles can represent a higher cognitive load for the player (TODO: add source). We will analyse this semantic shift along paths to see whether it has an impact on whether the target is reached.

In [None]:
def CreateSemanticShiftList(row):
    """Return the semantic shift of every step along a path.

    ``row.resolved_path_list_name`` is read as the ordered list of articles.
    For each pair of consecutive articles the shift is ``1 - similarity``,
    i.e. how different the two articles are. A path with fewer than two
    articles yields an empty list.
    """
    articles = row.resolved_path_list_name
    return [
        1 - SemanticSimilarity(current, following)
        for current, following in zip(articles, articles[1:])
    ]

# Per-step semantic shift of every path, then its mean over the path.
df_paths["path_semantic_shift"] = df_paths.apply(CreateSemanticShiftList, axis = 1)
# A path with a single transition still has a well-defined mean, so only paths
# without any transition are mapped to NaN.
df_paths["average_semantic_shift"] = df_paths["path_semantic_shift"].apply(
    lambda path: statistics.mean(path) if len(path) > 0 else np.nan
)

# Average shift per path for each group, without the NaN entries (paths with no
# transition, or with a transition whose similarity could not be looked up).
average_semantic_shift_pf = df_paths.loc[df_paths["finished"] == True, "average_semantic_shift"].dropna().tolist()
average_semantic_shift_uf = df_paths.loc[df_paths["finished"] == False, "average_semantic_shift"].dropna().tolist()

print(f"Semantic shift (finished paths): mean = {statistics.mean(average_semantic_shift_pf):.4f} median = {statistics.median(average_semantic_shift_pf):.4f}")
print(f"Semantic shift (unfinished paths): mean = {statistics.mean(average_semantic_shift_uf):.4f} median = {statistics.median(average_semantic_shift_uf):.4f}")

# Two-sample t-test on the per-path averages; the printed conclusion follows the
# p-value instead of asserting significance unconditionally.
pval = ttest_ind(average_semantic_shift_pf, average_semantic_shift_uf).pvalue
significance_level = 0.05
verdict = "is" if pval < significance_level else "is not"
print(f"P-value = {pval:.3g}: the semantic shift between finished and unfinished paths {verdict} significantly different at the {significance_level} level")

sns.histplot(
    data=df_paths,
    x="average_semantic_shift",
    hue="finished",
    multiple="layer",
    binwidth=0.02,
    binrange=[0,1],
    stat="proportion",
    common_norm = False,
).set(title = "Semantic shift of a path for finished vs unfinished paths", xlabel = "Average semantic shift of path")
plt.show()

We can see that semantic shift along finished paths is smaller than for unfinished paths, which means finished paths tend to follow more closely related articles than unfinished paths.

In [None]:
def CreateSemanticSimilarityList(row):
    """Return the similarity of every article on a path to the path's target.

    Reads ``row.resolved_path_list_name`` as the visited articles and
    ``row.target_link`` as the target; the result has one entry per visited
    article, in path order.
    """
    return [
        SemanticSimilarity(visited, row.target_link)
        for visited in row.resolved_path_list_name
    ]

# Similarity to the target for every article on each path, plus its mean over the path
# (NaN when the path has no article).
df_paths["path_similarity"] = df_paths.apply(CreateSemanticSimilarityList, axis = 1)
df_paths["average_similarity"] = df_paths["path_similarity"].apply(
    lambda similarity_list: np.nan if len(similarity_list) == 0 else statistics.mean(similarity_list)
)


In [None]:
# Longest path length (in articles) shown in the figure.
max_length = 5


def _mean_similarity_by_position(paths, finished, n_click, drop_last, pad_to):
    """Average similarity-to-target at each position over a group of paths.

    Selects the rows of ``paths`` whose "finished" equals ``finished`` and whose
    "n_click" equals ``n_click``, optionally drops the last entry of every
    "path_similarity" list, and takes the NaN-ignoring mean position by position.

    Returns:
        A list of length ``pad_to``: the per-position means followed by NaN
        padding. All NaN when no path matches the selection.
    """
    selected = paths.loc[
        (paths["finished"] == finished) & (paths["n_click"] == n_click), "path_similarity"
    ].tolist()
    if drop_last:
        selected = [similarity_list[:-1] for similarity_list in selected]
    if not selected:
        # Nothing to average for this group: keep the row empty instead of failing.
        return [np.nan] * pad_to
    profile = np.nanmean(np.array(selected), axis=0).tolist()
    return profile + [np.nan] * (pad_to - len(profile))


path_lengths = list(range(2, max_length + 1))
positions = list(range(max_length))

# One row per path length, one column per position along the path.
df_uf_similarity = pd.DataFrame(index = path_lengths, columns = positions)
df_pf_similarity = pd.DataFrame(index = path_lengths, columns = positions)

for length in path_lengths:
    df_uf_similarity.loc[length] = _mean_similarity_by_position(
        df_paths, finished=False, n_click=length, drop_last=False, pad_to=max_length
    )
    # Finished paths are selected one click longer and their last node is removed:
    # it always has maximal similarity because the target was reached.
    df_pf_similarity.loc[length] = _mean_similarity_by_position(
        df_paths, finished=True, n_click=length + 1, drop_last=True, pad_to=max_length
    )


fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5), sharey=True)
fig.supxlabel("article number")
fig.supylabel("similarity to target article")

panels = [
    (axs[0], df_uf_similarity, "unfinished paths"),
    (axs[1], df_pf_similarity, "finished paths (final article ommited)"),
]
for ax, profiles, title in panels:
    # One line per path length; the frame index is the path length used as the label.
    for path_length, profile in profiles.iterrows():
        ax.plot(positions, profile, marker = "o", label = path_length)
    ax.set_title(title)
    ax.legend(title="path length", loc = "upper left")
    ax.xaxis.set_major_locator(plt.MultipleLocator(1))

fig.tight_layout()

We see a clear difference between the finished and unfinished paths: unfinished paths stagnate at around 0.3 semantic similarity to the target before being abandoned, while finished paths reach about 0.5 similarity to the target on the article just before the target.
We can also observe that paths which finish in fewer articles start, on average, at an article that is semantically closer to the target.