In [None]:
import pandas as pd
import os
import numpy as np
import statistics
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
import ast

# NOTE(review): machine-specific absolute path. Every relative "data/..." path used
# further down is resolved against this directory, so the notebook only runs as-is
# on a machine where this folder exists.
os.chdir("/Users/rapha/EPFL/ADA/ada-2024-project-theadacuates")

# Load data

In [None]:
df_paths = pd.read_csv("data/output/base_data/all_articles_processed.csv")

# `path` is a single ';'-separated string per row; split it into a list of steps.
df_paths['path'] = df_paths['path'].str.split(';')

# These columns are parsed with ast.literal_eval, i.e. they are expected to hold
# Python-literal strings (presumably lists such as "['a', 'b']") in the CSV.
cols_to_convert = ["path_list", "path_list_id", "resolved_path_list_id", "resolved_path_list_name"]

for col in cols_to_convert:
    df_paths[col] = df_paths[col].apply(ast.literal_eval)

# Split into finished / unfinished paths only AFTER the list columns are parsed,
# so the subsets carry real lists instead of the raw strings. Explicit copies keep
# later edits to the subsets from being mistaken for edits to df_paths.
df_pf = df_paths.loc[df_paths["finished"] == True].copy()
df_uf = df_paths.loc[df_paths["finished"] == False].copy()

df_names = pd.read_csv("data/output/base_data/articles_processed.csv")

df_paths.head(2)

Difficulty rating:

In [None]:
# Bar plot of the number of back clicks per difficulty rating (finished paths only)
axs = sns.barplot(x='rating', y='n_back', data=df_pf)
axs.set(
    title='Number of back clicks for difficulty rating',
    xlabel='Difficulty rating',
    ylabel='Number of back clicks',
)
axs.grid(False)
plt.show()

# How far from the target was the player when clicking back? (BFS shortest-path distance)

In [None]:
# Distance matrix read from CSV; rows are labelled by the "article_name" column,
# columns by the header row.
bfs_matrix_path = "data/output/base_data/bfs_matrix.csv"
bfs_matrix = pd.read_csv(
    bfs_matrix_path,
    index_col="article_name",
    header=0,
    sep=',',
    comment='#',
)

def ShortestPath(article_from, article_to, matrix=None):
    """Return the shortest-path distance between two articles.

    The value is read from column ``article_from``, row ``article_to`` of the
    distance matrix.

    Parameters
    ----------
    article_from : label of the column to read (source article name).
    article_to : label of the row to read (target article name).
    matrix : optional DataFrame to look the distance up in. Defaults to the
        notebook-level ``bfs_matrix``.

    Returns
    -------
    The matrix entry, or ``np.nan`` when either label cannot be looked up
    (a message is printed in that case).
    """
    if matrix is None:
        matrix = bfs_matrix
    try:
        return matrix[article_from][article_to]
    # Only lookup failures are treated as "unknown distance": a missing label
    # (KeyError/IndexError) or an unusable key type (TypeError). Anything else,
    # e.g. KeyboardInterrupt, must propagate instead of being silently swallowed.
    except (LookupError, TypeError):
        print("Cannot find from", article_from, "to", article_to)
        return np.nan

In [None]:
def CreateDistanceList(row):
    """Distance in clicks to the target for every article of the row's path.

    Reads ``row.resolved_path_list_name`` and ``row.target_link`` and returns one
    ``ShortestPath`` result per article, in path order.
    """
    return [
        ShortestPath(visited, row.target_link)
        for visited in row.resolved_path_list_name
    ]

# One list of distances-to-target per path (row-wise application)
df_paths["path_distance"] = df_paths.apply(CreateDistanceList, axis=1)

In [None]:
def CreateBackClickDistanceList(row):
    """Distance to the target of each article the player backed out of.

    Walks ``row.path_list`` pairwise: whenever a non-"<" entry is immediately
    followed by "<", the distance from that entry to ``row.target_link`` is
    recorded. In a chain of consecutive "<" only the first one counts, so the
    result has one value per back-click sequence, in path order.
    """
    steps = row.path_list
    return [
        ShortestPath(current, row.target_link)
        for current, following in zip(steps, steps[1:])
        if following == "<" and current != "<"
    ]

# One list of back-click distances per path (row-wise application)
df_paths["back_click_distance"] = df_paths.apply(CreateBackClickDistanceList, axis=1)


In [None]:
# Number of back-click sequences per path: consecutive back clicks count as one,
# so this equals the length of the row's back_click_distance list.
# Computed column-wise; a row-wise DataFrame.apply would build a full row object
# for every path just to read a single field.
df_paths["back_click_sequences"] = df_paths["back_click_distance"].map(len)

In [None]:
# Significance level used to interpret the t-test below
ALPHA = 0.05

# Flatten the per-path back-click distances into one list per outcome.
# The `x > 0` filter drops zero distances and also NaN (NaN > 0 is False).
distances_uf = [
    x
    for xs in df_paths.loc[df_paths["finished"] == False, "back_click_distance"]
    for x in xs
    if x > 0
]
distances_pf = [
    x
    for xs in df_paths.loc[df_paths["finished"] == True, "back_click_distance"]
    for x in xs
    if x > 0
]

# get mean of those distances
print("Average distance from back click to target(for unfinished paths): {:.3f}".format(statistics.mean(distances_uf)))
print("Average distance from back click to target(for finished paths): {:.3f}".format(statistics.mean(distances_pf)))

# NOTE(review): this is the default (equal-variance) t-test, whereas the readability
# comparison later in the notebook uses equal_var=False -- confirm which is intended.
pval = ttest_ind(distances_pf, distances_uf).pvalue
# Report significance based on the actual p-value instead of asserting it.
verdict = "significant" if pval < ALPHA else "not significant"
print("P-value = {} so result is {} (alpha = {})".format(pval, verdict, ALPHA))

# Replace 0 entries with NaN so they are not counted in the mean
bfs_matrix_nan = bfs_matrix.where(bfs_matrix != 0)
matrix_mean = np.nanmean(bfs_matrix_nan.to_numpy())
print("Average distance from any article to any other article: {:.3f}".format(matrix_mean))

In [None]:
# One sample of back-click distances per outcome; the keys become the legend labels.
# (A third entry for entire paths can be added from df_paths["path_distance"],
# flattened and filtered with x > 0 like the two samples below.)
dfs = {"back click unfinished": distances_uf, "back click finished": distances_pf}

# Build one small frame per sample and concatenate once at the end,
# rather than growing a DataFrame inside the loop.
freq_frames = []
for origin, distances in dfs.items():
    labels, counts = np.unique(distances, return_counts=True)
    freq_frames.append(
        pd.DataFrame(
            {
                "distance": labels,
                # relative frequency within this sample, so samples of different size are comparable
                "frequency": counts / counts.sum(),
                "origin": origin,
            }
        )
    )
df_combined_freq = pd.concat(freq_frames, ignore_index=True)

axs = sns.barplot(data=df_combined_freq, x='distance', y='frequency', hue="origin", errorbar=None)
axs.set_title('Distance(bfs) to target')
axs.set_xlabel('Distance in clicks')
axs.set_ylabel("Frequency")
plt.grid(False)
plt.show()

In [None]:
# NOTE(review): this import sits mid-notebook while all other imports are in the
# first cell; a reader checking dependencies at the top will miss it.
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; used below to embed the article names
model = SentenceTransformer("all-MiniLM-L6-v2")

# Similarity measure for model.similarity() calls further down
model.similarity_fn_name = "cosine" # valid options are "cosine", "dot", "euclidean", and "manhattan"

In [None]:
# Cache file for the article similarity matrix: computed and written only when the
# file does not exist yet, and always read back from this path afterwards.
similarity_matrix_path = "data/output/semantic_similarity_data/similarity_matrix.csv"

def clamp(n, min, max):
    """Limit ``n`` to the range [min, max].

    Returns ``min`` when ``n < min``, otherwise ``max`` when ``n > max``,
    otherwise ``n`` itself. The lower bound is checked first.
    """
    # The parameter names shadow the builtins min()/max(), so plain comparisons are used.
    return min if n < min else (max if n > max else n)

# Build the similarity matrix only when no cached copy exists; otherwise reuse the CSV.
if not os.path.exists(similarity_matrix_path):
    # Create the output folder up front so the expensive computation below
    # cannot be lost to a missing directory when saving.
    cache_dir = os.path.dirname(similarity_matrix_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    embeddings = model.encode(df_names["article_name"])
    num_articles = len(embeddings)
    matrix = np.zeros(shape=(num_articles, num_articles))

    for i in range(num_articles):
        # Similarity of article i to articles 0..i (lower triangle incl. diagonal)
        similarities = model.similarity([embeddings[i]], embeddings[0:(i+1)])[0]
        # Limit to [0, 1] for the whole row at once instead of element by element
        row = np.clip(similarities.tolist(), 0, 1)
        matrix[i, :i + 1] = row
        # Mirror into the upper triangle so the matrix is exactly symmetric
        matrix[:i + 1, i] = row

    matrix = pd.DataFrame(matrix, index = df_names["article_name"], columns=df_names["article_name"])
    matrix.to_csv(similarity_matrix_path)

# Always load from the CSV so fresh and cached runs use identical data
similarity_matrix = pd.read_csv(similarity_matrix_path, sep=',', header=0, comment='#', index_col="article_name")

def SemanticSimilarity(article_from, article_to, matrix=None):
    """Return the stored similarity between two articles.

    The value is read from column ``article_from``, row ``article_to`` of the
    similarity matrix.

    Parameters
    ----------
    article_from : label of the column to read.
    article_to : label of the row to read.
    matrix : optional DataFrame to look the value up in. Defaults to the
        notebook-level ``similarity_matrix``.

    Returns
    -------
    The matrix entry, or ``np.nan`` when either label cannot be looked up
    (a message is printed in that case).
    """
    if matrix is None:
        matrix = similarity_matrix
    try:
        return matrix[article_from][article_to]
    # Only lookup failures mean "unknown similarity": a missing label
    # (KeyError/IndexError) or an unusable key type (TypeError). Anything else,
    # e.g. KeyboardInterrupt, must propagate instead of being silently swallowed.
    except (LookupError, TypeError):
        print("Cannot find simlarity between", article_from, "and", article_to)
        return np.nan

In [None]:
def CreateBackClickSemanticSimilarityList(row):
    """Similarity to the target of each article the player backed out of.

    Scans ``row.path_list`` pairwise: whenever a non-"<" entry is immediately
    followed by "<", its ``SemanticSimilarity`` to ``row.target_link`` is
    recorded. Only the first "<" of a consecutive chain counts, so there is one
    value per back-click sequence, in path order.
    """
    steps = row.path_list
    return [
        SemanticSimilarity(current, row.target_link)
        for current, following in zip(steps, steps[1:])
        if following == "<" and current != "<"
    ]


# One list of back-click similarities per path (row-wise application)
df_paths["back_click_similarity"] = df_paths.apply(CreateBackClickSemanticSimilarityList, axis=1)


Does semantic similarity to target evolve with each back click?

In [None]:

# Only paths with at least this many back-click sequences are compared,
# and only their first `num_backclicks` values are used.
num_backclicks = 10

similarities = df_paths.loc[df_paths["back_click_sequences"] >= num_backclicks]["back_click_similarity"].to_list()
similarities = [x[:num_backclicks] for x in similarities]
# Shape (n_paths, num_backclicks); the reshape keeps that shape even when no path qualifies
similarities = np.array(similarities, dtype=float).reshape(-1, num_backclicks)

similarities_df = pd.DataFrame()
# SemanticSimilarity returns NaN for articles it cannot find. Use the NaN-aware
# statistics so a single unknown article does not blank out a whole back-click position.
similarities_df["mean_similarity"] = np.nanmean(similarities, axis=0)
similarities_df["std_similarity"] = np.nanstd(similarities, axis=0)


# plt.errorbar(similarities_df.index, similarities_df.mean_similarity, yerr = similarities_df.std_similarity, capsize= 3)
plt.plot(similarities_df.index, similarities_df.mean_similarity)
ax = plt.gca()
ax.set_ylim([0, 0.5])
plt.xlabel('Back click')
plt.ylabel('Semantic similarity')

plt.show()

## When do back clicks happen?

Candidate factors to examine: entropy, connectivity, position of the article in the path (the n-th article, compared to the average finished path length), and semantic similarity.

In [None]:
# Index of the first "<" in every path that contains a back click, averaged over those paths
paths_with_back_click = df_paths.loc[df_paths["n_back"] >= 1, "path_list"]
first_back_click_pos = paths_with_back_click.apply(lambda steps: steps.index("<")).mean()

# Mean n_click over finished paths
target_pos_mean = df_paths.loc[df_paths["finished"] == True, "n_click"].mean()

print(
    "Mean position of target: %.2fth article, Mean first back click position: %.2fth article"
    % (target_pos_mean, first_back_click_pos)
)

In [None]:
# Per-article metrics table; later cells read its "article_name" and
# "Readability Consensus" columns.
df_fatigue = pd.read_csv(
    filepath_or_buffer="data/output/fatigue_metric_data/articles_fatigues_long.csv"
)

df_fatigue.head(2)

In [None]:
# Readability score of every article in each path.
# NOTE(review): ids are used as row POSITIONS in df_fatigue (.iloc) -- this assumes
# df_fatigue is ordered by article id; confirm.
df_paths['path_Readability Consensus'] = df_paths['resolved_path_list_id'].map(
    lambda article_ids: df_fatigue["Readability Consensus"].iloc[article_ids].tolist()
)
# Average readability over each path
df_paths['mean_path_Readability Consensus'] = df_paths['path_Readability Consensus'].map(
    lambda scores: np.mean(scores)
)

df_paths.head(2)

In [None]:
# Significance level used to interpret the t-test below
ALPHA = 0.05

# Article visited immediately before the first "<" of each path that has a back click
first_back_click_articles = (
    df_paths.loc[df_paths["n_back"] >= 1, "path_list"]
    .apply(lambda path: path[path.index("<") - 1])
    .tolist()
)

# Readability of those articles, flattened into one list. An article with no
# matching row in df_fatigue contributes nothing; one with several rows contributes all.
back_click_readability = [
    score
    for article in first_back_click_articles
    for score in df_fatigue.loc[df_fatigue["article_name"] == article]["Readability Consensus"].tolist()
]
back_click_readability_mean = np.mean(back_click_readability)

# Readability of all articles in all paths, excluding the last article of each path
all_readability = [
    x
    for xs in df_paths["path_Readability Consensus"].tolist()
    for x in xs[:-1]
]
all_readability_mean = np.mean(all_readability)

# Welch's t-test (unequal variances). Compare against the same population whose
# mean is printed below (articles in paths); testing against every row of
# df_fatigue would give a p-value for a different comparison than the one reported.
pvalue = ttest_ind(back_click_readability, all_readability, equal_var=False).pvalue

print("Mean readbility of articles in path: %.2f  Mean readability of back clicked articles: %.2f" % (all_readability_mean, back_click_readability_mean))
# Report significance based on the actual p-value instead of asserting it.
verdict = "significant" if pvalue < ALPHA else "not significant"
print("p-value: {} so result is {} (alpha = {})".format(pvalue, verdict, ALPHA))