# ADA CAPI Notebook for Data Exploration

In [None]:
import pandas as pd
import networkx as nx
import numpy as np
import os
from scipy import stats 

# Helper functions from utils folder
from utils.analysis import t_test_article_metrics, visualize_article_connections_per_category
from utils.preprocessing import get_all_links, merge_articles_categories

# Formatting libraries
import urllib
import datetime as datetime

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Imports to perform article analysis
import textstat
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

nltk.download('punkt') # Punkt tokenizer
nltk.download('stopwords') # Commong stopwords

# Load config and extract variables
import config
DATA_PATH = config.PATH_TO_DATA
PATH_GRAPGH_FOLDER = "wikispeedia_paths-and-graph"
ARTICLE_FOLDER = "plaintext_articles"
GENERATED_METRICS = "generated_data"

### Loading and Preparing the Data
Load and clean up the paths, load into weighted graph structure etc.

#### Load Tabular Data

In [None]:
# Load every tabular dataset (the plain-text Wikipedia articles are handled separately).
# `skiprows` differs per file - presumably it skips each file's comment header; confirm against the raw files.
graph_dir = os.path.join(DATA_PATH, PATH_GRAPGH_FOLDER)

finished_paths = pd.read_csv(
    os.path.join(graph_dir, "paths_finished.tsv"),
    sep="\t",
    skiprows=15,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
)
unfinished_paths = pd.read_csv(
    os.path.join(graph_dir, "paths_unfinished.tsv"),
    sep="\t",
    skiprows=16,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"],
)
edges = pd.read_csv(
    os.path.join(graph_dir, "links.tsv"),
    sep="\t",
    skiprows=15,
    names=["start", "end"],
    encoding="utf-8",
)
articles = pd.read_csv(
    os.path.join(graph_dir, "articles.tsv"),
    sep="\t",
    skiprows=12,
    names=["article"],
    encoding="utf-8",
)
categories = pd.read_csv(
    os.path.join(graph_dir, "categories.tsv"),
    sep="\t",
    skiprows=13,
    names=["article", "category"],
    encoding="utf-8",
)
# Fixed-width parse (one character per field) of the distance matrix into unsigned bytes
shortest_paths = np.genfromtxt(
    os.path.join(graph_dir, "shortest-path-distance-matrix.txt"),
    delimiter=1,
    dtype=np.uint8,
)

In [None]:
display(finished_paths.info())
display(finished_paths.head())

In [None]:
display(unfinished_paths.info())
display(unfinished_paths.head())

In [None]:
display(edges.info())
display(edges.head())

In [None]:
display(articles.info())
display(articles.head())

In [None]:
# Column overview followed by the first rows, mirroring how the other tables are inspected
display(categories.info())
display(categories.head())

In [None]:
# shortest paths corresponds to numpy matrix, where 255 signifies no path (underscore in the .txt file), the diagonal is zero
# the row index is the zero-based index corresponding to the index in the articles dataframe, same for the columns (target article)
print((np.diag(shortest_paths)==0).all())
shortest_paths

#### Clean Tabular Data

In [None]:
# Decode the percent-encoded article names in both endpoints of the edge list,
# showing the table before and after for comparison
display(edges.head())
for endpoint in ("start", "end"):
    edges[endpoint] = edges[endpoint].map(urllib.parse.unquote)
display(edges.head())

In [None]:
# Convert the Unix timestamps into datetime objects.
# `datetime.fromtimestamp` without a tz argument yields naive local time, so the
# values depend on the time zone of the machine running the notebook.
display(unfinished_paths.head())  # state before the conversion (same frame as shown afterwards)
finished_paths["datetime"] = finished_paths.timestamp.apply(datetime.datetime.fromtimestamp)
unfinished_paths["datetime"] = unfinished_paths.timestamp.apply(datetime.datetime.fromtimestamp)
display(unfinished_paths.head())

In [None]:
# Decode percent-encoded article names (before/after shown for comparison)
display(articles.head())
articles["article"] = articles["article"].map(urllib.parse.unquote)
display(articles.head())

In [None]:
# Decode percent-encoded article names in the category table (before/after shown for comparison)
display(categories.head())
categories["article"] = categories["article"].map(urllib.parse.unquote)
display(categories.head())

In [None]:
# The broad category is the second dot-separated component of the full category label
display(categories.head())
categories["broad_category"] = [label.split(".")[1] for label in categories["category"]]
display(categories.head())

In [None]:
# Attach categories to every article; the left join keeps articles that have no category row
articles_categories = articles.merge(categories, on="article", how="left")
display(articles_categories.head())
# 6 articles without category!
missing_category = articles_categories["category"].isna()
print("Merge introduced {} NAs in category columns:".format(missing_category.sum()))
articles_categories[missing_category]

In [None]:
# Turn each ";"-separated path string into a list of decoded path entries
def _decode_path(raw_path):
    """Split a raw path string on ';' and percent-decode every entry."""
    return [urllib.parse.unquote(step) for step in raw_path.split(";")]


for paths_frame in (finished_paths, unfinished_paths):
    paths_frame["path"] = paths_frame["path"].apply(_decode_path)

In [None]:
# Record the first and last entry of each finished path as its start and target
finished_paths["start"] = finished_paths["path"].map(lambda steps: steps[0])
finished_paths["target"] = finished_paths["path"].map(lambda steps: steps[-1])

# Unfinished games already carry a "target" column, which only needs decoding
unfinished_paths["start"] = unfinished_paths["path"].map(lambda steps: steps[0])
unfinished_paths["target"] = unfinished_paths["target"].map(urllib.parse.unquote)

In [None]:
# get all finished links
# Built with the project helper `get_all_links`; judging by the sort below,
# the result carries a "weight" column. The sorted view is only displayed, not stored.
finished_links = get_all_links(finished_paths)
finished_links.sort_values(by="weight", ascending=False) # TODO: what is up with these <<< signs? NOTE(review): "<" presumably marks a back-click in the path format - confirm against the dataset README

In [None]:
# get all unfinished links
# Built with the project helper `get_all_links`; judging by the sort below,
# the result carries a "weight" column. The sorted view is only displayed, not stored.
unfinished_links = get_all_links(unfinished_paths)
unfinished_links.sort_values(by="weight", ascending=False) # TODO: what is up with these <<< signs? NOTE(review): "<" presumably marks a back-click in the path format - confirm against the dataset README

In [None]:
# create networkx graph from finished paths
# NOTE(review): without `create_using`, from_pandas_edgelist builds an undirected
# graph, so link direction is dropped here - confirm this is intended.
finished_graph = nx.from_pandas_edgelist(finished_links,source="source", target="target", edge_attr="weight")
# hist[d] is the number of nodes with degree d
hist = nx.degree_histogram(finished_graph)
plt.bar(range(len(hist)), hist)
# Summarises the per-degree node counts (the histogram bins), not the node degrees themselves
pd.Series(hist).describe()

In [None]:
# create networkx graph from unfinished paths
# NOTE(review): without `create_using`, from_pandas_edgelist builds an undirected
# graph, so link direction is dropped here - confirm this is intended.
unfinished_graph = nx.from_pandas_edgelist(unfinished_links,source="source", target="target", edge_attr="weight")
# hist[d] is the number of nodes with degree d
hist = nx.degree_histogram(unfinished_graph)
plt.bar(range(len(hist)), hist)
# Summarises the per-degree node counts (the histogram bins), not the node degrees themselves
pd.Series(hist).describe()

#### Load Article data

##### Metrics to be extracted from articles

* Total word count: To understand the length of the article.
* Non stopword frequency: To identify words that contribute to the content's meaning.
* Stopword frequency: To identify common words that may not contribute to the content's meaning.
* Average word length: To assess the complexity of the language used.
* Average sentence length: Longer or more complex sentences (based on characters) may contribute to frustration.
* Number of paragraphs: To see if the article's structure plays a role in people giving up.
* Keyword frequency: To identify the most common keywords to understand the article's focus.
* Readability: Ease of reading the article (metric: Flesch Reading Ease Score) Link: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

In [None]:
def proprocess_article(article_text):
    """Normalise raw article text before tokenisation.

    The text is lower-cased, and every newline that is followed by three
    spaces is collapsed into a single space, because the articles are not
    stored as continuous sentences.
    """
    return article_text.lower().replace("\n   ", " ")

def calculate_article_metrics(article_text):
    """Compute length, vocabulary and readability metrics for one article.

    Args:
        article_text: Raw text of the article.

    Returns:
        dict with the keys
        ``word_count`` (number of tokens, punctuation tokens included),
        ``non_stopword_count`` (``word_count`` minus ``stopword_count``, so it
        also includes non-alphabetic tokens),
        ``stopword_count`` (alphabetic tokens that are English stopwords),
        ``avg_word_length`` (mean token length in characters),
        ``avg_sent_length`` (mean sentence length in characters),
        ``paragraph_count`` (newlines remaining after preprocessing, plus one),
        ``common_words`` (the 10 most frequent alphabetic non-stopwords with counts),
        ``readability_score`` (Flesch Reading Ease as computed by textstat).

    Empty text yields averages of 0.0 instead of raising ZeroDivisionError.
    """
    preprocessed_text = proprocess_article(article_text)

    words = word_tokenize(preprocessed_text)
    sentences = sent_tokenize(preprocessed_text)

    # Calculate total word count
    total_word_count = len(words)

    # Split alphabetic tokens into stopwords and content words in a single pass.
    # The text was lower-cased during preprocessing, so no further case folding is needed.
    stop_words = set(stopwords.words("english"))
    stopwords_count = 0
    unique_words = []
    for word in words:
        if not word.isalpha():
            continue
        if word in stop_words:
            stopwords_count += 1
        else:
            unique_words.append(word)

    # Calculate average word length (guarded against articles without any token)
    if total_word_count:
        average_word_length = sum(len(word) for word in words) / total_word_count
    else:
        average_word_length = 0.0

    # Calculate average sentence length (guarded against articles without any sentence)
    if sentences:
        average_sentence_length = sum(len(sentence) for sentence in sentences) / len(sentences)
    else:
        average_sentence_length = 0.0

    # Calculate number of paragraphs (assume every new line \n is paragraph)
    paragraphs_count = preprocessed_text.count('\n') + 1 # Count last paragraph

    # Calculate keyword frequency
    word_freq = nltk.FreqDist(unique_words)
    most_common_words = word_freq.most_common(10)  # Parameter to adjust

    # Calculate readability (Flesch Reading Ease Score) - 100: Easy to read, 0: Very confusing
    readability = textstat.flesch_reading_ease(preprocessed_text)

    return {
        "word_count": total_word_count,
        "non_stopword_count": total_word_count - stopwords_count,
        "stopword_count": stopwords_count,
        "avg_word_length": average_word_length,
        "avg_sent_length": average_sentence_length,
        "paragraph_count": paragraphs_count,
        "common_words": most_common_words,
        "readability_score": readability,
    }

##### Loading the article data
To reduce runtime, we compute the article metrics once and then read the generated csv file.

In [None]:
"""
folder_path = os.path.join(DATA_PATH, ARTICLE_FOLDER)
if os.path.exists(folder_path) and os.path.isdir(folder_path):

  article_metrics = pd.DataFrame(columns=["article", "word_count", "non_stopword_count", "stopword_count", "avg_word_length", "avg_sent_length", "paragraph_count", "common_words", "readability_score"])

  for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    
    if os.path.isfile(file_path):
      root, extension = os.path.splitext(file_name)
      readable_file_name = urllib.parse.unquote(root)
      
      with open(file_path, "r", encoding="utf-8") as article:
        metrics = calculate_article_metrics(article.read())

        metrics["article"] = readable_file_name
        article_metrics.loc[len(article_metrics)] = metrics
else:
  raise FileNotFoundError("The specified folder path does not exist or is not a directory.")

article_metrics.to_csv(os.path.join(GENERATED_METRICS, "article_metrics.csv"), index=False)
"""

In [None]:
article_metrics = pd.read_csv(os.path.join(GENERATED_METRICS, "article_metrics.csv"))

In [None]:
display(article_metrics.info())
display(article_metrics.head())

### General Data Exploration
Explore the distribution of all relevant variables, analyze and potentially fill missing values, and compute simple summary statistics.

#### Explore Path lengths across finished and unfinished paths

In [None]:
unfinished_paths

In [None]:
# Path length = number of entries in the path list, for unfinished and finished games
for paths_frame in (unfinished_paths, finished_paths):
    paths_frame["path_length"] = paths_frame["path"].map(len)

print("Finished Paths: Length")
finished_lengths = finished_paths["path_length"]
display(finished_lengths.describe())
display(finished_lengths.value_counts())

print("Unfinished Paths: Length")
unfinished_lengths = unfinished_paths["path_length"]
display(unfinished_lengths.describe())
unfinished_lengths.value_counts()


In [None]:
# make plot of path lengths
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 4), sharey=True)

sns.histplot(data=finished_paths, x="path_length", ax=axes[0])
axes[0].set_title("Finished Paths")
# Unfinished games are coloured by the value of their `type` column
sns.histplot(data=unfinished_paths, x="path_length", ax=axes[1], hue="type")
axes[1].set_title("Unfinished Paths")

# --> highly skewed and many unlikely outcomes (e.g. unfinished paths path length = 1, did they really give up? or not play at all?)

In [None]:

# Path-length histograms restricted to lengths below `threshold`,
# with the unfinished games split by the value of their `type` column
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 4), sharey=True)
threshold = 30

sns.histplot(x=finished_paths.path_length[finished_paths.path_length < threshold], ax=axes[0], discrete=True)
axes[0].set_title("Finished Paths")

# One panel per unfinished type; the same filter and plot call is applied to each
for panel_ax, end_type in zip(axes[1:], ("restart", "timeout")):
    unfinished_clean = unfinished_paths[
        (unfinished_paths.path_length < threshold) & (unfinished_paths["type"] == end_type)
    ]
    sns.histplot(data=unfinished_clean, x="path_length", ax=panel_ax, discrete=True)
    panel_ax.set_title(f"Unfinished Paths - {end_type.capitalize()}")

#### Explore categories in the paths

In [None]:
# Seeing which categories are most represented in articles
count_articles = categories.groupby("broad_category").size()

print("Below shows how many articles each of the broad categories are represented by")
display(count_articles)

In [None]:
# Create dictionaries for easy discovery of what categories an article belongs to.
# Each article maps to the list of its (broad) categories in table order; an article
# that appears in several rows gets one list entry per row.
article_to_category = {}
article_to_broad_category = {}
# Iterate the columns directly rather than calling `iloc` several times per row
for art_name, full_cat, broad_cat in zip(
    categories["article"], categories["category"], categories["broad_category"]
):
    article_to_category.setdefault(art_name, []).append(full_cat)
    article_to_broad_category.setdefault(art_name, []).append(broad_cat)

In [None]:
# Count how many times each category has occurred as a target in the finished and unfinished paths
# NOTE THAT SOME ARTICLES ARE REPRESENTED BY MULTIPLE CATEGORIES AND ARE COUNTED TWICE

def _flatten_target_categories(targets):
    """Return the broad categories of all targets with a known category as one flat list."""
    return [
        broad_category
        for target_article in targets
        if target_article in article_to_broad_category
        for broad_category in article_to_broad_category[target_article]
    ]


all_target_broad_categories_f = _flatten_target_categories(finished_paths["target"])
count_cats_finished_target = Counter(all_target_broad_categories_f)
# Alphabetical order so both bar series share the same category ordering
sorted_cats_f = {cat: count_cats_finished_target[cat] for cat in sorted(count_cats_finished_target)}
#display(sorted_cats_f)

all_target_broad_categories_u = _flatten_target_categories(unfinished_paths["target"])
count_cats_unfinished_target = Counter(all_target_broad_categories_u)
sorted_cats_u = {cat: count_cats_unfinished_target[cat] for cat in sorted(count_cats_unfinished_target)}
#display(sorted_cats_u)

# Both series are drawn on the same rows, so the unfinished bars overlay the finished ones
ax = plt.barh(list(sorted_cats_f.keys()), list(sorted_cats_f.values()), label="Finished paths")
ax2 = plt.barh(list(sorted_cats_u.keys()), list(sorted_cats_u.values()), label="Unfinished paths")
plt.xlabel("Count")
plt.title("Occurrences of categories as targets")
plt.gca().invert_yaxis()
plt.legend()
plt.show()

In [None]:
# Finished paths: how often each article with broad category "Countries" was the target
country_targets_f = [
    target_article
    for target_article in finished_paths["target"]
    if "Countries" in article_to_broad_category.get(target_article, ())
]
count_countries_finished_target = Counter(country_targets_f)
display(count_countries_finished_target)

In [None]:
# Unfinished paths: how often each article with broad category "Countries" was the target
country_targets_u = [
    target_article
    for target_article in unfinished_paths["target"]
    if "Countries" in article_to_broad_category.get(target_article, ())
]
count_countries_unfinished_target = Counter(country_targets_u)
display(count_countries_unfinished_target)

# There are certainly some trends here. Haiti, Samoa, and the Gaza Strip, for example, are over-represented in the unfinished paths
# when compared to the finished paths.


In [None]:
# In fact we can see that there are some countries that occured as a target more in unfinished paths than in finished paths
count_countries_unfinished_target - count_countries_finished_target

In [None]:
# Share (a fraction between 0 and 1) of each country's target occurrences that fall
# into unfinished and finished paths respectively
total_country_counts = count_countries_unfinished_target + count_countries_finished_target

country_percent_in_unfinished = Counter({
    country: count_countries_unfinished_target[country] / total
    for country, total in total_country_counts.items()
})
country_percent_in_finished = Counter({
    country: count_countries_finished_target[country] / total
    for country, total in total_country_counts.items()
})

In [None]:
country_percent_in_unfinished

In [None]:
country_percent_in_finished

#### Explore subject strength between connected articles

In [None]:
# Visualizing article connections per category for the full link table (`edges`),
# i.e. the hyperlink structure itself rather than any played paths
edge_category = merge_articles_categories(edges, ["start", "end"], articles_categories)
visualize_article_connections_per_category(edge_category, "Article Connections Based on Category (Normalized and Scaled Edges)")

In [None]:
# Visualizing FINISHED PATHS article connections per category
finished_paths_categories = merge_articles_categories(finished_paths, ["start", "target"], articles_categories)
visualize_article_connections_per_category(finished_paths_categories, "Start & Target Article Connections in Finished Path Based on Category (Normalized and Scaled Edges)")

In [None]:
# Visualizing UNFINISHED PATHS article connections per category
unfinished_paths_categories = merge_articles_categories(unfinished_paths, ["start", "target"], articles_categories)
visualize_article_connections_per_category(unfinished_paths_categories, "Start & Target Article Connections in Unfinished Path Based on Category (Normalized and Scaled Edges)")

#### Explore articles metrics

In [None]:
article_metrics["stopword_percentage"] = article_metrics["stopword_count"] / article_metrics["word_count"]
article_metrics["non_stopword_percentage"] = article_metrics["non_stopword_count"] / article_metrics["word_count"]
display(article_metrics.head())

##### Articles metrics per category

In [None]:
# Merge articles with their corresponding categories
article_metrics_with_categories = article_metrics.merge(categories, how="left", on=["article"])
display(article_metrics_with_categories.head())

In [None]:
# One figure row per metric: mean with 95% CI on the left, full distribution on the right
metrics_to_plot = ['word_count', 'stopword_count', 'stopword_percentage', 'non_stopword_count', 'non_stopword_percentage','avg_word_length', 'avg_sent_length', 'paragraph_count','readability_score']
fig, axes = plt.subplots(nrows=len(metrics_to_plot), ncols=2, figsize=(15, 6 * len(metrics_to_plot)))

category_axis = article_metrics_with_categories["broad_category"]
# (plot function, extra keyword arguments, title template) for the left and right panel
panel_specs = (
    (sns.barplot, {"errorbar": ("ci", 95)}, "Mean and CI of {} per Category"),
    (sns.violinplot, {}, "Distribution of {} per Category"),
)

for metric, row_axes in zip(metrics_to_plot, axes):
    metric_values = article_metrics_with_categories[metric]
    for panel_ax, (draw, options, title_template) in zip(row_axes, panel_specs):
        draw(x=category_axis, y=metric_values, ax=panel_ax, **options)
        panel_ax.set_xlabel("Category")
        panel_ax.set_ylabel(metric)
        panel_ax.set_title(title_template.format(metric))
        panel_ax.set_xticklabels(panel_ax.get_xticklabels(), rotation=90)

plt.tight_layout()
plt.show()

##### Articles metrics per finished vs unfinished paths

In [None]:
# Attach article metrics to finished and unfinished paths, once via the start article
# and once via the target article
def _with_article_metrics(paths, endpoint_column):
    """Left-join the article metrics onto `paths` through the given endpoint column."""
    return paths.merge(
        article_metrics_with_categories,
        how="left",
        left_on=endpoint_column,
        right_on="article",
    )


start_finished_article_metrics = _with_article_metrics(finished_paths, "start")
end_finished_article_metrics = _with_article_metrics(finished_paths, "target")
start_unfinished_article_metrics = _with_article_metrics(unfinished_paths, "start")
end_unfinished_article_metrics = _with_article_metrics(unfinished_paths, "target")

In [None]:
# One figure row per metric, comparing start/target articles of finished/unfinished paths:
# mean with 95% CI on the left, full distribution on the right
metrics_to_plot = ["word_count", "stopword_count", "stopword_percentage", "non_stopword_count", "non_stopword_percentage","avg_word_length", "avg_sent_length", "paragraph_count", "readability_score"]
dataframes = [start_finished_article_metrics, start_unfinished_article_metrics, end_finished_article_metrics, end_unfinished_article_metrics]
dataframe_labels = ["Start Finished", "Start Unfinished", "Target Finished", "Target Unfinished"]


fig, axes = plt.subplots(nrows=len(metrics_to_plot), ncols=2, figsize=(15, 6 * len(metrics_to_plot)))

for idx, metric in enumerate(metrics_to_plot):
    # One series per group, in the same order as `dataframe_labels`
    data = [df[metric] for df in dataframes]

    # Bar plot
    ax_bar = axes[idx, 0]
    sns.barplot(data=data, errorbar=("ci", 95), ax=ax_bar)
    ax_bar.set_xlabel("Type of article")
    ax_bar.set_ylabel(metric)
    ax_bar.set_title("Mean and CI of {} per Type of article".format(metric))
    ax_bar.set_xticklabels(dataframe_labels)

    # Violin plot (the x label belongs on the violin axis, not on the bar axis again)
    ax_violin = axes[idx, 1]
    sns.violinplot(data=data, ax=ax_violin)
    ax_violin.set_xlabel("Type of article")
    ax_violin.set_ylabel(metric)
    ax_violin.set_title("Distribution of {} per Type of article".format(metric))
    ax_violin.set_xticklabels(dataframe_labels)

plt.tight_layout()
plt.show()

In [None]:
# Run the project helper `t_test_article_metrics` on every metric in `metrics_to_plot`
# for four pairings of the merged path/metric tables. NOTE(review): how the helper
# reports its results (printing vs. returning) is not visible here - check utils.analysis.

# Start articles: finished vs unfinished games
print("Start Articles (comparing finished vs unfinished):")
t_test_article_metrics(metrics_to_plot, start_finished_article_metrics, start_unfinished_article_metrics)

# Target articles: finished vs unfinished games
print("\nTarget Articles (comparing finished vs unfinished):")
t_test_article_metrics(metrics_to_plot, end_finished_article_metrics, end_unfinished_article_metrics)

# Finished games: start vs target articles
print("\nFinished Articles (comparing start vs target):")
t_test_article_metrics(metrics_to_plot, start_finished_article_metrics, end_finished_article_metrics)

# Unfinished games: start vs target articles
print("\nUnfinished Articles (comparing start vs target):")
t_test_article_metrics(metrics_to_plot, start_unfinished_article_metrics, end_unfinished_article_metrics)

### Analyze links to targets in finished vs unfinished articles

In [None]:
# Number of links in `edges` that point at each game's target article.
# The incoming links are counted once per article and then looked up, instead of
# filtering the whole edge list again for every single path.
incoming_link_counts = edges["end"].value_counts()

# Finished games: the target is the last entry of the path
finished_targets = finished_paths["path"].map(lambda steps: steps[-1])
finished_paths["links_to_target"] = finished_targets.map(incoming_link_counts).fillna(0).astype("int64")
# Unfinished games: the target comes from its own column; targets without any incoming link get 0
unfinished_paths["links_to_target"] = unfinished_paths["target"].map(incoming_link_counts).fillna(0).astype("int64")

In [None]:
# Printing mean number of links to the targets in the finished and unfinished paths.
print(f"The targets that were reached had {finished_paths['links_to_target'].mean()} links on average pointing to them.")
print(f"The targets that were not reached had {unfinished_paths['links_to_target'].mean()} links on average pointing to them.")

In [None]:
# Conducting a t-test
stats.ttest_ind(finished_paths["links_to_target"], unfinished_paths["links_to_target"])

In [None]:
# Creating a boxplot of the trends

# Long-format table with one row per game: the link count plus a finished/unfinished label.
# It is built without reassigning `finished_links` / `unfinished_links`, which hold the
# weighted link tables used to build the graphs earlier in the notebook.
df_links = pd.concat([
    finished_paths[["links_to_target"]].assign(path_type="Finished paths"),
    unfinished_paths[["links_to_target"]].assign(path_type="Unfinished paths"),
])

ax = sns.boxplot(x="path_type", y="links_to_target", data=df_links)
plt.xlabel(" ")
plt.ylim([-5,155])
plt.ylabel("Number of links to target")

The t-test comparing the number of links pointing to the targets of finished and unfinished paths yields a p-value that is reported as 0.0, i.e. too small to be distinguished from zero. We therefore reject, at the 5% significance level, the null hypothesis that the mean number of links pointing to the target is the same for finished and unfinished paths.

### Analyse possible shortest path distances

In [None]:
# Retrieving the shortest possible paths for the finished games

# Map each article name to its row label in `articles` (first occurrence wins); that
# label is used as the row/column position in the distance matrix. Building the lookup
# once replaces two full scans of `articles` per path.
article_row = {}
for row_label, article_name in zip(articles.index, articles["article"]):
    article_row.setdefault(article_name, row_label)

# Raises KeyError if a start or target article is missing from `articles`.
# Values are stored as plain ints, like in the unfinished-path lookup.
finished_paths["shortest_path_length"] = [
    int(shortest_paths[article_row[steps[0]]][article_row[steps[-1]]])
    for steps in finished_paths["path"]
]


NOTE IMPORTANT: THERE ARE TYPOS


Eg. At index 141 in unfinished paths, the target is written as "Long_peper", when it should be "Long_pepper"

Overall, 28 times an issue arises in unfinished paths. Doesn't seem to be an issue in finished paths

In [None]:
# Retrieving the shortest possible paths for the unfinished games

# Map each article name to its row label in `articles` (first occurrence wins); that
# label is used as the row/column position in the distance matrix. Building the lookup
# once replaces two full scans of `articles` per game.
article_row = {}
for row_label, article_name in zip(articles.index, articles["article"]):
    article_row.setdefault(article_name, row_label)

shortest_unfinished = []
for steps, goal in zip(unfinished_paths["path"], unfinished_paths["target"]):
    index_source = article_row.get(steps[0])
    index_target = article_row.get(goal)
    if index_source is None or index_target is None:
        # Start or target name not present in `articles` (e.g. a misspelled target)
        shortest_unfinished.append(None)
    else:
        shortest_unfinished.append(int(shortest_paths[index_source][index_target]))

not_found = shortest_unfinished.count(None)

unfinished_paths["shortest_path_length"] = shortest_unfinished
print(f"{not_found} shortest paths not found")

In [None]:
# Testing to see if there are issues in the finished paths too

# Map each article name to its row label in `articles` (first occurrence wins); that
# label is used as the row/column position in the distance matrix.
article_row = {}
for row_label, article_name in zip(articles.index, articles["article"]):
    article_row.setdefault(article_name, row_label)

shortest_finished = []
for steps in finished_paths["path"]:
    index_source = article_row.get(steps[0])
    index_target = article_row.get(steps[-1])
    if index_source is None or index_target is None:
        # Start or target name not present in `articles`
        shortest_finished.append(None)
    else:
        shortest_finished.append(int(shortest_paths[index_source][index_target]))

not_found2 = shortest_finished.count(None)

print(f"{not_found2} shortest paths not found")

In [None]:
# Counting number of "impossible" paths

print(f"There are {len(finished_paths[finished_paths['shortest_path_length'] == 255])} impossible finished paths.")
print(f"There are {len(unfinished_paths[unfinished_paths['shortest_path_length'] == 255])} impossible unfinished paths.")

In [None]:
# Printing mean shortest possible paths in the finished and unfinished paths.
# 255 is the distance matrix's marker for "no path", not a real distance, so it is
# left out of the mean (the boxplot of these values excludes it as well).
finished_reachable = finished_paths.loc[finished_paths["shortest_path_length"] != 255, "shortest_path_length"]
unfinished_reachable = unfinished_paths.loc[unfinished_paths["shortest_path_length"] != 255, "shortest_path_length"]
print(f"The shortest possible paths were {finished_reachable.mean()} long on average in the finished paths.")
print(f"The shortest possible paths were {unfinished_reachable.mean()} long on average in the unfinished paths.")

In [None]:
# Doing a t test on the shortest path lengths.
# 255 is the distance matrix's marker for "no path", not a real distance, so those games
# are excluded; missing values (targets not found in `articles`) are dropped by nan_policy.
finished_reachable = finished_paths.loc[finished_paths["shortest_path_length"] != 255, "shortest_path_length"]
unfinished_reachable = unfinished_paths.loc[unfinished_paths["shortest_path_length"] != 255, "shortest_path_length"]
stats.ttest_ind(finished_reachable, unfinished_reachable, nan_policy="omit")

In [None]:
# Box plot of the shortest possible path length by outcome, leaving out the value 255

def _lengths_without_255(paths, label):
    """Return the shortest-path lengths of `paths` that differ from 255, tagged with `label`."""
    lengths = paths["shortest_path_length"]
    return lengths[lengths != 255].to_frame().assign(path_type=label)


finished_shortest = _lengths_without_255(finished_paths, "Finished paths")
unfinished_shortest = _lengths_without_255(unfinished_paths, "Unfinished paths")

df_shortest = pd.concat([finished_shortest, unfinished_shortest])

ax = sns.boxplot(x="path_type", y="shortest_path_length", data=df_shortest)
plt.xlabel(" ")
#plt.ylim([-5,155])
plt.ylabel("Shortest path possible from source to target")

The t-test shows that this difference is indeed significant. The targets of unfinished paths are thus inherently more difficult to reach.

This is an interesting situation. The past two analyses show that the targets are more difficult to get to in the unfinished paths, due to the fewer links that point at them and the larger value of the possible shortest path to them.

A challenge for us may be to try to isolate whether the difference between whether a path is finished or not can be fully explained by more objective factors like this, or if there is a human component that we can isolate as well. Eg, are some categories actually more difficult to get to, or do the differences in the target category distributions in the finished and unfinished paths arise because some categories may be more likely to have longer possible shortest paths to them or have fewer links pointing at them?

We should explore these ideas

TODO: Start working on the questions above

### Analyze Networkx graph objects 
degreehistograms, etc. etc.

## Putting everything together
Put all article clicks (before and after giving up, including all successfully finished paths) into a wide-form dataframe to run the statistical analysis:
- merge with article metrics
- merge with article categories
- merge with game information
- merge with player information
- merge with backclick information

In [None]:
# One table with the per-article metrics plus the broad category
article_information = article_metrics.copy()
# The metrics table is keyed by the "article" column (the same column used when merging
# it with `categories` earlier), so that column is the article name.
article_information["article_name"] = article_information["article"]

# merge in categories
# NOTE: an article with several category rows appears once per category after this merge
category_lookup = categories[["article", "broad_category"]].rename(columns={"article": "article_name"})
article_information = pd.merge(article_information, category_lookup, how="left", on="article_name")

keep = ['article_name',
        'word_count',
        'non_stopword_count',
        'stopword_count',
        'avg_word_length',
        'avg_sent_length',
        'paragraph_count',
        'readability_score',
        'broad_category']

article_information = article_information[keep]
article_information

## exploration per actual link 

In [None]:
# create all links (with duplicates) and add a game ID (e.g., index of finished and unfinished path)

# add IDs: "F<index label>" for finished games, "U<index label>" for unfinished games.
# The IDs are built as plain lists so they are assigned by position and stay correct
# even if a frame does not have the default 0..n-1 index.
finished_paths["game_id"] = [f"F{row_label}" for row_label in finished_paths.index]
unfinished_paths["game_id"] = [f"U{row_label}" for row_label in unfinished_paths.index]

def get_all_clicks(df):
    """Expand every game into one row per click (pair of consecutive path entries).

    Args:
        df: DataFrame with a ``path`` column (either a list of entries or a
            ';'-separated string) and a ``game_id`` column.

    Returns:
        DataFrame with the columns ``source``, ``target``, ``num_step`` and
        ``game_id``. ``num_step`` is 2 for the first click of a game and grows
        by one per click. Games whose path has fewer than two entries
        contribute no rows.
    """
    out = []
    for path, game_id in zip(df["path"], df["game_id"]):
        # Paths are lists once they have been parsed; raw ';'-separated strings still work
        links = path.split(";") if isinstance(path, str) else list(path)
        # Consecutive pairs, built with zip so no extra import is needed
        for num_step, (source, target) in enumerate(zip(links, links[1:]), start=2):
            out.append([source, target, num_step, game_id])

    return pd.DataFrame(out, columns=["source", "target", "num_step", "game_id"])



In [None]:
# add 
finished_clicks = get_all_clicks(finished_paths)
finished_clicks["give_up"] = 0

In [None]:
# Keep only unfinished games with more than three path entries, expand them into clicks,
# and flag the last click of every game as the point of giving up
unfinished_paths_sample = unfinished_paths[unfinished_paths["path_length"] > 3]
unfinished_clicks = get_all_clicks(unfinished_paths_sample)
is_last_click = ~unfinished_clicks.duplicated(subset="game_id", keep="last")
unfinished_clicks["give_up"] = is_last_click.astype("int64")
unfinished_clicks

In [None]:
# Combine a sample of finished clicks with all unfinished clicks and attach the metrics
# of the article each click started from.
# The sample is seeded so the downstream model is reproducible, and capped at the
# number of available rows so the cell also works on smaller inputs.
finished_sample = finished_clicks.sample(n=min(10_000, len(finished_clicks)), random_state=42)
all_clicks = pd.concat((finished_sample, unfinished_clicks), axis=0)
all_clicks = all_clicks.merge(article_information, how="left", left_on="source", right_on="article_name")
all_clicks

In [None]:
data = all_clicks.dropna()
print(data.give_up.sum())
data.isna().any()

In [None]:
# Logistic regression of `give_up` on the click number, the article metrics and the
# broad category (as a categorical term), fitted on `data`
import statsmodels.api as sm
import statsmodels.formula.api as smf

mod = smf.logit(formula='give_up ~ num_step + word_count + non_stopword_count + stopword_count + avg_word_length + avg_sent_length + paragraph_count + readability_score + C(broad_category)', data=data)
res = mod.fit(maxiter=100)
print(res.summary())

# Not that interesting: the fit is very bad and it does not converge.
# Also, the only statistically relevant variable is the average sentence length.

## Exploration per Game

In [None]:
# do same analysis on game level - can we predict whether they will give up before the game even started?

# `path` holds lists of entries at this point, so the start/target entries are
# indexed directly rather than split out of a string.
unfinished_paths["start"] = unfinished_paths["path"].apply(lambda steps: steps[0])
unfinished_paths["give_up"] = 1

# add start plus end category
finished_paths["start"] = finished_paths["path"].apply(lambda steps: steps[0])
finished_paths["target"] = finished_paths["path"].apply(lambda steps: steps[-1])
finished_paths["give_up"] = 0


keep = ["start", "target", "give_up", "shortest_path_length", "links_to_target"]
data = pd.concat((finished_paths[keep], unfinished_paths[keep]), axis=0)
# The model formula refers to this predictor as `shortest_path`
data = data.rename(columns={"shortest_path_length": "shortest_path"})
# 255 is the distance matrix's marker for "no path", not a real distance: turn it into a
# missing value so it does not act as an extreme predictor value
data["shortest_path"] = data["shortest_path"].where(data["shortest_path"] != 255)
# Broad category of the start article (suffix _x) and of the target article (suffix _y).
# NOTE: articles with several categories produce one row per category combination.
data = data.merge(article_information[["article_name", "broad_category"]], how="left", left_on="start", right_on="article_name")
data = data.merge(article_information[["article_name", "broad_category"]], how="left", left_on="target", right_on="article_name")
data = data.drop(["article_name_x", "article_name_y"], axis=1)
data["not_same_cat"] = data["broad_category_x"] != data["broad_category_y"]
data

In [None]:
# Logistic regression of `give_up` on the start/target broad categories (categorical terms),
# the category-switch flag, the shortest path length and the number of links to the target
import statsmodels.api as sm
import statsmodels.formula.api as smf

mod = smf.logit(formula='give_up ~ C(broad_category_y) + C(broad_category_x) + not_same_cat + shortest_path + links_to_target', data=data)
res = mod.fit(maxiter=100)
print(res.summary())

### Interesting takeaways:
# some categories are statistically significant (e.g., countries, geography) and lower the probability
# some categories increase the probability
# having to switch categories from source to target increases the probability
# shortest path: the longer the shortest path, the higher the probability of giving up.

### We can easily further expand this analysis