# ADA CAPI Notebook for Data Exploration

In [None]:
import datetime as datetime
import os
import urllib
import urllib.parse
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import textstat
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the NLTK resources used by the tokenizers and the stopword list
for nltk_resource in ("punkt", "stopwords"):  # Punkt tokenizer, common stopwords
    nltk.download(nltk_resource)

# load config and extract variables
import config
DATA_PATH = config.PATH_TO_DATA

### Loading and Preparing the Data
Load and clean up the paths, load them into a weighted graph structure, etc.

#### Load Data

In [None]:
# load in all data (except wikipedia articles)
# All tables are tab-separated; `skiprows` drops the leading lines of each file
# (presumably a comment header - confirm against the raw files) and `names`
# supplies the column names explicitly.
finished_paths = pd.read_csv(os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/paths_finished.tsv"), sep='\t', skiprows=15, names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"])
unfinished_paths = pd.read_csv(os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/paths_unfinished.tsv"), sep='\t', skiprows=16, names=["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"])
edges = pd.read_csv(os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/links.tsv"), sep='\t', skiprows=15, names=["start", "end"], encoding="utf-8")
articles = pd.read_csv(os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/articles.tsv"), sep='\t', skiprows=12, names=["article"], encoding="utf-8")
categories = pd.read_csv(os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/categories.tsv"), sep='\t', skiprows=13, names=["article", "category"], encoding="utf-8")
# delimiter=1 makes genfromtxt read fixed-width fields of one character each
shortest_paths = np.genfromtxt(os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/shortest-path-distance-matrix.txt"), delimiter=1, dtype=np.uint8)

In [None]:
# Column dtypes and non-null counts, then a preview of the finished paths
finished_paths.info()
display(finished_paths.head())

In [None]:
# Column dtypes and non-null counts, then a preview of the unfinished paths
unfinished_paths.info()
display(unfinished_paths.head())

In [None]:
# Column dtypes and non-null counts, then a preview of the link list
edges.info()
edges.head()

In [None]:
# Column dtypes and non-null counts, then a preview of the article list
articles.info()
articles.head()

In [None]:
# Column dtypes and non-null counts, then a preview of the categories table
categories.info()
categories.head()

In [None]:
# shortest_paths is a numpy matrix in which 255 signifies "no path" (underscore in the .txt file)
# and the diagonal is zero. Row and column indices are the zero-based positions of the
# source and target article in the articles dataframe.
diagonal = np.diag(shortest_paths)
print((diagonal == 0).all())
shortest_paths

#### Clean up

In [None]:
# Clean up edge list: URL-decode both endpoints, showing a preview before and after
display(edges.head())
for endpoint in ("start", "end"):
    edges[endpoint] = edges[endpoint].apply(urllib.parse.unquote)
display(edges.head())

In [None]:
# Parse the Unix timestamps (seconds since the epoch) into pandas datetimes.
# pd.to_datetime(..., unit="s") interprets the epoch in UTC, so the values no longer
# depend on the local timezone of the machine running the notebook (as
# datetime.fromtimestamp does), and the conversion is vectorised.
finished_paths["datetime"] = pd.to_datetime(finished_paths["timestamp"], unit="s")
unfinished_paths["datetime"] = pd.to_datetime(unfinished_paths["timestamp"], unit="s")
display(unfinished_paths.head())

In [None]:
# URL-decode the article names, showing a preview before and after
display(articles.head())
articles["article"] = articles["article"].map(urllib.parse.unquote)
display(articles.head())

In [None]:
# URL-decode the article names of the categories table, showing a preview before and after
display(categories.head())
categories["article"] = categories["article"].map(urllib.parse.unquote)
display(categories.head())

In [None]:
# Left-join the categories onto the articles so that articles without a category are kept
articles_categories = articles.merge(categories, how="left", on="article")
display(articles_categories.head())
# 6 articles without category!
missing_category = articles_categories["category"].isna()
print("Merge introduced {} NAs in category columns:".format(missing_category.sum()))
articles_categories[missing_category]

In [None]:
# Convert paths to a readable format (lists of URL-decoded steps)

def _decode_path(raw_path):
    """Split a ';'-separated path string and URL-decode every step."""
    return [urllib.parse.unquote(step) for step in raw_path.split(";")]

finished_paths_readable = finished_paths.copy()
finished_paths_readable["readable_path"] = finished_paths_readable["path"].apply(_decode_path)

unfinished_paths_readable = unfinished_paths.copy()
unfinished_paths_readable["readable_path"] = unfinished_paths_readable["path"].apply(_decode_path)
unfinished_paths_readable["target"] = unfinished_paths_readable["target"].apply(urllib.parse.unquote)

In [None]:
# functions to get all links between articles
from itertools import tee
def pairwise(iterable):
    """Return an iterator over overlapping consecutive pairs.

    pairwise('ABCDEFG') --> AB BC CD DE EF FG

    Recipe from the Python docs; ``itertools.pairwise`` provides it natively
    from Python 3.10 on.
    """
    current_items, following_items = tee(iterable)
    # Advance the second copy by one element so the zip is offset by a single step.
    next(following_items, None)
    return zip(current_items, following_items)


def get_all_links(df, path_colname="path"):
    """Count how often each consecutive pair of path steps occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one ';'-separated path string per row.
    path_colname : str, default "path"
        Name of the column that contains the path strings.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (source, target) pair with the columns
        ``source``, ``target`` and ``weight`` (number of occurrences), in
        order of first appearance. The frame is empty, but has the same
        columns, when no path contains at least two steps.

    Notes
    -----
    Path tokens are used verbatim: they are neither URL-decoded nor filtered.
    NOTE(review): the '<' tokens that show up in the result are presumably
    back-click markers rather than articles - confirm against the dataset
    description before interpreting those edges.
    """
    edge_counter = Counter()
    for path in df[path_colname]:
        links = path.split(";")
        # Zipping the list with its one-step shift yields the consecutive pairs.
        edge_counter.update(zip(links, links[1:]))

    # Building the frame from records keeps the three columns even when empty.
    return pd.DataFrame(
        [(source, target, weight) for (source, target), weight in edge_counter.items()],
        columns=["source", "target", "weight"],
    )



In [None]:
# get all finished links (one row per consecutive pair of path steps, with its count)
finished_links = get_all_links(finished_paths)
finished_links.sort_values(by="weight", ascending=False) # TODO: what is up with these <<< signs? NOTE(review): presumably back-click markers in the raw paths - confirm against the dataset description


In [None]:
# get all unfinished links (one row per consecutive pair of path steps, with its count)
unfinished_links = get_all_links(unfinished_paths)
unfinished_links.sort_values(by="weight", ascending=False) # TODO: what is up with these <<< signs? NOTE(review): presumably back-click markers in the raw paths - confirm against the dataset description


In [None]:
# create networkx graph from finished paths
# The links are directed (source -> target), so build a DiGraph: the default undirected
# Graph would merge A->B with B->A and keep only the weight of whichever was added last.
finished_graph = nx.from_pandas_edgelist(
    finished_links, source="source", target="target", edge_attr="weight", create_using=nx.DiGraph
)
hist = nx.degree_histogram(finished_graph)  # hist[d] = number of nodes with degree d
plt.bar(range(len(hist)), hist)
# NOTE(review): this summarises the histogram bin counts, not the node degrees themselves
pd.Series(hist).describe()

In [None]:
# create networkx graph from unfinished paths
# The links are directed (source -> target), so build a DiGraph: the default undirected
# Graph would merge A->B with B->A and keep only the weight of whichever was added last.
unfinished_graph = nx.from_pandas_edgelist(
    unfinished_links, source="source", target="target", edge_attr="weight", create_using=nx.DiGraph
)
hist = nx.degree_histogram(unfinished_graph)  # hist[d] = number of nodes with degree d
plt.bar(range(len(hist)), hist)
# NOTE(review): this summarises the histogram bin counts, not the node degrees themselves
pd.Series(hist).describe()

### General Data Exploration
Explore the distribution of all relevant variables, analyze and potentially fill missing values, and compute simple summary stats

#### Explore Path lengths across finished and unfinished paths

In [None]:
# Path length = number of ';'-separated steps in the raw path string,
# reported separately for finished and unfinished paths
for paths_df in (unfinished_paths, finished_paths):
    paths_df["path_length"] = paths_df["path"].apply(lambda el: len(el.split(";")))

print("Finished Paths: Length")
display(finished_paths["path_length"].describe())
display(finished_paths["path_length"].value_counts())

print("Unfinished Paths: Length")
display(unfinished_paths["path_length"].describe())
unfinished_paths["path_length"].value_counts()


In [None]:
# make plot of path lengths: finished paths on the left, unfinished paths
# (coloured by the `type` column) on the right, sharing the y axis
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 4), sharey=True)

sns.histplot(data=finished_paths, x="path_length", ax=axes[0])
axes[0].set_title("Finished Paths")
sns.histplot(data=unfinished_paths, x="path_length", ax=axes[1], hue="type")
axes[1].set_title("Unfinished Paths")

# --> highly skewed and many unlikely outcomes (e.g. unfinished paths path length = 1, did they really give up? or not play at all?)

In [None]:
# TODO: plot comparing path lengths after cleaning up (e.g., kicking out top 10 percentiles, log transforms etc.) to better understand what is going on

# make plot of path lengths, restricted to paths shorter than `threshold`
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 4), sharey=True)
threshold = 30

sns.histplot(x=finished_paths.path_length[finished_paths.path_length < threshold], ax=axes[0], discrete=True)
axes[0].set_title("Finished Paths")

# One panel per value of the `type` column of the unfinished paths
unfinished_panels = (
    ("restart", "Unfinished Paths - Restart"),
    ("timeout", "Unfinished Paths - Timeout"),
)
for ax, (unfinished_type, panel_title) in zip(axes[1:], unfinished_panels):
    unfinished_clean = unfinished_paths[(unfinished_paths.path_length < threshold) & (unfinished_paths.type == unfinished_type)]
    sns.histplot(data=unfinished_clean, x="path_length", ax=ax, discrete=True)
    ax.set_title(panel_title)

#### Explore categories in the paths

In [None]:
# Seeing which categories are most represented in articles

broad_categories = categories.copy()
# The broad category is the second dot-separated component of the category string
broad_categories["broad_category"] = [
    full_category.split(".")[1] for full_category in broad_categories["category"]
]

count_articles = broad_categories.groupby("broad_category").size()

print("Below shows how many articles each of the broad categories are represented by")
display(count_articles)

In [None]:
# Create dictionaries for easy discovery of what categories an article belongs to:
# article name -> list of full categories / list of broad categories, in table order
article_to_category = {}
article_to_broad_category = {}
# Walk the three columns in lockstep instead of repeated .iloc[i][...] lookups,
# which build a new row Series on every access.
for article_name, full_category, top_category in zip(
    broad_categories["article"], broad_categories["category"], broad_categories["broad_category"]
):
    article_to_category.setdefault(article_name, []).append(full_category)
    article_to_broad_category.setdefault(article_name, []).append(top_category)

In [None]:
# Count how many times each category has occurred as a target in the finished paths
# (the target of a finished path is its last step)

# NOTE: an article that belongs to several categories is counted once per category
finished_targets = [path[-1] for path in finished_paths_readable["readable_path"]]
all_target_broad_categories_f = [
    broad_category
    for target in finished_targets
    if target in article_to_broad_category
    for broad_category in article_to_broad_category[target]
]
count_cats_finished_target = Counter(all_target_broad_categories_f)
display(count_cats_finished_target)

ax = plt.pie(count_cats_finished_target.values(), labels=count_cats_finished_target.keys())
plt.show()

In [None]:
# Count how many times each category has occurred as a target in the unfinished paths

# NOTE THAT SOME ARTICLES ARE REPRESENTED BY MULTIPLE CATEGORIES AND ARE COUNTED TWICE
all_target_broad_categories_u = [article_to_broad_category[target] for target in unfinished_paths_readable["target"] if target in article_to_broad_category]
all_target_broad_categories_u = [item for sublist in all_target_broad_categories_u for item in sublist]
count_cats_unfinished_target = Counter(all_target_broad_categories_u)

display(count_cats_unfinished_target)

ax = plt.pie(count_cats_unfinished_target.values(), labels = count_cats_unfinished_target.keys())
plt.show()

# There are certain categories that show up more or less here proportionally!

In [None]:
# Which countries are targets in finished paths (target = last step of the path)

finished_path_targets = (path[-1] for path in finished_paths_readable["readable_path"])
country_targets_f = [
    target
    for target in finished_path_targets
    if "Countries" in article_to_broad_category.get(target, ())
]
count_countries_finished_target = Counter(country_targets_f)
display(count_countries_finished_target)

In [None]:
# Which countries are targets in unfinished paths

country_targets_u = [
    target
    for target in unfinished_paths_readable["target"]
    if "Countries" in article_to_broad_category.get(target, ())
]
count_countries_unfinished_target = Counter(country_targets_u)
display(count_countries_unfinished_target)

# There are certainly some trends here. Haiti, Samoa, and the Gaza Strip, for example, are over-represented in the unfinished paths
# when compared to the finished paths.


In [None]:
# In fact we can see that there are some countries that occurred as a target more in unfinished paths than in finished paths
# (Counter subtraction keeps only the positive differences; these are raw counts and are
# not normalised by the total number of finished vs. unfinished paths)
count_countries_unfinished_target - count_countries_finished_target

In [None]:
# Total number of times each country was a target, over unfinished and finished paths
total_country_counts = count_countries_unfinished_target + count_countries_finished_target

# Fraction of each country's target appearances that fall in unfinished / finished paths
country_percent_in_unfinished = Counter({
    country: count_countries_unfinished_target[country] / total
    for country, total in total_country_counts.items()
})
country_percent_in_finished = Counter({
    country: count_countries_finished_target[country] / total
    for country, total in total_country_counts.items()
})

In [None]:
# Per country: fraction of its target appearances that occurred in unfinished paths
country_percent_in_unfinished

In [None]:
# Per country: fraction of its target appearances that occurred in finished paths
country_percent_in_finished

#### Exploring subject strength between connected articles

In [None]:
# Work on a copy so the category columns added below do not end up in `edges` itself
edge_article_df = edges.copy()

def add_categories(row, column):
    """Return the broad categories of the article named in ``row[column]``.

    Looks the name up in the module-level ``article_to_broad_category``
    mapping and returns an empty list when the article is not in it.
    """
    return article_to_broad_category.get(row[column], [])

# Attach the broad categories of both endpoints of every link
for endpoint, categories_column in (("start", "start_categories"), ("end", "end_categories")):
    edge_article_df[categories_column] = edge_article_df.apply(add_categories, args=(endpoint,), axis=1)
display(edge_article_df)


In [None]:
# Directed graph between broad categories: the weight of edge (A, B) counts the article
# links that go from an article in category A to an article in category B.
graph = nx.DiGraph()

for start_categories, end_categories in zip(
    edge_article_df["start_categories"], edge_article_df["end_categories"]
):
    for start_category in start_categories:
        # Start categories become nodes even when the linked article has no category
        graph.add_node(start_category)
        for end_category in end_categories:
            if graph.has_edge(start_category, end_category):
                graph[start_category][end_category]["weight"] += 1
            else:
                # add_edge also creates the end node if it does not exist yet
                graph.add_edge(start_category, end_category, weight=1)

edge_weights = [graph[u][v]["weight"] for u, v in graph.edges()]
max_edge_weight = max(edge_weights, default=0)
min_edge_weight = min(edge_weights, default=0)
weight_span = max_edge_weight - min_edge_weight
# Min-max normalise the weights to [0, 1]. When all weights are equal (or there are no
# edges) the span is zero, so fall back to a constant instead of dividing by zero.
if weight_span:
    normalized_edge_weights = [(weight - min_edge_weight) / weight_span for weight in edge_weights]
else:
    normalized_edge_weights = [0.5] * len(edge_weights)
edge_widths = [weight * 5 for weight in normalized_edge_weights]

# `figure` holds the node positions of the shell layout (not a matplotlib figure)
figure = nx.shell_layout(graph)
plt.figure(figsize=(8, 8))
nx.draw(graph, figure, with_labels=True, width=edge_widths, edge_color="gray", arrows=True)
plt.title("Article Connections Based on Subjects (Normalized and Scaled Edges)")
plt.show()


#### Analyzing articles

Metrics:
* Total word count: To understand the length of the article.
* Non stopword frequency: To identify words that contribute to the content's meaning.
* Stopword frequency: To identify common words that may not contribute to the content's meaning.
* Average word length: To assess the complexity of the language used.
* Average sentence length: Longer or more complex sentences (based on characters) may contribute to frustration.
* Number of paragraphs: To see if the article's structure plays a role in people giving up.
* Keyword frequency: To identify the most common keywords to understand the article's focus.
* Readability: Ease of reading the article (metric: Flesch Reading Ease Score) Link: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

TODO:
* Compute article embedding
* Sentiment analysis (people might dislike certain topics)

In [None]:
def proprocess_article(article_text):
    """Lower-case the article text and join its indented continuation lines.

    Every newline followed by three spaces is replaced by a single space,
    because the articles are not stored as continuous sentences.
    """
    return article_text.lower().replace("\n   ", " ")

def calculate_article_metrics(article_text):
    """Compute length, structure and readability metrics for one article.

    Parameters
    ----------
    article_text : str
        Raw plain text of the article.

    Returns
    -------
    dict
        ``word_count``: number of tokens returned by ``word_tokenize``.
        ``non_stopword_count`` / ``stopword_count``: tokens outside / inside
        the NLTK English stopword list.
        ``avg_word_length``: mean token length in characters (0.0 if no tokens).
        ``avg_sent_length``: mean sentence length in characters (0.0 if no
        sentences).
        ``paragraph_count``: number of newlines left after preprocessing, plus one.
        ``common_words``: the 10 most frequent tokens as (token, count) pairs.
        ``readability_score``: ``textstat.flesch_reading_ease`` of the text.

    Notes
    -----
    NOTE(review): the tokens are used as returned by ``word_tokenize``, so the
    counts and ``common_words`` presumably include punctuation tokens and
    stopwords - confirm that this is intended.
    """
    preprocessed_text = proprocess_article(article_text)

    words = word_tokenize(preprocessed_text)
    sentences = sent_tokenize(preprocessed_text)

    # Calculate total word count
    total_word_count = len(words)

    # Calculate stopword frequency
    stop_words = set(stopwords.words("english"))
    stopwords_count = sum(1 for word in words if word.lower() in stop_words)

    # Calculate average word length (guard against an article without tokens)
    if total_word_count:
        average_word_length = sum(len(word) for word in words) / total_word_count
    else:
        average_word_length = 0.0

    # Calculate average sentence length (guard against an article without sentences)
    if sentences:
        average_sentence_length = sum(len(sentence) for sentence in sentences) / len(sentences)
    else:
        average_sentence_length = 0.0

    # Calculate number of paragraphs (assume every new line \n is paragraph)
    paragraphs_count = preprocessed_text.count('\n') + 1 # Count last paragraph

    # Calculate keyword frequency
    word_freq = nltk.FreqDist(words)
    most_common_words = word_freq.most_common(10)  # Parameter to adjust

    # Calculate readability (Flesch Reading Ease Score) - 100: Easy to read, 0: Very confusing
    readability = textstat.flesch_reading_ease(preprocessed_text)

    return {
        "word_count": total_word_count,
        "non_stopword_count": total_word_count - stopwords_count,
        "stopword_count": stopwords_count,
        "avg_word_length": average_word_length,
        "avg_sent_length": average_sentence_length,
        "paragraph_count": paragraphs_count,
        "common_words": most_common_words,
        "readability_score": readability,
    }

In [None]:
# Compute the metrics for every plain-text article and collect them in one table
path = os.path.join(DATA_PATH, "plaintext_articles")
if not os.path.isdir(path):
    raise FileNotFoundError("The specified folder path does not exist or is not a directory.")

metric_rows = []
#Testing: for file_name in ["%C3%81ed%C3%A1n_mac_Gabr%C3%A1in.txt"]:
# os.listdir returns the entries in arbitrary order; sort them so the table is reproducible
for file_name in sorted(os.listdir(path)):
    file_path = os.path.join(path, file_name)
    if not os.path.isfile(file_path):
        continue

    with open(file_path, "r", encoding="utf-8") as article:
        metrics = calculate_article_metrics(article.read())

    # File names are URL-encoded on disk; store the decoded name
    metrics["file_name"] = urllib.parse.unquote(file_name)
    metric_rows.append(metrics)

# Build the frame once from all rows: appending row by row with .loc copies the frame
# on every insert and leaves every column with object dtype
article_metrics = pd.DataFrame(
    metric_rows,
    columns=["file_name", "word_count", "non_stopword_count", "stopword_count", "avg_word_length", "avg_sent_length", "paragraph_count", "common_words", "readability_score"],
)

In [None]:
# Show the per-article metrics table built in the previous cell
article_metrics

TODO: analyze, percentage-wise, which categories are used from the start to the end of a path

### Analyze Networkx graph objects 
Degree histograms, etc.

#### Next Idea

### Exploration Specific to Idea 1
Explore specific questions as noted in notion

In [None]:
# TODO: generate some summary stats on the wikipedia articles (length, number of hyperlinks etc. from the additional data given in the task (not loaded yet)) to check some of our hypotheses

### Exploration Specific to Idea 2
Explore specific questions as noted in notion

### Exploration Specific to Idea 3
Explore specific questions as noted in notion

### Exploration Specific to Idea 4
Explore specific questions as noted in notion

### Exploration Specific to Idea 5
Explore specific questions as noted in notion