# ADA CAPI Notebook for Data Exploration

In [None]:
import datetime as datetime
import os
import urllib
import urllib.parse

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
# load config and extract variables
# (presumably a project-local config module; DATA_PATH is the base directory
# that every data file below is read from)
import config
DATA_PATH = config.PATH_TO_DATA

### Loading and Preparing the Data
Load and clean up the paths, load into weighted graph structure etc.

#### Load Data

In [None]:
# Read every dataset except the Wikipedia article texts.
# `skiprows` drops the leading lines of each .tsv (presumably its header
# comment block -- verify against the data files); column names are given by hand.
finished_paths = pd.read_csv(
    os.path.join(DATA_PATH, "paths_finished.tsv"),
    sep="\t",
    skiprows=15,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
)
unfinished_paths = pd.read_csv(
    os.path.join(DATA_PATH, "paths_unfinished.tsv"),
    sep="\t",
    skiprows=16,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"],
)
edges = pd.read_csv(
    os.path.join(DATA_PATH, "links.tsv"),
    sep="\t",
    skiprows=15,
    names=["start", "end"],
    encoding="utf-8",
)
articles = pd.read_csv(
    os.path.join(DATA_PATH, "articles.tsv"),
    sep="\t",
    skiprows=12,
    names=["article"],
    encoding="utf-8",
)
categories = pd.read_csv(
    os.path.join(DATA_PATH, "categories.tsv"),
    sep="\t",
    skiprows=13,
    names=["article", "category"],
    encoding="utf-8",
)
# Fixed-width parse: delimiter=1 reads one character per matrix entry.
shortest_paths = np.genfromtxt(
    os.path.join(DATA_PATH, "shortest-path-distance-matrix.txt"),
    delimiter=1,
    dtype=np.uint8,
)

In [None]:
# Print a structural summary (columns, dtypes, non-null counts) of the
# finished paths, then render the first rows.
finished_paths.info()
display(finished_paths.head())

In [None]:
# Print a structural summary (columns, dtypes, non-null counts) of the
# unfinished paths, then render the first rows.
unfinished_paths.info()
display(unfinished_paths.head())

In [None]:
# Print a structural summary of the edge list, then preview the first rows
# (shown as the cell output because it is the last expression).
edges.info()
edges.head()

In [None]:
# Print a structural summary of the article list, then preview the first rows
# (shown as the cell output because it is the last expression).
articles.info()
articles.head()

In [None]:
# Print a structural summary (columns, dtypes, non-null counts) of the
# categories table, then preview its first rows. This mirrors the inspection
# cells of the other dataframes; the duplicated head() call was a copy-paste slip.
categories.info()
categories.head()

In [None]:
# shortest_paths is a numpy matrix: 255 marks "no path" (an underscore in the
# .txt file) and the diagonal is zero.
# Row indices are zero-based and correspond to the index in the articles
# dataframe; the same holds for the columns (target article).
# Sanity check: every diagonal entry must be 0.
print(np.all(np.diag(shortest_paths) == 0))
shortest_paths

#### Clean up

In [None]:
# Decode URL percent-encoding in both endpoint columns of the edge list;
# the first rows are shown before and after for comparison.
display(edges.head())
edges["start"] = edges["start"].map(urllib.parse.unquote)
edges["end"] = edges["end"].map(urllib.parse.unquote)
display(edges.head())

In [None]:
# Convert the Unix timestamps (epoch seconds) into pandas datetimes.
# pd.to_datetime(..., unit="s") interprets the values as UTC, so the result
# does not depend on the local timezone of the machine running the notebook
# (datetime.fromtimestamp converts to local time), and it is vectorised
# instead of being applied row by row. The column stays timezone-naive.
finished_paths["datetime"] = pd.to_datetime(finished_paths["timestamp"], unit="s")
unfinished_paths["datetime"] = pd.to_datetime(unfinished_paths["timestamp"], unit="s")
display(unfinished_paths.head())

In [None]:
# Decode URL percent-encoding in the article names;
# the first rows are shown before and after for comparison.
display(articles.head())
articles["article"] = articles["article"].map(urllib.parse.unquote)
display(articles.head())

In [None]:
# Decode URL percent-encoding in the article column of the categories table
# (the category column itself is left untouched);
# the first rows are shown before and after for comparison.
display(categories.head())
categories["article"] = categories["article"].map(urllib.parse.unquote)
display(categories.head())

In [None]:
# Left-join the categories onto the article list so that every article is kept,
# including those without any category entry.
articles_categories = articles.merge(categories, how="left", on="article")
display(articles_categories.head())
# 6 articles without category!
print("Merge introduced {} NAs in category columns:".format(articles_categories["category"].isna().sum()))
# list the rows whose category is missing after the merge
articles_categories[articles_categories["category"].isna()]

In [None]:
# create networkx graph from 
# TODO: create weighted graphs

### General Data Exploration
Explore the distribution of all relevant variables, analyze and potentially fill missing values, and compute simple summary stats

#### Explore Path lengths across finished and unfinished paths

In [None]:
# Path length = number of ";"-separated entries in the path string,
# i.e. the number of separators plus one. Computed separately for
# unfinished and finished paths.
unfinished_paths["path_length"] = unfinished_paths["path"].map(lambda path: path.count(";") + 1)
finished_paths["path_length"] = finished_paths["path"].map(lambda path: path.count(";") + 1)

# summary statistics and frequency table for the finished paths
print("Finished Paths: Length")
display(finished_paths.path_length.describe())
display(finished_paths["path_length"].value_counts())

# the same for the unfinished paths (the frequency table is the cell output)
print("Unfinished Paths: Length")
display(unfinished_paths.path_length.describe())
unfinished_paths["path_length"].value_counts()



In [None]:
# Side-by-side histograms of path length for finished and unfinished paths.
# sharey=True puts both panels on the same count axis so they can be compared directly.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 4), sharey=True)

sns.histplot(data=finished_paths, x="path_length", ax=axes[0])
axes[0].set_title("Finished Paths")
# the unfinished paths are additionally split by the "type" column
sns.histplot(data=unfinished_paths, x="path_length", ax=axes[1], hue="type")
axes[1].set_title("Unfinished Paths")  # title typo fixed (was "Uninished Paths")

# --> highly skewed and many unlikely outcomes (e.g. unfinished paths path length = 1, did they really give up? or not play at all?)

In [None]:
# TODO: plot comparing path lengths after cleaning up (e.g., kicking out top 10 percentiles, log transforms etc.) to better understand what is going on

### Analyze Networkx graph objects 
Degree histograms, etc.

#### Next Idea

### Exploration Specific to Idea 1
Explore specific questions as noted in notion

In [None]:
# TODO: generate some summary stats on the Wikipedia articles (length, number of hyperlinks, etc.) from the additional data given in the task (not loaded yet) to check some of our hypotheses

### Exploration Specific to Idea 2
Explore specific questions as noted in notion

### Exploration Specific to Idea 3
Explore specific questions as noted in notion

### Exploration Specific to Idea 4
Explore specific questions as noted in notion

### Exploration Specific to Idea 5
Explore specific questions as noted in notion