In [None]:
from data import Data
from engine import Engine, TreeClimber
from html_builder import HTMLBuilder
from krakow import krakow
from krakow.utils import create_dendrogram, normalized_dasgupta_cost

import matplotlib.pyplot as plt

import io
from PIL import Image

import numpy as np

In [None]:
# Build the data object for the forum under analysis, then the tree climber
# and the engine that operate on it.
# Alternative run against the other forum, kept for reference:
#   data = Data(alpha=15, use_cached_forum_data=True, forum="lw")
data = Data(
    forum="ea",
    alpha=9,
    use_cached_forum_data=True,
)
climber = TreeClimber(data)
engine = Engine(data, climber)

# Analyze cracy of posts

In [None]:
# Keep only posts whose democratic AND meritocratic scores are strictly positive.
# Zero is excluded along with negatives: a zero democraticScore would leave the
# meritocratic/democratic ratio computed below undefined.
filtered_posts = [
    post
    for post in data.posts.values()
    if post.democraticScore > 0 and post.meritocraticScore > 0
]

# Median of the per-post ratio meritocraticScore / democraticScore.
_demo = np.array([post["democraticScore"] for post in filtered_posts])
_meri = np.array([post["meritocraticScore"] for post in filtered_posts])
median_ratio = np.median(_meri / _demo)

# Both axes are cropped to [0, AXIS_LIMIT]; posts scoring higher are not drawn.
AXIS_LIMIT = 350

# Square figure with identical limits on both axes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_xlim(0, AXIS_LIMIT)
ax.set_ylim(0, AXIS_LIMIT)
# Reference line through the origin with slope median_ratio. It uses a colour
# that is visible on light and dark backgrounds alike; a white line would
# disappear on matplotlib's default white axes.
ax.plot(
    [0, AXIS_LIMIT],
    [0, AXIS_LIMIT * median_ratio],
    color="tab:red",
    linewidth=0.5,
    label=f"median ratio = {median_ratio:.2f}",
)
ax.scatter(_demo, _meri, s=1)
ax.set_xlabel("democraticScore")
ax.set_ylabel("meritocraticScore")
ax.set_title("Meritocratic vs. democratic score per post")
ax.legend()
plt.show()

In [None]:
def f(p):
    """Sort key: the post's democratic score."""
    return p.democraticScore


# Posts strictly below the median-ratio line, i.e. whose meritocratic score is
# smaller than democraticScore * median_ratio.
below_median_line = [
    post
    for post in filtered_posts
    if post.meritocraticScore < post.democraticScore * median_ratio
]

# The 25 of those with the highest democratic score.
sorted_posts = sorted(below_median_line, key=f, reverse=True)[:25]

# Columns: vote counts (big/small down, small/big up), the three scores, cracy, title.
for post in sorted_posts:
    print(f"{post.bigDownvotes:3} {post.smallDownvotes:3} {post.smallUpvotes:3} {post.bigUpvotes:3}     {post.democraticScore:3} {post.baseScore:3} {post.meritocraticScore:3}     {post.cracy:.3f} {post.title}")

In [None]:
def f(p):
    """Sort key: the post's meritocratic score."""
    return p.meritocraticScore


# Posts on or above the median-ratio line, i.e. whose meritocratic score is at
# least democraticScore * median_ratio.
on_or_above_median_line = [
    post
    for post in filtered_posts
    if post.meritocraticScore >= post.democraticScore * median_ratio
]

# The 25 of those with the highest meritocratic score.
sorted_posts = sorted(on_or_above_median_line, key=f, reverse=True)[:25]

# Columns: vote counts (big/small down, small/big up), the three scores, cracy, title.
for post in sorted_posts:
    print(f"{post.bigDownvotes:3} {post.smallDownvotes:3} {post.smallUpvotes:3} {post.bigUpvotes:3}     {post.democraticScore:3} {post.baseScore:3} {post.meritocraticScore:3}     {post.cracy:.3f} {post.title}")

# Analyze overlap of tags

In [None]:
# One (tag_u, tag_v, weight) triple per edge of the tag co-occurrence graph.
cooccurence_data = [
    (u, v, attrs["weight"])
    for u, v, attrs in data.Tag_cooccurence.edges(data=True)
]

# Normalise each pair's co-occurrence by the combined size of the two tags:
#     shared / (size_u + size_v - shared)
# where a tag's size is read from the weight of its self-loop.
tag_graph = data.Tag_cooccurence
normalized_cooccurences = {
    (u, v): shared / (tag_graph[u][u]["weight"] + tag_graph[v][v]["weight"] - shared)
    for u, v, shared in cooccurence_data
}

# Highest normalised overlap first.
sorted_cooccurences = sorted(
    normalized_cooccurences.items(),
    key=lambda pair_and_overlap: pair_and_overlap[1],
    reverse=True,
)

# Print every pair of distinct tags; self-pairs are skipped.
for (u, v), cooccurence in sorted_cooccurences:
    if u != v:
        print(f'{data.tags[u]["name"]:30} {data.tags[v]["name"]:30} {cooccurence:.3f}')

In [None]:
# Diagnostic dump: print the commentCount of every post, one per line.
# (An earlier variant of this cell printed the title of posts whose
# commentCount is None instead.)
for comment_count in (post["commentCount"] for post in data.posts.values()):
    print(comment_count)

In [None]:
# Map each tag id to the list of (baseScore, relevance) pairs of the posts
# that carry that tag.
tag_scores = {}
for post in data.posts.values():
    for tag, relevance in post["tagRelevance"].items():
        tag_scores.setdefault(tag, []).append((post["baseScore"], relevance))

In [None]:
# Quality of a tag = average baseScore of its posts, weighted by each post's
# relevance for that tag.
tag_qualities = {
    tag: np.average(
        [score for score, _relevance in pairs],
        weights=[relevance for _score, relevance in pairs],
    )
    for tag, pairs in tag_scores.items()
}

# (tag, quality) pairs, best quality first.
tags_sorted_by_quality = sorted(
    tag_qualities.items(),
    key=lambda tag_and_quality: tag_and_quality[1],
    reverse=True,
)


In [None]:
# Report each tag's quality together with the raw scores behind it.
for tag_id, quality in tags_sorted_by_quality:
    scores = [pair[0] for pair in tag_scores[tag_id]]
    # Tags backed by fewer than two posts are skipped.
    if len(scores) < 2:
        continue
    if tag_id in data.tags:
        print(f'{data.tags[tag_id]["name"]:50} {quality:.0f}     {scores}')
    else:
        # The tag id appears on posts but has no entry in data.tags.
        print(f"{tag_id} not in tags")

# Analyze score distribution

In [None]:
# Flat list with the baseScore of every post.
scores = [entry["baseScore"] for entry in data.posts.values()]

In [None]:
# Histogram of post baseScore values, restricted to the range [-10, 200]
# (scores outside that range are not drawn).
fig, ax = plt.subplots()
ax.hist(scores, bins=100, range=(-10, 200))
ax.set_xlabel("baseScore")
ax.set_ylabel("number of posts")
ax.set_title("Distribution of post baseScore (range -10 to 200)")
# plt.show() renders the figure without echoing the return value of hist().
plt.show()

In [None]:
# Scores below 1 are raised to 1 first so the logarithm is defined everywhere
# (those posts end up at log(1) = 0).
clipped_scores = np.clip(scores, 1, None)
log_scores = np.log(clipped_scores)

In [None]:
# Histogram of the natural log of post scores (log_scores).
fig, ax = plt.subplots()
ax.hist(log_scores, bins=50)
ax.set_xlabel("log_scores")
ax.set_ylabel("number of posts")
ax.set_title("Distribution of log post scores")
# plt.show() renders the figure without echoing the return value of hist().
plt.show()

In [None]:
# Every post, ordered from highest to lowest baseScore.
sorted_posts = sorted(
    data.posts.values(),
    key=lambda post: post["baseScore"],
    reverse=True,
)

In [None]:
# Inspect the second-to-last entry of sorted_posts; when sorted_posts holds the
# descending baseScore ordering, this is the post with the second-lowest score.
sorted_posts[-2]

In [None]:
# Percentile rank of a given score: the share of posts, in percent, whose
# baseScore is less than or equal to it. This is computed directly rather than
# by hand-tuning the argument of np.percentile until it returns the score.
SCORE_OF_INTEREST = 52
percentile = 100 * np.mean(np.asarray(scores) <= SCORE_OF_INTEREST)
percentile

# Investigate tags present in posts, but not listed by GraphQL query

In [None]:
# Print every post whose tagRelevance mentions this tag id, each followed by
# a blank line.
posts_with_tag = (
    candidate
    for candidate in data.posts.values()
    if "ZJEM3pibQmic8Rp5G" in candidate["tagRelevance"]
)
for post in posts_with_tag:
    print(post, end="\n\n")

In [None]:
# Print every known tag whose name is exactly "Community", each followed by
# a blank line.
community_tags = (
    candidate
    for candidate in data.tags.values()
    if candidate["name"] == "Community"
)
for tag in community_tags:
    print(tag, end="\n\n")

In [None]:
# Display the record stored in data.tags under this tag id.
data.tags["Sgx48Pf8PzmTxSEEG"]

In [None]:
# Two separate names. Without the comma, Python joins adjacent string literals
# into the single string "CommunityFrontpage".
# NOTE(review): presumably these are the tag names being investigated in this
# section — confirm.
investigated_tag_names = ("Community", "Frontpage")
investigated_tag_names

In [None]:
# Display the record stored in data.posts under this key
# (presumably the post's id — confirm against the Data class).
data.posts["rDAZancpWpMwxjoFg"]