In [None]:
%pylab inline
import json
import msgpack
import pandas as pd
import scipy.stats as spstats
import numpy as np
import logbook

In [None]:
# Input: project records; read line by line below, one JSON object per line,
# each carrying at least 'project_id' and 'project_name'.
project_file = "data/projects-2016-10-14.json"
# Input: UTF-8, tab-separated rows of four fields, unpacked below as
# (title, unique, page_id, importance); `unique` is used as a project name key.
importance_file = "archive/04b_find_importance/2017-10-03 13:54:30 14665f3/importance.utf8.tsv"
# Input: msgpack-encoded mapping, iterated below as
# article_id -> {project_id: ...}.
map_file = "archive/07_create_article_project_map/2017-03-27 14:10:49 77c76e1/articles_projects.m"
# Output: base name of the per-project mean similarity table; the final path
# is resolved through `exp.get_filename`.
similarity_file = "similarity_mean.csv"
# Experiment handle used below to obtain a logger and output file names.
# NOTE(review): `Experiment` is presumably provided by a project-local
# `logbook` module — confirm which package this import resolves to.
exp = logbook.Experiment("24ci find similarity")

In [None]:
# Load projects.
#
# Builds two indexes over the same records:
#   projects          project_id   -> project record (dict parsed from JSON)
#   projects_by_name  project_name -> project record
#
# The file is read as one JSON object per line.
projects = {}
projects_by_name = {}
with open(project_file, 'rb') as f:
    for row in f:
        # Tolerate blank lines (e.g. a stray newline at the end of the file)
        # instead of failing inside json.loads.
        if not row.strip():
            continue
        data = json.loads(row)
        project_id = data['project_id']
        projects[project_id] = data
        # Keep the name index separate: `projects` must contain id keys only,
        # because later cells sort its keys and convert them with int(), and
        # the importance cell resolves names through `projects_by_name`.
        projects_by_name[data['project_name']] = data

In [None]:
# Read the msgpack-encoded article -> projects mapping into memory.
# Later cells iterate it as article_id -> {project_id: ...}.
with open(map_file, 'rb') as map_stream:
    packed_map = map_stream.read()
article_projects = msgpack.unpackb(packed_map)
# The raw bytes are no longer needed once decoded.
del packed_map

In [None]:
# For every known project, collect the page ids of the articles rated
# 'Top' or 'High' in the importance file. Project names from that file that
# cannot be resolved are recorded in `skipped_projects` rather than failing.
project_ids = sorted(projects.keys())
project_important = {known_id: set() for known_id in project_ids}
skipped_projects = set()
with open(importance_file, "rb") as importance_stream:
    for raw_line in importance_stream:
        # Each row holds exactly four tab-separated fields.
        fields = raw_line.decode('utf-8').strip().split(u'\t')
        title, unique, page_id, importance = fields
        if importance not in ('Top', 'High'):
            continue
        try:
            project_id = projects_by_name[unique]["project_id"]
            project_important[project_id].add(int(page_id))
        except KeyError:
            # Unknown project name (or id): remember it and move on.
            skipped_projects.add(unique)

In [None]:
# Invert the article -> projects map into project_id -> set of article ids,
# keeping only the articles flagged as important for that project.
# Projects that end up with no article get no entry at all.
project_articles = {}
for article_id, project_times in article_projects.iteritems():
    for project_id in project_times:
        if article_id not in project_important[project_id]:
            continue
        project_articles.setdefault(project_id, set()).add(article_id)

In [None]:
# Compute the Jaccard similarity of the important-article sets for every
# unordered pair of projects. Each value is stored symmetrically in
# `similarity` and written once per pair to similarity.csv.
log = exp.get_logger()
similarity = {}
# Exclude empty projects: `project_articles` only holds projects with at
# least one article, so the union below is never empty.
project_ids = project_articles.keys()
with open(exp.get_filename("similarity.csv"), "wb") as out:
    out.write("low_id,high_id,jaccard\n")
    for position, low in enumerate(project_ids):
        log.info("low: %d", low)
        low_articles = set(project_articles[low])
        # Only pair `low` with projects that come after it, so each
        # unordered pair is visited exactly once.
        for high in project_ids[position + 1:]:
            high_articles = set(project_articles[high])
            shared_count = len(low_articles & high_articles)
            combined_count = len(low_articles | high_articles)
            jaccard = float(shared_count) / float(combined_count)
            similarity[(low, high)] = similarity[(high, low)] = jaccard
            out.write("%d,%d,%r\n" % (low, high, jaccard))
        # Flush after each outer iteration so partial results reach disk.
        out.flush()
log.info("done")

In [None]:
# Average each project's Jaccard similarity over all projects it was paired
# with. Projects that appear in no recorded pair are left out of the columns.
project_ids = projects.keys()
col_project_id = []
col_similarity_mean = []
col_title = []
for a in project_ids:
    a_id = int(a)
    # `similarity` has no (x, x) entries and none for projects without
    # important articles, so those candidates are filtered out here.
    candidate_pairs = ((a_id, int(b)) for b in project_ids)
    pair_scores = [similarity[pair] for pair in candidate_pairs if pair in similarity]
    if not pair_scores:
        continue
    col_project_id.append(a)
    col_similarity_mean.append(np.mean(pair_scores))
    col_title.append(projects[a_id]["project_name"])

In [None]:
# Assemble the result table: one row per project, indexed by project_id,
# with its mean similarity as the only column.
similarity_columns = {
    "project_id": col_project_id,
    "similarity_mean": col_similarity_mean,
}
df = pd.DataFrame(similarity_columns)
df = df.set_index("project_id")

In [None]:
# Resolve the output path through the experiment handle and write the
# mean-similarity table as CSV (the project_id index becomes the first column).
filename = exp.get_filename(similarity_file)
df.to_csv(filename)