In [None]:
%pylab inline
import json
import msgpack
import pandas as pd
import scipy.stats as spstats
import numpy as np
import logbook

In [None]:
# Input: project metadata, parsed below as one JSON object per line.
project_file = "data/projects-2016-10-14.json"
# Input: msgpack-encoded map from article id to the projects of that article.
map_file = "archive/07_create_article_project_map/2017-03-27 14:10:49 77c76e1/articles_projects.m"
# Experiment handle; later cells use it for the logger and for output file names.
exp = logbook.Experiment("24c find_similarity")

In [None]:
# Load project metadata: one JSON object per line, keyed by its 'project_id'.
# A later line with the same project_id replaces the earlier one.
projects = {}
with open(project_file, 'rb') as f:
    for row in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        # instead of letting json.loads fail on them.
        if not row.strip():
            continue
        data = json.loads(row)
        project_id = data['project_id']
        projects[project_id] = data

# Load the article -> projects map.
# NOTE(review): msgpack >= 1.0 rejects non-str/bytes map keys by default
# (strict_map_key=True). If the ids in this file are ints, an upgrade will
# need strict_map_key=False here -- confirm against the installed version.
with open(map_file, 'rb') as f:
    article_projects = msgpack.unpackb(f.read())

In [None]:
# Sanity check: number of distinct projects loaded from project_file.
len(projects)

In [None]:
# Invert the article -> projects map into project id -> set of article ids.
project_articles = {}
# Iterate over keys and look the value up, rather than calling iteritems():
# this stays lazy on Python 2 and also runs on Python 3.
for article_id in article_projects:
    project_times = article_projects[article_id]
    # Only the project ids (the keys) are used; the mapped values are ignored.
    for project_id in project_times:
        project_articles.setdefault(project_id, set()).add(article_id)

In [None]:
# Spot check: number of articles mapped to project 256.
len(project_articles[256])

In [None]:
# Compute the Jaccard similarity |A & B| / |A | B| between the article sets of
# every unordered pair of projects. Each value is stored in `similarity` under
# both key orders and written once (lower id first) to similarity.csv.
project_ids = sorted(project_articles.keys())
similarity = {}
log = exp.get_logger()
with open(exp.get_filename("similarity.csv"), "wb") as out:
    # The file is opened in binary mode, so bytes are written explicitly.
    # The output is unchanged on Python 2 and this also works on Python 3.
    out.write(b"low_id,high_id,jaccard\n")
    for i, low in enumerate(project_ids):
        log.info("low: %d", low)
        # Invariant across the inner loop. The stored values are already sets
        # and are only read here, so no copy is needed.
        low_articles = project_articles[low]
        low_count = len(low_articles)
        for high in project_ids[i+1:]:
            high_articles = project_articles[high]
            shared = len(low_articles & high_articles)
            # |A | B| = |A| + |B| - |A & B|; avoids building the union set.
            # Never zero: every set in project_articles was created with at
            # least one article.
            total = low_count + len(high_articles) - shared
            jaccard = float(shared) / float(total)
            similarity[(low, high)] = jaccard
            similarity[(high, low)] = jaccard
            row = "%d,%d,%s\n" % (low, high, repr(jaccard))
            out.write(row.encode("ascii"))
        # Push the rows for this `low` out before starting the next one.
        out.flush()
log.info("done")

In [None]:
# For every project in `projects`, average its Jaccard similarity to every
# other project that has an entry in `similarity`. Projects with no entry at
# all are left out of the output columns.
project_ids = projects.keys()
col_project_id = []
col_similarity_mean = []
col_title = []
for a in project_ids:
    a_id = int(a)  # `similarity` is looked up with int ids; convert once per project
    candidate_pairs = ((a_id, int(b)) for b in project_ids)
    # Explicit membership test instead of try/except-pass: pairs with no
    # similarity entry (including the project paired with itself) are skipped.
    psim = [similarity[pair] for pair in candidate_pairs if pair in similarity]
    if psim:
        col_project_id.append(a)
        col_similarity_mean.append(np.mean(psim))
        # `a` is itself a key of `projects`; re-keying it through int() could
        # only turn a valid lookup into a KeyError.
        col_title.append(projects[a]["project_name"])

In [None]:
# One row per project: the mean similarity, indexed by project_id.
similarity_index = pd.Index(col_project_id, name="project_id")
df = pd.DataFrame(
    {"similarity_mean": col_similarity_mean},
    index=similarity_index,
)

In [None]:
# Resolve the output path through the experiment, the same way similarity.csv
# is written, instead of pinning a timestamped directory from an earlier run.
# NOTE(review): assumes exp.get_filename places this file in the current run's
# output directory -- confirm.
df.to_csv(exp.get_filename("similarity_mean.csv"))

In [None]:
# Attach the mean similarity to the combined per-project table and plot it
# against the article count on log-log axes.
df_comb = pd.read_csv("output/26_combine_data/2017-08-25 17:20:07 1cc6dcf/combined.csv")
# `df` is indexed by project_id, while read_csv (no index_col) gives df_comb a
# positional 0..n-1 index. Plain column assignment aligns on index labels and
# would pair row number i with project id i, so join on the id column instead.
if 'project_id' in df_comb.columns:
    df_comb['similarity_mean'] = df_comb['project_id'].map(df['similarity_mean'])
else:
    # NOTE(review): no project_id column to join on; falls back to the original
    # index-label alignment -- confirm that df_comb's row labels are project ids.
    df_comb['similarity_mean'] = df['similarity_mean']
plt.loglog(df_comb['article_count'], df_comb['similarity_mean'], '.')
plt.xlabel("Article count")
plt.ylabel("Mean jaccard similarity")