In [10]:
from scipy.io import loadmat
from coclust.CoclustMod import CoclustMod
import numpy as np
from sklearn.preprocessing import normalize

# Retrieve the CSTR document-term matrix from a matlab file
file_name = "classic3.mat"
matlab_dict = loadmat(file_name)

X = matlab_dict['A']

model = CoclustMod(n_clusters=3, n_init=1, random_state=0)
model.fit(X)

terms = [str(x[0][0]) for x in matlab_dict['ms']]

print(X.shape)


(3891, 4303)


In [61]:
def get_graph(X, model, terms, n_cluster, n_top_terms=10, n_neighbors=2):
    
    # The structure to be returned
    graph = {"nodes": [], "links": []}
    
    # get submatrix and local kist of terms
    row_indices, col_indices = model.get_indices(n_cluster)
    cluster = model.get_submatrix(X, n_cluster)
    terms = np.array(terms)[col_indices]
    
    # identify most frequent words
    p = cluster.sum(0)  ; t = p.getA().flatten()
    top_term_indices = t.argsort()[::-1][:n_top_terms]
    
    # create tt sim matrix
    cluster_norm=normalize(cluster, norm='l2', axis=0, copy=True)
    sim = cluster_norm.T * cluster_norm
    
    # to be able to compute the final index of a neighbor which is also a top term 
    d={t : i for i,t in enumerate(top_term_indices)}
    
    # identify best neighbors of frequent terms
    pointed_by=dict()
    graph = {"nodes": [], "links": []}
    all_neighbors=set()
    links=[]
    for idx_tt , t in enumerate(top_term_indices) :
        print("== top term" , idx_tt, t)
        best_neighbors=np.argsort(sim.toarray()[t])[::-1][:n_neighbors]
        print(best_neighbors)
        print()
        for n in best_neighbors :
            #if  terms[dico_tt[n]].lower() in stopwords: continue
            if t == n : continue
            if n in top_term_indices and t in pointed_by.get(n,[]) : # t was already pointed by n
                continue
            if n in top_term_indices :
                pointed_by.setdefault(t,[]).append(n)  # n will be able to check that is has been pointed by t
            else : # a "pure" neighbor
                all_neighbors.add(n)  
            if n in top_term_indices : # n is a (not yet handled) top term. Lookup in dictionary to find the d3 index.
                                       # Also record original indices using couples.
                links.append(( (idx_tt,t),( d[n],n) ) )
            else :  # n is a pure neighbor. Compute its d3 index by an addition
                links.append(( (idx_tt,t), (len(top_term_indices) + len(all_neighbors),n) )) # use indices suitable for d3 links
    print("top term indices")
    print(top_term_indices)
    print("true neighbors")
    print(all_neighbors)
    print()
    #all_neighbors=all_neighbors.difference(top_terms_indices) # a top term may point to a top term 
        
    for top_term in top_term_indices:
        graph["nodes"].append({"name": terms[top_term], "group": 0})
    
    for neighbor in all_neighbors:
        graph["nodes"].append({"name": terms[neighbor], "group": 1})
    
    for a, b in links :
        graph["links"].append( {"source": a[0], "target": b[0], "value": sim[a[1], b[1]]} )  
    return graph


In [69]:
graph = get_graph(X, model, terms, 0, 5,3)  # 0,3,2
print()
print(graph)


== top term 0 753
[ 753 1577  386]

== top term 1 413
[ 413 1504   64]

== top term 2 1577
[1577  753  994]

== top term 3 1081
[1081  340 1169]

== top term 4 1504
[1504  413 1145]

top term indices
[ 753  413 1577 1081 1504]
true neighbors
{64, 386, 994, 1169, 340, 1145}


{'links': [{'target': 2, 'source': 0, 'value': 1}, {'target': 6, 'source': 0, 'value': 1}, {'target': 4, 'source': 1, 'value': 1}, {'target': 7, 'source': 1, 'value': 1}, {'target': 8, 'source': 2, 'value': 1}, {'target': 9, 'source': 3, 'value': 1}, {'target': 10, 'source': 3, 'value': 1}, {'target': 11, 'source': 4, 'value': 1}], 'nodes': [{'group': 0, 'name': 'library'}, {'group': 0, 'name': 'system'}, {'group': 0, 'name': 'libraries'}, {'group': 0, 'name': 'research'}, {'group': 0, 'name': 'retrieval'}, {'group': 1, 'name': 'line'}, {'group': 1, 'name': 'public'}, {'group': 1, 'name': 'academic'}, {'group': 1, 'name': 'development'}, {'group': 1, 'name': 'study'}, {'group': 1, 'name': 'document'}]}


In [64]:
%%javascript
require.config({
  paths: {
      d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min'
  }
});

<IPython.core.display.Javascript object>

In [66]:
from IPython.display import HTML
HTML("""
<style>
.node_circle {
  stroke: #fff;
  stroke-width: 1.5px;
}

.link {
  stroke: #999;
  stroke-opacity: .6;
}
</style>
""")

In [67]:
from IPython.display import Javascript
#runs arbitrary javascript, client-side
Javascript("""
           window.graph={};
           """.format(graph))

<IPython.core.display.Javascript object>

In [70]:
%%javascript
require(['d3'], function(d3){
  //a weird idempotency thing
  $("#chart1").remove();
  //create canvas
  element.append("<div id='chart1'></div>");
  $("#chart1").width("1160px");
  $("#chart1").height("800px");        
  var margin = {top: 20, right: 20, bottom: 30, left: 40};
  var width = 1280 - margin.left - margin.right;
  var height = 800 - margin.top - margin.bottom;
  var svg = d3.select("#chart1").append("svg")
    .style("position", "relative")
    .style("max-width", "960px")
    .attr("width", width + "px")
    .attr("height", (height + 50) + "px")
    .append("g")
    .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

var color = d3.scale.category20();

var force = d3.layout.force()
    .charge(-800)
    .linkDistance(400)
    .size([width, height]);

var graph = window.graph;
  force
      .nodes(graph.nodes)
      .links(graph.links)
      .start();

  var link = svg.selectAll(".link")
      .data(graph.links)
      .enter().append("line")
      .attr("class", "link")
      .style("stroke", "#999;")
      .style("stroke-width", function(d) { return Math.sqrt(d.value); });

  var node = svg.selectAll(".node")
      .data(graph.nodes)
      .enter().append("g")
      .attr("class", "node")
      .call(force.drag);
    
  node.append("circle")
      .attr("class", "node_circle")
      .attr("r", 8)
      .style("fill", function(d) { return color(d.group); });

  node.append("text")
      .attr("class", "node_text")
      .attr("dx", 12)
      .attr("dy", ".35em")
      .text(function(d) { return d.name });

  node.append("title")
      .text(function(d) { return d.name; });

  var node_text = svg.selectAll(".node_text");
  var node_circle = svg.selectAll(".node_circle");
    
  force.on("tick", function() {
    link.attr("x1", function(d) { return d.source.x; })
        .attr("y1", function(d) { return d.source.y; })
        .attr("x2", function(d) { return d.target.x; })
        .attr("y2", function(d) { return d.target.y; });

    node_circle.attr("cx", function(d) { return d.x; })
        .attr("cy", function(d) { return d.y; });
      
    node_text.attr("x", function(d) { return d.x; })
        .attr("y", function(d) { return d.y; });
  });


});

<IPython.core.display.Javascript object>