This is an example application for manual interactive clustering.

It uses Bokeh 0.11 and the Jupyter Notebook to provide the interaction.

To run with demo data, select from the menu `cell -> run all`

To choose the data to run on, see the section "Reading in the data".

In [4]:
%%html
<!-- We'd like more room to display, so we'll update the notebook stylesheet from a html cell. -->
<style>
    #notebook-container{
        width:90%;
        margin-right:auto;
    }
</style>

In [5]:
#Setup for inline plotting
%matplotlib inline

#Import bokeh tools and set it up to display from the notebook
import bokeh
from bokeh.charts import Scatter
from bokeh.plotting import figure, show
from bokeh.models import WheelZoomTool, PanTool, BoxSelectTool, ResetTool, LassoSelectTool, HoverTool, ColumnDataSource, CustomJS
import bokeh.models.widgets as widgets
from bokeh.io import vplot, hplot
bokeh.io.output_notebook()

#We use matplotlib colourmaps
import matplotlib

#Numpy for array manipulations
import numpy as np

#Pandas for reading and writing csv-s
import pandas as pd

## Reading in the data

`data_path` should point to a CSV file with a header row and four columns: `word`, `label`, `x` and `y`.
The default label is 0, which means "unlabelled".

```
word,label,x,y
kahjutekitavam,0,-47.6927792025,0.3933468121
konkreetsem,0,36.3264279922,3.4699638945
juhuslikum,0,10.1572849577,14.3291007382
 ...
```

`labels_path` should point to a file with a class label per line.

```
Esimene_klass
Teine_klass
```
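The classes are read from this file in order and are one-indexed: the first line is label 1, the second line is label 2, and so on. Label 0 is reserved for "unlabelled".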

In [6]:
#Input data is in the form (word,label,x,y), where label is an int that corresponds
#to a line number in the labels file (0 means "unlabelled").
data_path = './data.csv'
labels_path = './labels.txt'

#The builtin float/int are used instead of the np.float/np.int aliases, which were
#deprecated in NumPy 1.20 and removed in NumPy 1.24. They denote the same types.
data = pd.read_csv(data_path, dtype={'x': float, 'y': float, 'label': int})

#Read one class label per non-empty line; the context manager closes the file afterwards.
#Index 0 is reserved for the implicit "unlabelled" class.
with open(labels_path) as labels_file:
    labels = ['unlabelled'] + [line.strip() for line in labels_file.read().split('\n') if line.strip()]

#Map each label name to its integer encoding (its index in `labels`).
encodings = {name: index for index, name in enumerate(labels)}

#We assign colors for each label and the default.
#To change the colormap, assign a different one to the `colormap` variable.
colormap = matplotlib.cm.RdYlGn
color_list = np.array([matplotlib.colors.rgb2hex(colormap(i * (1.0/len(labels))))
                          for i in range(1, len(labels)+1)])
#One color per data row, looked up by the row's integer label.
colors = color_list[data['label']]

#Then we set up the tools for the bokeh plot
hover = HoverTool(tooltips=[("value", "@word"),],
    always_active=False) #this should disable hover on startup. Does not work. Maybe it will get fixed someday.

plot = figure(tools=[hover,BoxSelectTool(), LassoSelectTool(), WheelZoomTool(), PanTool(), ResetTool()], webgl=True,
        plot_width=1000,   #set these variables to adjust the plot window size 
        plot_height=1000)


#Set up the datasource for bokeh
source = ColumnDataSource(data)
scatter = plot.scatter(source=source, fill_color=colors, line_width=0, line_alpha=0, size=10, x="x", y="y", alpha=0.5)

def update(encoding, args):
    '''
    Updates the plot on the python side.

    Assigns the label `encoding` to every row whose index is listed in `args`,
    recolours the points accordingly and pushes the changed data to the notebook.

    encoding -- integer label to assign; also used as an index into `color_list`.
    args     -- iterable of integer row indices to relabel.
    '''
    #The builtin int replaces the np.int alias (deprecated in NumPy 1.20, removed in 1.24).
    label_column = np.fromiter(scatter.data_source.data['label'], dtype=int)
    selected_rows = np.fromiter(args, dtype=int)
    #A scalar assignment broadcasts over all selected rows.
    label_column[selected_rows] = encoding
    scatter.data_source.data['label'] = label_column.tolist()
    scatter.data_source.data['fill_color'] = color_list[label_column].tolist()
    scatter.data_source.push_notebook()

def save_plot(filename=None):
    '''
    Writes the word, label, x and y columns of the plot's data source to a csv file.

    filename -- destination path; when None, the file at `data_path` is overwritten.
    '''
    #if not specified, overwrite previous data
    target = data_path if filename is None else filename
    column_order = ['word', 'label', 'x', 'y']
    frame = pd.DataFrame({column: source.data[column] for column in column_order})[column_order]
    frame.to_csv(target, index=False)
    
    
#Set up buttons with a custom javascript callback for each one.
#Clicking a button asks the kernel to run update(<encoding>, <selected indices>).
buttons = []
for label in labels:
    button = widgets.Button(label=label)
    #The label's integer encoding is substituted into the javascript at the %s placeholder.
    #The callbacks object is passed directly to kernel.execute instead of through an
    #assignment expression, which would create an implicit global variable.
    button.callback = CustomJS(args=dict(source=source), code='''
        var refresh = function(arg){
        }
        var selected = source.get("selected");
        var kernel = IPython.notebook.kernel;
        var selected_indices = JSON.stringify(selected["1d"]["indices"]);
        kernel.execute("update(%s, " + selected_indices + ")", {shell:{
        reply: refresh
        }});''' % encodings[label])

    buttons.append(button)

#The table of currently selected rows starts out empty and is filled from javascript.
selected_source = ColumnDataSource(data)
selected_source.data.clear()
selected_table = widgets.DataTable(source=selected_source, columns=[widgets.TableColumn(field='word', width=100, title='Word'),
                                              widgets.TableColumn(field='label', width=100,title='Label')], width = 400,row_headers=True)

#callback to push selected data into "selected table"
select_callback = CustomJS(args=dict(source=source, selected_source=selected_source, table=selected_table), code='''
var selected = source.get("selected");
var data = source.get("data");
var selected_data = selected_source.get("data");
var indices = selected["1d"]["indices"];

//can be sped up by just pushing the keys we're interested in
for (var key in data){
    selected_data[key] = [];
    //Loop over the selected row numbers themselves. A for-in loop over the array
    //would yield positions within it (0, 1, 2, ...) and copy the wrong rows.
    for (var j = 0; j < indices.length; j++){
        selected_data[key].push(data[key][indices[j]]);
    }
}
selected_source.trigger("change");
table.trigger("change");
''')

source.callback = select_callback

#The save button asks the kernel to run save_plot() and logs to the browser console on reply.
#The handler must be registered under the "reply" key of the shell callbacks.
save_button = widgets.Button(label='Save data')
save_button.callback = (CustomJS(args=dict(), code='''
var kernel = IPython.notebook.kernel;
kernel.execute("save_plot()", {shell:{
reply: function(data){
    console.log("saved")
}
}})
'''))


#Layout: plot on the left; label buttons, save button and selection table stacked on the right.
app = hplot(plot, vplot(vplot(vplot(*buttons), save_button), selected_table)) 
show(app)

<bokeh.io._CommsHandle at 0x7f4bc80685f8>