## Growing Entourage Plots for Top Tags with Clusters on Subtags

#### Damon Crockett, Software Studies, damoncrockett@gmail.com

In [1]:
DIR = "/Users/damoncrockett/Desktop/stpete/viz/growing_entourage/"

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("./X.csv")

In [4]:
seq_list = []
for i in range(len(df)):
    seq_list.append(list(df.loc[i][df.loc[i]!=0][1:].sort(axis=1,inplace=False,ascending=False).index))

In [5]:
tag_counts = pd.read_csv("./tags_counts_rank.csv")

In [6]:
d = tag_counts[tag_counts['count'] > 1000] # no need to reset index bc we hacked off end

In [7]:
del d['Unnamed: 0']

In [8]:
tags = list(d.tag)

In [9]:
from collections import Counter

#### Get top subtags for each tag

In [10]:
num_subtags = 16

In [11]:
cluster_list= []

for tag in tags:
    tmp = df[df[tag]!=0]
    seq_list_subset = [seq_list[i] for i in tmp.index]
    seq_list_subset = [item for sublist in seq_list_subset for item in sublist]
    seq_list_subset = [item for item in seq_list_subset if item not in tags]
    clusters = [item[0] for item in Counter(seq_list_subset).most_common(num_subtags)]
    cluster_list.append(clusters)

In [12]:
d['clusters'] = cluster_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


#### Find subtag exemplars

In [13]:
# note that even for the same tag, we might have multiple exemplars

In [14]:
tag_exemplar_names = []
subtag_exemplar_names = []
tag_exemplar_vectors = []

for i in range(len(d)):
    tag = d.tag.loc[i]
    subtags = d.clusters.loc[i]
    tmp = df[df[tag]!=0] # subset to tag
    tmp = tmp[tmp.columns[(tmp != 0).any()]] # eliminate zero cols
    
    for subtag in subtags:
        subtmp = tmp[tmp[subtag]!=0]
        exemplar_vector = np.mean(subtmp.iloc[:,1:].as_matrix(), axis=0)
        
        tag_exemplar_names.append(tag)
        subtag_exemplar_names.append(subtag)
        tag_exemplar_vectors.append(exemplar_vector)

In [15]:
exemplar_df = pd.DataFrame(tag_exemplar_names,columns=['tag'])
exemplar_df['subtag'] = subtag_exemplar_names
exemplar_df['vector'] = tag_exemplar_vectors

### Big Loop

A bit messy because images are in multiple clusters within the same plot, and also because I haven't tried to optimize this code. But it works.

In [16]:
from sklearn.decomposition import RandomizedPCA as pca
from sklearn.manifold import TSNE
from PIL import Image,ImageDraw
from shapely.geometry import Point

In [17]:
thumb_side = 96

In [18]:
def plot():
    for w in range(len(subspace)):
        exemplar = subspace.exemplar_point.loc[w]
        subtag = subspace.subtag.loc[w]

        idxs = []
        for z in tmp.index:
            if subtag in tmp.euclid.loc[z]:
                idxs.append(z)
        
        candidates = tmp.loc[idxs,:]
        candidates['val'] = [item[subtag] for item in candidates.euclid]
        candidates.sort('val',ascending=True,inplace=True)
        
        if len(candidates) > 0:
            best = candidates.iloc[0]
            im = Image.open(best.local_path)
            im.thumbnail((thumb_side,thumb_side),Image.ANTIALIAS)
            closest_open = min(open_grid,key=lambda x: exemplar.distance(x))
            x = int(closest_open.x) * thumb_side
            y = int(closest_open.y) * thumb_side

            canvas.paste(im,(x,y))
            idx = tmp[tmp.local_path==best.local_path].index
            tmp.drop(idx,inplace=True)
            open_grid.remove(closest_open)

In [20]:
for i in range(len(d)): 
    tag = d.tag.loc[i]
    subtags = d.clusters.loc[i]
    tmp = df[df[tag]!=0] # subset to tag
    tmp = tmp[tmp.columns[(tmp!=0).any()]] # eliminate zero cols
    tmp.reset_index(drop=False,inplace=True) # we need the old index too
    
    # now to calculate Euclidean distance from each img in tmp to its exemplar(s)
    euclid = []
    exemplar_set = exemplar_df[exemplar_df.tag==tag]
    exemplar_set.reset_index(drop=True,inplace=True)
    
    for j in range(len(tmp)):
        euclid_dict = {}
        entourages = [item for item in seq_list[tmp['index'].loc[j]] if item not in tags]
        entourages = [item for item in entourages if item in subtags]
        for entourage in entourages:
            exemplar_vector = exemplar_set.vector[exemplar_set.subtag==entourage].values[0] # bc a Series
            img_point = tmp.iloc[j,2:].as_matrix()
            euclid_dict[entourage] = np.linalg.norm(img_point - exemplar_vector)
        euclid.append(euclid_dict)
    
    tmp['euclid'] = euclid # now each image has a dict with all its euc dist from exemplars
    
    # running dim reduce on exemplars to get entourage locations    
    model = TSNE(n_components=2)
    X = np.vstack(exemplar_set.vector)
    subspace = pd.DataFrame(model.fit_transform(X),columns=["x","y"])
    
    # flight and rank
    subspace.sort('x',inplace=True)
    subspace['flight'] = np.repeat(range(1,int(np.sqrt(num_subtags)+1)),int(np.sqrt(num_subtags)))

    subspace.sort(['flight','y'],inplace=True)
    subspace['rank'] = range(1, int(np.sqrt(num_subtags)+1)) * int(np.sqrt(num_subtags))

    factor = int(round( 2 * np.sqrt( ( len(tmp) / (num_subtags * np.pi) ) + num_subtags ))) # serviceable
    subspace['x_grid'] = subspace['flight'] * factor
    subspace['y_grid'] = subspace['rank'] * factor
    
    # turn exemplar grid coords into shapely points
    exemplar_point = []
    for k in subspace.index: # note this is label indexing not positional
        exemplar_point.append(Point(subspace.x_grid.loc[k],subspace.y_grid.loc[k]))
    
    subspace['exemplar_point'] = exemplar_point
    
    # add subtag names to subspace
    subspace['subtag'] = exemplar_set.subtag
    
    # grid list, a list of every grid location
    grid_side = int((np.sqrt(num_subtags) + 1)) * factor
    x,y = range(grid_side) * grid_side, np.repeat(range(grid_side),grid_side) # reusing 'x' and 'y' here...
    grid_list = pd.DataFrame(x,columns=['x'])
    grid_list['y'] = y
    
    # make into shapely points
    point = []
    for u in range(len(grid_list)):
        point.append(Point(grid_list.x.loc[u],grid_list.y.loc[u]))

    grid_list['point'] = point
    open_grid = list(grid_list.point) # the list we will maintain of remaining open grid points
    
    # plotting of exemplar words
    px_w = thumb_side * grid_side
    px_h = thumb_side * grid_side
    canvas = Image.new('RGB',(px_w,px_h),(50,50,50))
    
    for subtag in subtags:
        template = Image.new("RGB", (thumb_side, thumb_side), (50,50,50))
        draw = ImageDraw.Draw(template)
        draw.text((5,thumb_side/2),subtag,fill="white") # hard-coded the left edge...
        plot_point = subspace.exemplar_point[subspace.subtag==subtag].values[0]
        
        x = subspace.x_grid[subspace.subtag==subtag] * thumb_side
        y = subspace.y_grid[subspace.subtag==subtag] * thumb_side
        
        canvas.paste(template,(x,y))
        
        if plot_point in open_grid:
            open_grid.remove(plot_point)
        else:
            print tag,subtag
            
    seq_list_subset = [seq_list[q] for q in tmp['index']]
    seq_list_subset = [item for sublist in seq_list_subset for item in sublist]
    seq_list_subset = [item for item in seq_list_subset if item not in tags]
    counts = [item[1] for item in Counter(seq_list_subset).most_common(num_subtags)]
    
    iterations = max(counts)
    
    for v in range(iterations):
        plot()
        
    canvas.save(DIR+tag+"_"+str(factor)+".png")

In [None]:
'''
# setting up the grid
num_bins = int(round( np.sqrt(len(tmp)) * fill_factor )) # what % of the square should be filled?

# adding in some extremes to push the edges out
spacer = np.std(subspace.x) # standard deviation in x; should be similar in y bc tsne pretty symmetrical
x = [subspace.x.min() - spacer,subspace.x.max() + spacer]
y = [subspace.y.min() - spacer,subspace.y.max() + spacer]

tmpdf = pd.DataFrame(x,columns=["x"])
tmpdf["y"] = y
subspace = subspace.append(tmpdf)

# binning exemplar coordinates
subspace['x_bin'] = pd.cut(subspace['x'],num_bins,labels=False)
subspace['y_bin'] = pd.cut(subspace['y'],num_bins,labels=False)

# removing grid expanders
subspace = subspace[:len(subtags)]

# now to expand the grid by simple multiplication
subspace["x_grid"] = subspace.x_bin * expansion_factor
subspace["y_grid"] = subspace.y_bin * expansion_factor
'''

In [None]:
#print iterations, num_bins, subtag, plot_point, tag

#### Leftover from calculating average rank

In [None]:
"""
avg_rank = []
for tag in list(tag_counts['tag']):
    tmp = []
    for seq in seq_list:
        if tag in seq:
            tmp.append(int(seq.index(tag)))
    avg_rank.append(np.mean(tmp))
    
"""

In [None]:
#tag_counts['avg_rank'] = avg_rank

In [None]:
#tag_counts.to_csv("./tags_counts_rank.csv")