In [1]:
import json

import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
from sklearn.cluster import KMeans

from src.encode_text import TextEmbeddings as te
from src.text_model import text_model, tokenizer

init_notebook_mode(connected=True)
from pyvis.network import Network
import plotly.io as pio
import seaborn as sns
pio.renderers.default = 'notebook_connected'

from src.downstream_tasks.clustering import clustering
from src.downstream_tasks.clustering import plot

In [17]:
# Output switch: when True, figures are written to files under 'plots/';
# when False, they are rendered inline with fig.show().
save_html = False

In [2]:
# Load the tag records. The file is read as JSON Lines: one JSON object per line.
# NOTE(review): this is an absolute, machine-specific path; consider moving it to a
# configurable data directory so the notebook can run on another machine.
rows = []
with open('/Users/dgoldenberg/PycharmProjects/encode_templates/data/tags_originals.json', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would otherwise reject with a JSONDecodeError.
        if line.strip():
            rows.append(json.loads(line))
df = pd.DataFrame(rows)

In [3]:
# Sorted vocabulary of distinct tags that appear as either a parent tag or a child tag.
combined_tags = np.unique(np.concatenate([df['parent_tag'].unique(), df['child_tag'].unique()]))

In [4]:
# Hand the tag vocabulary to the project's TextEmbeddings helper together with the text model and tokenizer.
# NOTE(review): TextEmbeddings is project code not shown here; presumably this only prepares the encoder — confirm.
embs = te(list(combined_tags), text_model, tokenizer)

In [5]:
# Compute the tag embeddings. The next cell calls .numpy() on the result,
# so it is presumably a tensor with one row per tag in combined_tags — confirm.
embeddings = embs.get_embeddings()

In [6]:
# Move the embedding tensor into NumPy, then into plain nested Python lists
numpy_array = embeddings.numpy()
list_of_arrays = numpy_array.tolist()

# One row per tag: the tag text next to its embedding vector (stored as a list in a single cell)
df_encoding = pd.DataFrame({
    'tags': pd.Series(combined_tags),
    'weights': pd.Series(list_of_arrays),
})

In [7]:
# Distinct (template_id, tag) pairs seen on the parent side of a record
df_parent = (
    df[['parent_template_id', 'parent_tag']]
    .drop_duplicates()
    .rename(columns={'parent_template_id': 'template_id', 'parent_tag': 'tag'})
)
# Distinct (template_id, tag) pairs seen on the child side of a record
df_child = (
    df[['template_id', 'child_tag']]
    .drop_duplicates()
    .rename(columns={'child_tag': 'tag'})
)

# Union of both sides, de-duplicated again because a template can appear as parent and as child
df_tags = pd.concat([df_parent, df_child], axis=0).drop_duplicates()

In [8]:
# Convert the weights column in df_encoding from lists to numpy arrays (overwrites the column in place)
df_encoding['weights'] = df_encoding['weights'].apply(np.array)

# Attach each tag's embedding to every (template_id, tag) pair: inner join of df_tags.tag with df_encoding.tags
merged_df = df_tags.merge(df_encoding, left_on='tag', right_on='tags')

# Group by template_id and average the embeddings of that template's tags
average_encoding_df = merged_df.groupby('template_id')['weights'].apply(np.mean).reset_index()

In [9]:
# Stack the per-template mean embeddings into one array that is fed to the clustering step.
# assumes every vector has the same length, giving a 2-D (templates x dims) matrix — TODO confirm
s = average_encoding_df['weights'].tolist()
X = np.array(s)

In [10]:
# Optional (disabled): grid search over the number of K-means clusters.
# kmeans_params = {'n_clusters': [10, 30, 50, 80, 90, 100]}

# Perform grid search for K-means
# kmeans_results = clustering.grid_search_clustering(X, 'kmeans', kmeans_params)

In [11]:
# Number of K-means clusters; also read by get_popular_df and get_transition_matrix further down.
n_clusters=50
# NOTE(review): no random seed is passed here, so cluster ids may change between runs
# unless create_clustering fixes one internally — confirm.
clusters = clustering.create_clustering(X, KMeans, n_clusters=n_clusters)





In [12]:
# Place each template's cluster label next to its id and mean embedding (column-wise concat).
# presumably `clusters` is a one-column pandas object in the same row order as average_encoding_df — confirm
meta = pd.concat([average_encoding_df, clusters], axis=1).reset_index(drop=True)
# Columns after renaming: template id ('name'), mean embedding ('weights'), cluster label ('cluster')
meta.columns=['name', 'weights', 'cluster']

In [13]:
# t-SNE figure of the template embeddings, built by the project's plotting helper.
# presumably points are coloured by meta['cluster'] — confirm against plot.plot_tsne_data
fig = plot.plot_tsne_data(X, meta)

In [18]:
# Export the cluster plot to HTML, or show it inline, depending on save_html.
if save_html:
    fig.write_html('plots/clusters.html')
else:    
    fig.show()

In [19]:
def get_popular_tags(cluster_num, n=5):
    """Return the ``n`` most frequent tags among the templates of one cluster.

    Reads the notebook-level frames ``meta`` (columns 'name', 'cluster') and
    ``df_tags`` (columns 'template_id', 'tag').

    Args:
        cluster_num: Cluster label; converted to ``str`` before comparison.
        n: Maximum number of tags to return.

    Returns:
        DataFrame with columns 'tag' (most frequent first) and 'cluster'
        (the label as a string), one row per returned tag.
    """
    cluster_num = str(cluster_num)
    # Compare as strings so both string and integer cluster labels in meta match.
    in_cluster = meta['cluster'].astype(str) == cluster_num
    ids = meta.loc[in_cluster, 'name'].values
    popular = df_tags.loc[df_tags['template_id'].isin(ids), 'tag'].value_counts()
    # Name the column explicitly: building the frame from the bare index yields a
    # column called 0 or 'tag' depending on the pandas version.
    head_popular = pd.DataFrame({'tag': popular.head(n).index.to_numpy()})
    head_popular['cluster'] = cluster_num
    return head_popular

def get_popular_df(n=5):
    """Stack the top-``n`` tags of every cluster (0 .. n_clusters-1) into one frame.

    Each cluster's rows come from ``get_popular_tags``; the per-cluster row
    index is kept as-is, so index values repeat across clusters.
    """
    per_cluster = []
    for cluster_idx in range(n_clusters):
        per_cluster.append(get_popular_tags(str(cluster_idx), n))
    return pd.concat(per_cluster, axis=0)

In [20]:
# Top five tags per cluster; optionally exported as CSV alongside the HTML plots.
most_defining = get_popular_df(n=5)
if save_html:
    most_defining.to_csv('plots/defining_words.csv')

In [29]:
# Spot check: show the most popular tags of a single cluster.
get_popular_tags(cluster_num=7)

Unnamed: 0,tag,cluster
0,geographical feature,7
1,nature,7
2,mountain,7
3,tourism,7
4,grassland,7


In [30]:
def add_tag_weight(df, id_col, value_col):
    """Weight every row by 1 / (number of distinct ``value_col`` values of its id).

    Args:
        df: Input frame; it is not modified.
        id_col: Column identifying the entity (e.g. a template id).
        value_col: Column whose distinct non-null values are counted per id.

    Returns:
        One-column DataFrame named ``<value_col>_weight`` that carries the same
        index as ``df``, so it can be assigned straight back as a column.
    """
    weight_col = value_col + "_weight"
    # Distinct non-null values per id.
    distinct_per_id = df.groupby(id_col)[value_col].nunique()
    # Map the counts back row by row. Unlike a merge, this keeps df's index,
    # never drops rows, and does not collide with an existing weight column
    # when the calling cell is re-run.
    weight = 1 / df[id_col].map(distinct_per_id)
    return weight.to_frame(name=weight_col)

# Per-row weights: 1 / (distinct tags of the parent template) and 1 / (distinct tags of the child template).
# Both are added to df as new columns.
df['parent_tag_weight'] = add_tag_weight(df, id_col="parent_template_id", value_col='parent_tag')
df['child_tag_weight'] = add_tag_weight(df, id_col="template_id", value_col='child_tag')

In [31]:
# Combined row weight: product of the parent-side and child-side tag weights.
df['weight'] = df['parent_tag_weight']*df['child_tag_weight']

In [32]:
# Attach cluster labels to each record: first through the parent template id, then through the
# child template id. The overlapping 'name'/'cluster' columns receive the '_parent'/'_child' suffixes.
# Both merges are inner joins, so records whose template id is absent from meta are dropped.
df1 = df.merge(meta[['name', 'cluster']], left_on = 'parent_template_id', right_on='name')
df1 = df1.merge(meta[['name', 'cluster']], left_on = 'template_id', right_on='name', suffixes=('_parent', '_child'))


In [33]:
# Collapse the top tags of every cluster into one list per cluster (a Series named 'tag', indexed by 'cluster').
popular_df = get_popular_df()
popular_df = popular_df.groupby('cluster')['tag'].apply(list)
# Attach the list for the child cluster, then for the parent cluster. Both merges bring a 'tag'
# column, so pandas suffixes them: tag_x = child cluster's tags, tag_y = parent cluster's tags.
df1 = df1.merge(popular_df, left_on = 'cluster_child', right_on='cluster')
df1 = df1.merge(popular_df, left_on = 'cluster_parent', right_on='cluster')

In [34]:
# Keep one row per (parent template, child template) pair with their clusters and cluster tag lists.
df1 = df1[['name_parent', 'cluster_parent', 'name_child', 'cluster_child', 'tag_x', 'tag_y']].drop_duplicates(['name_parent', 'cluster_parent', 'name_child', 'cluster_child'])

In [35]:
def tags_count(df, id_col, tag_col):
    """Bar chart of how many distinct ids fall under each value of ``tag_col``.

    Args:
        df: Frame containing ``id_col`` and ``tag_col``.
        id_col: Column with the entity ids being counted.
        tag_col: Column to group by (a cluster column in this notebook).

    Returns:
        A plotly figure with bars sorted from the most to the least populated
        group; x tick labels are aliased to a tag taken from the notebook-level
        ``most_defining`` frame.
    """
    pairs = df[[id_col, tag_col]].drop_duplicates()
    tags_counts = pairs.groupby(tag_col).count().reset_index()
    tags_counts = tags_counts.sort_values(id_col, ascending=False)
    fig = px.bar(tags_counts, x=tag_col, y=id_col, title='Popular Clusters')
    # most_defining may hold several tags per cluster, most popular first.
    # Keep only the first row per cluster; a plain dict(zip(...)) would let the
    # last (least popular) tag overwrite the earlier ones.
    top_tags = most_defining.drop_duplicates(subset='cluster', keep='first')
    fig.update_xaxes(labelalias=dict(zip(top_tags.cluster, top_tags.tag)))
    return fig

In [36]:
# Cluster popularity among parent templates; saved to HTML or shown inline.
fig = tags_count(df1, "name_parent", "cluster_parent")
if save_html:
    fig.write_html('plots/parent_popular_tags.html')
else:
    fig.show()

In [37]:
# Cluster popularity among child templates; saved to HTML or shown inline.
fig = tags_count(df1, "name_child", "cluster_child")
if save_html:
    fig.write_html('plots/child_popular_tags.html')
else:
    fig.show()

In [38]:
def heatmap_tags(df, id_col, tag_col):
    """Display a co-occurrence heatmap of ``tag_col`` values across ids.

    Builds a 0/1 indicator matrix (one row per id, one column per distinct
    ``tag_col`` value) and multiplies its transpose by itself, so cell (i, j)
    counts the ids that carry both value i and value j. The figure is shown
    with ``iplot``; nothing is returned.
    """
    pairs = df[[id_col, tag_col]].drop_duplicates()
    dummies = pd.get_dummies(pairs[[tag_col]], prefix="", prefix_sep="")
    per_id = pairs[[id_col]].join(dummies)
    per_id = per_id.groupby(id_col).sum().reset_index().fillna(0)

    # Everything after the id column is the indicator block
    tag_block = per_id.iloc[:, 1:]
    membership = tag_block.values
    cooccurrence = pd.DataFrame(membership.T.dot(membership))
    fig = px.imshow(cooccurrence, x=tag_block.columns, y=tag_block.columns)
    iplot(fig)

In [39]:
# Co-occurrence heatmap of clusters across parent templates.
heatmap_tags(df1, "name_parent", "cluster_parent")

In [40]:
# Co-occurrence heatmap of clusters across child templates.
heatmap_tags(df1, "name_child", "cluster_child")

In [41]:
# Count parent/child template pairs per (parent cluster, child cluster) edge ...
tags_flow = df1.groupby(['cluster_parent', 'cluster_child'])['name_parent'].count().reset_index()
tags_flow.columns = ['cluster_parent', 'cluster_child', 'weight']
# ... then divide each edge by the total count leaving its parent cluster,
# so the normalized weights of every parent cluster sum to 1.
sum_weights = tags_flow.groupby('cluster_parent')['weight'].transform('sum')
tags_flow['weight_normalized'] = tags_flow['weight'] / sum_weights

In [42]:
# Take the single most popular tag of every cluster and attach it as a readable label
# to both ends of each flow edge (columns tag_parent / tag_child).
# NOTE(review): this rebinds most_defining (built with n=5 earlier) and extends tags_flow,
# so re-running this cell on its own would merge the labels a second time — confirm before re-running.
most_defining = get_popular_df(n=1)
tags_flow = tags_flow.merge(most_defining, left_on='cluster_parent', right_on="cluster").drop('cluster', axis=1)
tags_flow = tags_flow.merge(most_defining, left_on='cluster_child', right_on="cluster", suffixes=['_parent', '_child']).drop('cluster', axis=1)
tags_flow

Unnamed: 0,cluster_parent,cluster_child,weight,weight_normalized,tag_parent,tag_child
0,0,0,1076,0.390279,song,song
1,1,0,33,0.022743,food,song
2,10,0,3,0.013953,eye,song
3,11,0,2,0.016807,person,song
4,12,0,77,0.170732,music,song
...,...,...,...,...,...,...
2020,47,47,17,0.369565,tattoo,tattoo
2021,49,47,1,0.002681,food,tattoo
2022,5,47,5,0.005513,clothing,tattoo
2023,6,47,3,0.005988,vehicle,tattoo


In [43]:
# Histogram of the normalized edge weights (log-scaled y axis); saved to HTML or shown inline.
fig = px.histogram(tags_flow['weight_normalized'], log_y=True, title="Distribution of Flow Between Parent and Child Tags")
if save_html:
    fig.write_html('plots/distribution_flow.html')
else:
    fig.show()

In [44]:
def filter_flow(df, parent_tags=None, child_tags=None, weight=None):
    """Filter flow edges by parent cluster, child cluster and normalized weight.

    Args:
        df: Edge frame with columns 'cluster_parent', 'cluster_child' and
            'weight_normalized'.
        parent_tags: Optional collection of parent cluster labels to keep.
        child_tags: Optional collection of child cluster labels to keep.
        weight: Optional weight filter. A list/tuple keeps edges whose weight
            lies between its min and max (inclusive); a single number keeps
            edges whose weight is strictly greater than it.

    Returns:
        The filtered frame. The number of remaining edges is printed.

    Raises:
        TypeError: If ``weight`` is neither a number nor a list/tuple.
    """
    if parent_tags is not None:
        df = df[df['cluster_parent'].isin(parent_tags)]
    if child_tags is not None:
        df = df[df['cluster_child'].isin(child_tags)]
    if weight is not None:
        # isinstance (not type ==) so ints, numpy scalars and tuples are honoured
        # instead of being silently ignored.
        if isinstance(weight, (list, tuple)):
            df = df[df['weight_normalized'].between(min(weight), max(weight))]
        elif isinstance(weight, (int, float, np.integer, np.floating)):
            df = df[df['weight_normalized'] > weight]
        else:
            raise TypeError(
                "weight must be a number or a list/tuple of numbers, got {}".format(type(weight).__name__)
            )
    print("DataFrame has {} edges".format(len(df)))
    return df

In [46]:
# Edges whose normalized weight exceeds 0.1.
filter_flow(tags_flow, weight=0.1)

DataFrame has 75 edges


Unnamed: 0,cluster_parent,cluster_child,weight,weight_normalized,tag_parent,tag_child
0,0,0,1076,0.390279,song,song
4,12,0,77,0.170732,music,song
46,1,1,513,0.353549,food,food
52,16,1,47,0.174721,food,food
86,49,1,54,0.144772,food,food
...,...,...,...,...,...,...
1839,19,7,60,0.109689,plant,geographical feature
1869,7,7,120,0.196078,geographical feature,geographical feature
1957,9,9,55,0.204461,plant,plant
1974,25,25,26,0.204724,fashion,fashion


In [47]:
def create_graph_animation(net_df):
    """Build an interactive, directed pyvis network from a flow-edge frame.

    Nodes are keyed by the tag labels in 'tag_parent' / 'tag_child' (not by
    cluster id) and edges are sized by 'weight_normalized'. Each node's hover
    title lists its neighbours and its size is its neighbour count.

    Returns:
        The populated ``Network``; the caller renders it with ``net.show``.
    """
    net = Network(notebook=True, select_menu=True, filter_menu=True, cdn_resources='in_line',
                  height="750px", width="100%", bgcolor="#222222", font_color="white", directed=True)
    net.repulsion()

    edge_columns = (net_df['tag_parent'], net_df['tag_child'], net_df['weight_normalized'])
    for src, dst, w in zip(*edge_columns):
        net.add_node(src, src, title=src)
        net.add_node(dst, dst, title=dst)
        net.add_edge(src, dst, value=w)

    # add neighbor data to node hover data
    neighbor_map = net.get_adj_list()
    for node in net.nodes:
        neighbors = neighbor_map[node["id"]]
        node["title"] += " Neighbors:\n" + "\n".join(neighbors)
        node["value"] = len(neighbors)

    return net

In [48]:
# Keep edges with normalized weight above 0.05 and render them as an interactive graph (written to example.html).
graph_df = filter_flow(tags_flow, weight=0.05)
net = create_graph_animation(graph_df)
net.show('example.html')

DataFrame has 163 edges
example.html


In [49]:
def child_distribution(cluster):
    """Bar chart of where the flow leaving one parent cluster goes.

    Args:
        cluster: Parent cluster label. It is converted to ``str`` first, so
            both ``'27'`` and ``27`` are accepted (same convention as
            ``get_popular_tags``).

    Returns:
        A plotly bar figure of 'weight_normalized' per child cluster, sorted
        ascending by weight, with x tick labels aliased to each child
        cluster's tag. Reads the notebook-level ``tags_flow`` frame.
    """
    cluster = str(cluster)
    data = tags_flow[tags_flow['cluster_parent'] == cluster].sort_values(by="weight_normalized")
    fig = px.bar(data,
                 x="cluster_child", y="weight_normalized",
                 title="Distribution of Flow from Cluster {}".format(cluster.title()))
    fig.update_xaxes(labelalias=dict(zip(data.cluster_child, data.tag_child)))
    return fig

In [53]:
# Outgoing flow of parent cluster '27', strongest edges first.
tags_flow[tags_flow['cluster_parent']=='27'].sort_values(by='weight_normalized', ascending=False)

Unnamed: 0,cluster_parent,cluster_child,weight,weight_normalized,tag_parent,tag_child
844,27,27,36,0.201117,sports,sports
1802,27,6,14,0.078212,sports,vehicle
556,27,2,13,0.072626,sports,sports
1644,27,46,10,0.055866,sports,games
1462,27,41,10,0.055866,sports,location
240,27,13,8,0.044693,sports,artwork
467,27,18,8,0.044693,sports,animal
884,27,28,8,0.044693,sports,vehicle
1758,27,5,7,0.039106,sports,clothing
64,27,1,7,0.039106,sports,food


In [51]:
# Most popular tags of cluster 27, for interpreting the flow table above.
get_popular_tags(cluster_num=27)

Unnamed: 0,tag,cluster
0,sports,27
1,vehicle,27
2,bicycle,27
3,cycling,27
4,racing,27


In [52]:
# Plot how the flow leaving cluster '27' is spread over child clusters.
child_distribution('27')

In [54]:
# One (most popular) tag per cluster; read as axis labels by the plotting helpers defined in this cell.
most_defining = get_popular_df(n=1)
def get_transition_matrix(df):
    """Arrange the flow edges into a dense parent-by-child matrix.

    The result is an ``n_clusters`` x ``n_clusters`` array of zeros in which
    entry [parent, child] holds that edge's 'weight_normalized'. Cluster
    labels are converted with ``int`` to obtain the row/column positions.
    """
    matrix = np.zeros((n_clusters, n_clusters))

    edges = zip(df['cluster_parent'], df['cluster_child'], df['weight_normalized'])
    for parent, child, edge_weight in edges:
        matrix[int(parent)][int(child)] = edge_weight
    return matrix

# Parent-cluster x child-cluster matrix of normalized flow weights.
transition_mat = get_transition_matrix(tags_flow)

def get_flow_heatmap(matrix):
    """Heatmap of the transition matrix; both axes are labelled with the tags in ``most_defining``."""
    cluster_labels = most_defining['tag']
    axis_titles = dict(x="Child Clusters", y="Parent Clusters")
    return px.imshow(matrix, labels=axis_titles, x=cluster_labels, y=cluster_labels)

def get_self_flow(matrix):
    """Bar chart of the matrix diagonal, i.e. the flow from each cluster to itself.

    Bars are sorted from the largest to the smallest diagonal weight and the
    x tick labels are aliased to the tags in ``most_defining``.
    """
    self_weights = matrix.diagonal()
    summary = pd.DataFrame(
        data={"cluster": most_defining['cluster'], 'tags': most_defining['tag'], 'weight': self_weights}
    )
    summary = summary.sort_values(by="weight", ascending=False)
    fig = px.bar(summary, x="cluster", y="weight", title="Like Parent, Like Child:'Tight-Knit' and 'Free-Range' Families")
    fig.update_xaxes(labelalias=dict(zip(summary.cluster, summary.tags)))
    return fig

def get_hist_flow(matrix, cumulative=False):
    """Histogram (20 bins) of the matrix diagonal; pass ``cumulative=True`` for a cumulative histogram."""
    return px.histogram(
        matrix.diagonal(),
        cumulative=cumulative,
        title="Prob. of Parent's Tag Presented in Child",
        nbins=20,
    )

In [56]:
# Parent-to-child flow heatmap; saved to HTML or shown inline.
fig = get_flow_heatmap(transition_mat)
if save_html:
    fig.write_html('plots/parent_child_heatmap.html')
else:
    fig.show()

In [57]:
# Self-flow (diagonal) per cluster; saved to HTML or shown inline.
fig = get_self_flow(transition_mat)
if save_html:
    fig.write_html('plots/close_families.html')
else:
    fig.show()

In [58]:
# Histogram of the self-flow values; saved to HTML or shown inline.
fig=get_hist_flow(transition_mat)
if save_html:
    fig.write_html('plots/parent_child_consistency.html')
else:
    fig.show()

In [59]:
# Cumulative version of the self-flow histogram (always shown inline, never saved).
fig=get_hist_flow(transition_mat, cumulative=True)
fig.show()

In [60]:
# Mean and standard deviation of the self-flow (diagonal of the transition matrix).
print(np.mean(transition_mat.diagonal()))
print(np.std(transition_mat.diagonal()))

0.25535478061746686
0.13638659948970266


In [61]:
import numpy as np
import pandas as pd
from scipy.stats import kstest, uniform

# Uses the DataFrame "tags_flow" built above, with columns "cluster_parent" and "weight_normalized"

# Kolmogorov-Smirnov statistic of a sample against the uniform distribution on [0, 1]
def ks_test_statistic(data):
    """Run a one-sample KS test of ``data`` against ``uniform.cdf`` and return only the test statistic."""
    result = kstest(data, uniform.cdf)
    return result.statistic

# Group the edges by "cluster_parent" and compute the KS statistic of each parent's
# normalized outgoing weights against the uniform distribution
grouped = tags_flow.groupby('cluster_parent')['weight_normalized'].apply(ks_test_statistic)

In [62]:
# Bar chart of the KS statistic per parent cluster, sorted ascending;
# x tick labels are aliased to each cluster's tag from most_defining.
fig = px.bar(grouped.reset_index().sort_values(by="weight_normalized"), 
             x="cluster_parent", y="weight_normalized")
fig.update_xaxes(labelalias=dict(dict(zip(most_defining.cluster,most_defining.tag))))
fig

In [63]:
def data_clusters(df, *, seed=None, **kwargs):
    """Build an interactive flow graph whose nodes are coloured by Louvain community.

    Communities are detected on the full (unfiltered) flow graph built from
    ``df``; the edges that are drawn are those kept by ``filter_flow``.

    Args:
        df: Edge frame with columns 'cluster_parent', 'cluster_child',
            'tag_parent', 'tag_child' and 'weight_normalized'.
        seed: Optional seed forwarded to ``louvain_communities`` so the
            community assignment (and therefore the colouring) is reproducible.
        **kwargs: Forwarded to ``filter_flow`` (parent_tags, child_tags, weight).

    Returns:
        The populated pyvis ``Network``; the caller renders it with ``net.show``.
    """
    from networkx import community

    G = nx.from_pandas_edgelist(df, source="cluster_parent", target="cluster_child", edge_attr="weight_normalized")
    communities = community.louvain_communities(G, weight="weight_normalized", seed=seed)
    # cluster label -> index of the community it belongs to
    community_of = {node: idx for idx, members in enumerate(communities) for node in members}
    basic_df = filter_flow(df, **kwargs)

    # colors: one palette entry per community
    palette = list(reversed(sns.color_palette("Spectral_r", len(communities)).as_hex()))

    net = Network(notebook=True, select_menu=True, filter_menu=True, cdn_resources='in_line',
                  height="750px", width="100%", bgcolor="#222222", font_color="white", directed=True)
    net.repulsion()

    edge_data = zip(basic_df['cluster_parent'], basic_df['cluster_child'],
                    basic_df['tag_parent'], basic_df['tag_child'],
                    basic_df['weight_normalized'])

    for parent_cluster, child_cluster, src, dst, w in edge_data:
        # Colour each endpoint by its OWN cluster's community. Colouring the child with the
        # parent's colour made a node's colour depend on which edge introduced it first.
        net.add_node(src, src, title=src, color=palette[community_of[parent_cluster]])
        net.add_node(dst, dst, title=dst, color=palette[community_of[child_cluster]])
        net.add_edge(src, dst, value=w)

    neighbor_map = net.get_adj_list()

    # add neighbor data to node hover data
    for node in net.nodes:
        neighbors = neighbor_map[node["id"]]
        node["title"] += " Neighbors:\n" + "\n".join(neighbors)
        node["value"] = len(neighbors)

    return net

    


In [65]:
# Community-coloured graph of the edges with normalized weight above 0.05 (written to example2.html).
net = data_clusters(tags_flow, weight=0.05)
net.show('example2.html')

DataFrame has 163 edges
example2.html
