In [None]:
import pandas as pd
import numpy as np

import ast
import pickle

%matplotlib inline

In [None]:
# Folder holding the files of this experiment run.
path = 'output/test-2/'
# User groups produced by the first step (tab-separated). The columns
# `id_cluster`, `num_users` and `list_users` are read further down.
cluster_data = pd.read_csv(path+'1_step_user_groups.csv', sep='\t')

In [None]:
# Hashtag clusters produced by the first step (tab-separated). The `n_cluster`
# and `hashtags` columns are read further down; `hashtags` is handled there as a
# comma-separated string.
cluster_tags = pd.read_csv(path+'1_step_tags_clusters.txt', sep='\t')
cluster_tags.head()

In [None]:
def getCompleteUserlist(userlist, cluster_dim):
    """Extract the first element of every user entry serialized in ``userlist``.

    Parameters
    ----------
    userlist : str
        Python-literal text parsed with ``ast.literal_eval``. With several users
        it is a comma-separated sequence of tuples; with a single user it is one
        tuple, which therefore parses to the tuple itself.
    cluster_dim : int
        Number of users in the cluster; selects which of the two layouts above
        is expected.

    Returns
    -------
    list
        The first element of each user tuple, or an empty list when there is
        nothing to parse (blank text, or a non-text value such as NaN).
    """
    # An empty cluster is stored as blank text (read back as NaN by pandas):
    # literal_eval would raise on it, so answer with "no users" instead.
    if not hasattr(userlist, 'strip') or not userlist.strip():
        return []

    parsed = list(ast.literal_eval(userlist))
    if cluster_dim > 1:
        return [entry[0] for entry in parsed]
    # Single user: the parsed value is the user tuple itself.
    return [parsed[0]]

In [None]:
# Parse the serialized user list of every cluster into a Python list.
# The values are built with an explicit loop instead of DataFrame.apply(axis=1):
# apply infers the result shape from list-like return values, which may differ
# across pandas versions.
cluster_data['all_users'] = pd.Series(
    [getCompleteUserlist(users, dim)
     for users, dim in zip(cluster_data['list_users'], cluster_data['num_users'])],
    index=cluster_data.index,
)
cluster_data.head()

In [None]:
# Read the tag features table. Besides the feature columns, the `cluster` and
# `id_node` columns of this table are used further down.
features = pd.read_csv('tagsdata.csv')
features.head()

In [None]:
# Names of the feature columns: positions 1 to 8 of the features table.
header = features.columns[1:9].tolist()

In [None]:
def centroid(data):
    """Return the mean feature vector of the rows in ``data``.

    Only the columns named in the module-level ``header`` list are used.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to average; must contain every column listed in ``header``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per column in ``header``. All entries
        are NaN when ``data`` has no rows.
    """
    # Take the feature block as one float matrix instead of walking the rows.
    values = data[header].values.astype(float)
    if values.shape[0] == 0:
        # No rows: there is no meaningful mean (and nothing to divide by).
        return np.full(len(header), np.nan)
    return values.mean(axis=0)

In [None]:
def closest_tags(centroid, alltags, K):
    """Return the ``K`` rows of ``alltags`` closest to ``centroid``.

    Closeness is the Euclidean distance computed on the columns named in the
    module-level ``header`` list.

    Parameters
    ----------
    centroid : array-like
        Reference point, one value per column in ``header``.
    alltags : pandas.DataFrame
        Candidate rows. It is not modified.
    K : int
        Number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the ``K`` nearest rows sorted by ascending distance,
        with an additional ``distance`` column. The original index labels are
        preserved.
    """
    points = alltags[header].values.astype(float)
    target = np.asarray(centroid, dtype=float)
    distances = np.linalg.norm(points - target, axis=1)

    # assign() works on a copy: the caller's frame (often a slice of a bigger
    # table) does not silently gain a `distance` column.
    ranked = alltags.assign(distance=distances).sort_values(by='distance')
    return ranked[:K]

In [None]:
# Candidate tags for the extension: rows whose `cluster` value is -1 or 0.
# An explicit copy is taken because this frame is modified further down; without
# it pandas would be altering a slice of `features` (SettingWithCopyWarning).
tags_to_add = features[features['cluster'].isin([-1, 0])].copy()
tags_to_add.shape[0]

In [None]:
# For every cluster, select the 10 candidate tags closest to its centroid.
# A selected tag leaves the pool, so it is given to one cluster only.
# The pool is a private copy: `tags_to_add` stays intact and re-running this
# cell produces the same selection.
remaining_tags = tags_to_add.copy()
result = []
for c in cluster_data['id_cluster']:
    cluster_centroid = centroid(features[features['cluster'] == c])

    tags_2 = closest_tags(cluster_centroid, remaining_tags, 10)
    remaining_tags = remaining_tags.drop(tags_2.index)

    result.append((c, list(tags_2['id_node'])))

In [None]:
# One row per cluster: its id and the list of tags selected for its extension.
extended_cluster_tags = pd.DataFrame(result, columns=['id_cluster', 'hashtags'])

In [None]:
# Save the extension of every cluster, one line per cluster:
# "<cluster id>\t<comma-separated hashtags>".
with open('output/test-2/v2/2_step_tags_clusters.txt', 'w') as outfile:
    outfile.write('n_cluster\thashtags\n')
    # Ids and hashtag lists are walked together, row by row, instead of
    # filtering the whole frame again for each id.
    for c, cluster in zip(extended_cluster_tags['id_cluster'], extended_cluster_tags['hashtags']):
        outfile.write('{}\t{}\n'.format(c, ','.join(cluster)))

In [None]:
# Load the per-user tag collections saved for reuse across iterations/runs.
# Pickle files must be opened in binary mode, and the handle is closed as soon
# as the data is read.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('output/user-tags-list.pkl', 'rb') as pkl_file:
    usedtags = pickle.load(pkl_file)

In [None]:
# compute participation for the second set of tags extracted:
# for every user, the fraction of their tags found in each cluster extension.
# The tag set of a cluster does not depend on the user, so it is built only once.
extension_sets = [set(tags) for tags in extended_cluster_tags['hashtags']]

tempResult = []
for u in usedtags:
    u_vector = usedtags[u]

    if len(u_vector) > 0:
        u_result = [u] + [float(len(u_vector.intersection(c_vector))) / len(u_vector)
                          for c_vector in extension_sets]
    else:
        # A user without tags takes part in no cluster.
        u_result = [u] + [0 for _ in extension_sets]
    tempResult.append(tuple(u_result))

In [None]:
# Table layout: the username followed by one participation column per cluster id.
result_header = ['username']
result_header.extend(extended_cluster_tags['id_cluster'])
participationTable = pd.DataFrame(tempResult, columns=result_header)

In [None]:
# Preview the first rows of the participation table.
participationTable.head()

In [None]:
# Save the extension-only participation table without the row index.
participationTable.to_csv(
    'output/test-2/v2/2_step_user_participation.csv',
    index=False,
)

In [None]:
# Collect every user already assigned to a cluster by the first step.
first_step_users = set()
for userlist in cluster_data['all_users']:
    for u in userlist:
        # `all_users` holds the first element of each parsed user tuple. When that
        # element is a plain username, indexing it with [0] would keep only its
        # first character, so only real sequences are unpacked.
        first_step_users.add(u[0] if isinstance(u, (tuple, list)) else u)

In [None]:
# Save, for every cluster, the users reached only through the extension, ordered
# by decreasing participation: "<cluster id>\t<number of users>\t<(user, value), ...>".
with open('output/test-2/v2/2_step_user_groups.csv', 'w') as outfile:
    outfile.write('id_cluster\tnum_users\tlist_users\n')
    for i in extended_cluster_tags['id_cluster']:
        # sort_values returns a new frame; sorting the filtered selection in
        # place would act on a slice of the table (SettingWithCopyWarning).
        group = participationTable[participationTable[i] != 0.0].sort_values(by=i, ascending=False)

        # tolist() yields plain Python values, so the text written is unchanged.
        userlist = [(name, value)
                    for name, value in zip(group['username'].tolist(), group[i].tolist())
                    if name not in first_step_users]

        dim = len(userlist)
        outfile.write('{}\t{}\t{}\n'.format(i, dim, str(userlist).strip('[]')))

## Visualization of clusters and extension

In [None]:
# First-step figures per cluster: join the tag clusters with the user groups on
# the cluster id and keep the id together with the `dim` and `num_users` columns.
first_step = pd.merge(
    cluster_tags, cluster_data,
    left_on='n_cluster', right_on='id_cluster',
)[['id_cluster', 'dim', 'num_users']]

In [None]:
# Number of hashtags in the extension of every cluster. Mapping `len` over the
# column avoids building one row object per cluster and also works on an empty table.
extended_cluster_tags['dim_2'] = extended_cluster_tags['hashtags'].map(len)
extended_cluster_tags

In [None]:
# Count, for every cluster, the users reached only through the extension.
# First-step users are filtered out to measure how far the clusters are extended;
# in principle the participation is updated for all users and it increases for
# the ones of the first step.
# The cluster ids come from the data instead of a hard-coded range(1, 37), and no
# sorting is needed because only the number of users is kept.
is_new_user = ~participationTable['username'].isin(list(first_step_users))

user_2 = []
for i in extended_cluster_tags['id_cluster']:
    dim = int(((participationTable[i] != 0.0) & is_new_user).sum())
    user_2.append((i, dim))

In [None]:
# Put the first-step figures next to the extension ones: hashtags added per
# cluster (`dim_2`) and users reached per cluster (`num_users_2`).
second_step = (
    first_step
    .merge(extended_cluster_tags[['id_cluster', 'dim_2']], on='id_cluster')
    .merge(pd.DataFrame(user_2, columns=['id_cluster', 'num_users_2']), on='id_cluster')
)

In [None]:
# Index the summary by cluster id. The guard keeps the cell re-runnable: once
# `id_cluster` is the index it is no longer a column and set_index would fail.
if 'id_cluster' in second_step.columns:
    second_step = second_step.set_index('id_cluster')

In [None]:
# Preview the first rows of the per-cluster summary.
second_step.head()

In [None]:
# Give the columns plot-friendly labels. Renaming by name (rather than by
# position) does not depend on the column order and can safely be run again.
second_step = second_step.rename(columns={
    'dim': '#tag',
    'num_users': '#users',
    'dim_2': '#tag_2',
    'num_users_2': '#users_2',
})

In [None]:
# Grouped bars per cluster: tags and users of the first step next to the second one.
second_step.plot.bar(figsize=(16, 10), width=0.8)

Participation of the core users may increase after adding the extension

In [None]:
# compute participation for the overall hashtags cluster (core + extended)
# NB: it is possible that users do not reach a complete participation, since a
# lot of hashtags are still not included in any cluster

# Complete tag set of every cluster, built once because it does not depend on the
# user. The core hashtags are stored as one comma-separated string, so they must
# be split: calling set() on the string would produce its single characters.
complete_sets = []
for c, extension in zip(extended_cluster_tags['id_cluster'], extended_cluster_tags['hashtags']):
    core = cluster_tags[cluster_tags['n_cluster'] == c]['hashtags'].values[0]
    complete_sets.append(set(extension).union(core.split(',')))

tempResult = []
for u in usedtags:
    u_vector = usedtags[u]

    if len(u_vector) > 0:
        u_result = [u] + [float(len(u_vector.intersection(c_vector))) / len(u_vector)
                          for c_vector in complete_sets]
    else:
        # A user without tags takes part in no cluster.
        u_result = [u] + [0 for _ in complete_sets]
    tempResult.append(tuple(u_result))

In [None]:
# Table layout: the username followed by one participation column per cluster id.
result_header = ['username'] + list(extended_cluster_tags['id_cluster'])
participationTable = pd.DataFrame(
    tempResult,
    columns=result_header,
)

In [None]:
# Preview the first rows of the core + extension participation table.
participationTable.head()

In [None]:
# Save the core + extension participation table without the row index.
participationTable.to_csv(
    'output/test-2/v2/complete_user_participation.csv',
    index=False,
)

### Community Network Visualization 

In [None]:
# Output folder of this run (the same value assigned at the top of the notebook).
path = 'output/test-2/'

In [None]:
# Participation tables read back from disk: `part1` for the first step, `part2`
# for the extension step. Their cluster columns are addressed further down with
# string labels ('1', '2', ...).
part1 = pd.read_csv(path+'1_step_user_participation.csv')
part2 = pd.read_csv(path+'v2/2_step_user_participation.csv')

In [None]:
def _participation_edges(table, prefix, cluster_ids):
    """Build ``(cluster node, username, participation)`` edges from ``table``.

    One edge is produced for every user whose value in a cluster column is
    greater than zero. Cluster columns are addressed by the string form of the
    id, and the node is named ``'<prefix>_<id>'``.
    """
    found = []
    for cid in cluster_ids:
        column = str(cid)
        group = table[table[column] > 0.0]
        node = '{}_{}'.format(prefix, cid)
        # extend() grows the list in place; `edges = edges + new` would copy the
        # whole list at every cluster.
        found.extend(
            (node, name, weight)
            for name, weight in zip(group['username'].tolist(), group[column].tolist())
        )
    return found


# Edges between users and clusters: the first-step ("core") ones come first,
# followed by the ones of the extension.
edgetable = (_participation_edges(part1, 'core', range(1, 37))
             + _participation_edges(part2, 'extension', range(1, 37)))


In [None]:
# One unit-weight edge tying every core cluster node to its extension node.
clusteredges = list(
    ('core_{}'.format(cid), 'extension_{}'.format(cid), 1)
    for cid in range(1, 37)
)

In [None]:
# Full edge table: cluster-to-cluster links first, then the user edges.
edges = pd.DataFrame(
    clusteredges + edgetable,
    columns=['source', 'target', 'weight'],
)
edges.to_csv('output/test-2/edgetable_2.csv', index=False)

In [None]:
# Empty node table. NOTE(review): `nodes` is reassigned from pd.concat further
# down without this value being read, so this assignment looks redundant.
nodes = pd.DataFrame(columns=['id', 'type'])

In [None]:
def computeSize(cluster):
    """Return the number of hashtags of the cluster node described by ``cluster``.

    ``cluster['id']`` is expected to look like ``'<kind>_<number>'``. For an id
    containing ``'core'`` the comma-separated hashtags of ``cluster_tags`` are
    counted; for one containing ``'extension'`` the hashtag list stored in
    ``extended_cluster_tags`` is counted. Any other id yields ``None``.
    """
    node_id = cluster['id']
    cid = int(node_id.split('_')[1])

    if 'core' in node_id:
        core_rows = cluster_tags[cluster_tags['n_cluster'] == cid]
        core_text = core_rows['hashtags'].values[0]
        return len(core_text.split(','))

    if 'extension' in node_id:
        extension_rows = extended_cluster_tags[extended_cluster_tags['id_cluster'] == cid]
        return len(list(extension_rows['hashtags'].values[0]))

    return None

In [None]:
def computeLabel(cluster):
    """Return the comma-separated hashtags of the cluster node described by ``cluster``.

    ``cluster['id']`` is expected to look like ``'<kind>_<number>'``. For an id
    containing ``'core'`` the hashtag string of ``cluster_tags`` is returned as
    stored; for one containing ``'extension'`` the hashtag list stored in
    ``extended_cluster_tags`` is joined with commas. Any other id yields ``None``.
    """
    node_id = cluster['id']
    cid = int(node_id.split('_')[1])

    if 'core' in node_id:
        core_rows = cluster_tags[cluster_tags['n_cluster'] == cid]
        return core_rows['hashtags'].values[0]

    if 'extension' in node_id:
        extension_rows = extended_cluster_tags[extended_cluster_tags['id_cluster'] == cid]
        return ','.join(list(extension_rows['hashtags'].values[0]))

    return None

In [None]:
# Cluster nodes: every distinct edge source, tagged with the 'cluster' type.
n1 = pd.DataFrame(
    edges['source'].unique(),
    columns=['id'],
).assign(type='cluster')

In [None]:
# Size (number of hashtags) and label (the hashtags themselves) of every cluster node.
n1['size'] = n1.apply(computeSize, axis=1)
n1['label'] = n1.apply(computeLabel, axis=1)

In [None]:
# User nodes: one row per edge target, with unit size and the id used as label.
# The selection is copied before columns are added: assigning into a slice of
# `edges` raises SettingWithCopyWarning.
n2 = edges[['target']].copy()
n2.columns = ['id']
n2['type'] = 'user'
n2['size'] = 1
# The label is the id itself, so the column is copied directly.
n2['label'] = n2['id']

In [None]:
# Node table: cluster nodes followed by user nodes. Duplicate rows are dropped
# only in the file that is written; `nodes` itself keeps every row.
nodes = pd.concat([n1, n2])
nodes.drop_duplicates().to_csv(
    'output/test-2/nodetable_2.csv',
    index=False,
)

## Number of neighbor hashtags to consider

In [None]:
 usedtags = pickle.load(open('output/user-tags-list.pkl', 'r'))
def k_selection(max_K, tags_to_add):
    k_vec = range(10, max_K)
    added_users = []
    
    for k in k_vec:
        print k
        result = []
        for c in cluster_data['id_cluster']:
            cluster_centroid = centroid(features[features['cluster'] == c])

            tags_2 = closest_tags(cluster_centroid, tags_to_add, k)
            tags_to_add.drop(tags_2.index, inplace=True, axis=0)

            result.append(tuple((c, list(tags_2['id_node']))))
            
        extended_cluster_tags = pd.DataFrame(result, columns=['id_cluster', 'hashtags'])

        curr_added_users = []
        for c in extended_cluster_tags['id_cluster']:
            c_vector = set(extended_cluster_tags[extended_cluster_tags['id_cluster'] == c]['hashtags'].values[0])
            num_users = 0
            for u in usedtags:
                u_vector = usedtags[u]

                if len(u_vector)>0:
                    u_result = [u]
                    participation = float(len(u_vector.intersection(c_vector)))/len(u_vector)
                    
                    if participation > 0.0:
                        num_users += 1
                        
            curr_added_users.append(num_users)
        
        added_users.append(np.mean(curr_added_users))
        
    return added_users

In [None]:
# average number of users per cluster added in the second run, for k = 10..14
# The candidates are handed over as an explicit copy, so that no code ends up
# modifying a slice of `features`.
k_selection(15, features[features['cluster'].isin([-1, 0])].copy())