In [54]:
import pandas as pd
import numpy as np

import snap

import requests
import re

In [2]:
clusters = pd.read_csv('output/final/1_step_tags_clusters.txt', sep='\t')
clusters.head()

Unnamed: 0,n_cluster,dim,hashtags
0,-1,917,
1,0,370,"colour,sunday,friday,weekend,coffee,monday,tex..."
2,1,23,"sydney,keychain,australia,streetstyle,bali,fas..."
3,2,39,"turquoise,earrings,necklace,jewellery,accessor..."
4,3,16,"positano,amalficoast,capri,ravello,amalfi,sorr..."


## Participation

In [3]:
# extract list of tags for each cluster
clusterTags = {}
for c in range(clusters.shape[0]-1):
    c_tags = clusters[clusters['n_cluster'] == c]['hashtags'].values[0].split(',')
    clusterTags[c] = set(c_tags)

In [35]:
# extract list of tags for each post
path = 'data/test-3/'

# read input network
t_net = snap.LoadEdgeListNet(path+'hashtag_network.csv', '\t')

usedtags = {}
userposts = {}

it = t_net.BegNI()
V = t_net.GetNodes()
for i in range(V):
    nid = it.GetId()
    type = t_net.GetStrAttrDatN(nid, 'type')

    if type == 'post':
        id_post = t_net.GetStrAttrDatN(nid, 'id')
        
        taglist = []
        nodeIt = t_net.GetNI(nid)
        for t in range(nodeIt.GetOutDeg()):
            tid = nodeIt.GetOutNId(t)
            tagname = t_net.GetStrAttrDatN(tid, 'content')
            taglist.append(tagname)
            
        usedtags[id_post] = set(taglist)
        
    elif type == 'user':
        username = t_net.GetStrAttrDatN(nid, 'content')
        postlist = []
        nodeIt = t_net.GetNI(nid)
        for t in range(nodeIt.GetOutDeg()):
            pid = nodeIt.GetOutNId(t)
            id_post = t_net.GetStrAttrDatN(pid, 'id')
            postlist.append(id_post)
            
        userposts[username] = set(postlist)
    it.Next()

In [28]:
# compute participation of each post in each cluster 
postPart = {}
for p in usedtags:
    p_vector = usedtags[p]
    
    if len(p_vector)>0:
        postPart[p] = {}
        for c in range(clusters.shape[0]-1):
            c_vector = clusterTags[c]
            participation = float(len(p_vector.intersection(c_vector)))/len(c_vector)
            postPart[p][c] = participation

In [29]:
participationTablePost = pd.DataFrame.from_dict(postPart, orient='index')

In [30]:
participationTablePost.to_csv('output/final/1_step_post_particiapation.csv')

In [38]:
# compute participation of user based on post participation and check differences 
# with directly computing the participation
userPart = {}
for u in userposts.keys():
    u_posts = userposts[u]
    userPart[u] = {}
    for c in range(clusters.shape[0]-1):
        userPart[u][c] = 0
        n_relevant_posts = len(u_posts)
        for p in u_posts:
            try:
                userPart[u][c] = userPart[u][c] + postPart[p][c]
            except KeyError:
                n_relevant_posts -= 1 # post without hashtags
        
        if n_relevant_posts > 0:
            userPart[u][c] = userPart[u][c]/n_relevant_posts
        

In [39]:
participationTable = pd.DataFrame.from_dict(userPart, orient='index')
participationTable.to_csv('output/final/1_step_user_particiapation_freq.csv')

In [40]:
sorted(userPart['laputenas'].items(), key=lambda (k, v): v, reverse=True)

[(8, 0.03571428571428571),
 (5, 0.020833333333333332),
 (2, 0.00641025641025641),
 (0, 0.0),
 (1, 0.0),
 (3, 0.0),
 (4, 0.0),
 (6, 0.0),
 (7, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0)]

## Visualization of resulting graph

Connect each user to a max of 3 communities

In [41]:
MAX_COMM = 3
outpath = 'output/final/'

In [42]:
edgetable = []


for u in userPart.keys():
    u_part = userPart[u]
    best_communities = sorted(u_part.items(), key=lambda (k, v): v, reverse=True)[:MAX_COMM]
    
    userlist = [tuple((u, bc[0], bc[1])) for bc in best_communities if bc[1]>0]
    edgetable = edgetable + userlist

In [43]:
edges = pd.DataFrame(edgetable, columns=['source','target','weight'])
edges.to_csv(outpath + 'user_cluster_edges_viz.csv', index=None)

In [44]:
# size of cluster is the number of hashtags
def computeSize(cluster):
    return  clusters[clusters['n_cluster'] == int(cluster)]['dim'].values[0]

# label for now is the set of hashtags (need to manual define a name if possible)
def computeLabel(cluster):
    return  clusters[clusters['n_cluster'] == int(cluster)]['hashtags'].values[0]


In [45]:
nodes = pd.DataFrame(columns=['id', 'type'])

# cluster nodes
n1 = pd.DataFrame(edges['target'].unique(), columns=['id'])
n1['type'] = 'cluster'
n1['size'] = n1['id'].apply(lambda x: computeSize(x))
n1['label'] = n1['id'].apply(lambda x: computeLabel(x))

# user nodes
n2 = edges[['source']]
n2['type'] = 'user'
n2.columns = ['id', 'type']
n2['size'] = 1
n2['label'] = n2.apply(lambda x: x['id'], axis=1)

nodes = pd.concat([n1, n2])
nodes.drop_duplicates().to_csv(outpath + 'user_cluster_nodes_viz.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


## Extraction of top images

In [18]:
def get_img_url(postUrl):
    page = requests.get(url)
    try:
        return re.findall(r'=[\S]*.jpg', page.text)[0][2:] # 640, 750, 1080 are the three possible urls, they start with ="
    except Exception as e:
        print e, postUrl
        return None

In [56]:
TOP_IMG = 5
cluster_pic_path = '../../../img/'

In [16]:
posts = pd.read_csv('raw-data/post.csv', sep='\t')
posts.head()

Unnamed: 0,id_post,username,video_count,url_img,link_post,owner,caption,comment_count,taken_at_timestamp,taken_at_time,shortcode,is_video,likes_count
0,1636842385063323219,gxgcollective,0,https://scontent-mxp1-1.cdninstagram.com/t51.2...,https://www.instagram.com/p/Ba3O3oTAKZT,2964555295,Some beautiful luxe pieces have arrived - hand...,1.0,1509347000.0,2017-10-30 08:00:39,Ba3O3oTAKZT,False,10.0
1,1636667639059249501,gxgcollective,0,https://scontent-mxp1-1.cdninstagram.com/t51.2...,https://www.instagram.com/p/Ba2nIvZgPld,2964555295,It’s a scorcher today - wish we were here 🌞🔥...,0.0,1509326000.0,2017-10-30 02:13:28,Ba2nIvZgPld,False,15.0
2,1635907116583816031,gxgcollective,0,https://scontent-mxp1-1.cdninstagram.com/t51.2...,https://www.instagram.com/p/Baz6Nrpgwtf,2964555295,Weekend summer vibes - Shop gxgcollective.com ...,0.0,1509235000.0,2017-10-29 02:02:26,Baz6Nrpgwtf,False,5.0
3,1635853795965600449,gxgcollective,0,https://scontent-mxp1-1.cdninstagram.com/t51.2...,https://www.instagram.com/p/BazuFw9AQLB,2964555295,Morning x #gxgcollective,0.0,1509229000.0,2017-10-29 00:16:30,BazuFw9AQLB,False,6.0
4,1635593889484344162,gxgcollective,0,https://scontent-mxp1-1.cdninstagram.com/t51.2...,https://www.instagram.com/p/Bayy_oLg4ti,2964555295,gxgcollective.com - New arrivals - A touch of ...,0.0,1509198000.0,2017-10-28 15:40:07,Bayy_oLg4ti,False,10.0


In [46]:
participationTablePost.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
1417719112704094679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1417720206520100400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1417721423774016136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1417723489285569906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1417726356804600801,0.0,0.0,0.0,0.0,0.035714,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
topImg = {}
for c in range(clusters.shape[0]-1):
    top_img_cluster = list(participationTablePost[c].sort_values(ascending=False)[:TOP_IMG].index)
    topImg[c] = top_img_cluster

In [59]:
for c in range(clusters.shape[0]-1):
    top_img_cluster = list(participationTablePost[c].sort_values(ascending=False).index)[:10]
    
    img_found = 0
    for pid in top_img_cluster:
        url = posts[posts['id_post'] == pid]['link_post'].values[0]
        username = posts[posts['id_post'] == pid]['username'].values[0]
        image_url = get_img_url(url)
        
        if image_url is not None:
            img_data = requests.get(image_url).content
            with open(cluster_pic_path + 'c{}_{}_{}.jpg'.format(c, username, pid), 'wb') as handler:
                handler.write(img_data)
            
            img_found += 1
            
        if img_found == TOP_IMG:
            break

list index out of range https://www.instagram.com/p/BP2Hh98DGd7
list index out of range https://www.instagram.com/p/BSp38tQB6sl
list index out of range https://www.instagram.com/p/BSW6dM_h7kg
