# Hashtag Culture Analysis

A hashtag (#) is a type of metadata tag used on social networks such as Twitter and other microblogging services. It lets users apply dynamic, user-generated tagging that helps other users easily find messages with a specific theme or content. We can borrow some basic principles from Network Science and graph theory to understand how hashtags on Instagram are connected.

# What aspects of Graph Theory can we use in our analysis?
<ul>
    <li><b>Community Detection</b>: We can use algorithms to identify and label clusters of topics/themes</li>
    <li><b>Degree Centrality/ Betweenness Centrality</b>: We can calculate what hashtags in the network are particularly important in linking the whole network.</li>
    <li><b>Visualization</b>: If we plot the network using scatterplots, it’s a very compelling way to visualise a huge amount of information about hashtags that would be cumbersome to do otherwise</li>
</ul>
    

In [101]:
import pandas as pd
import numpy as np

import networkx as nx
from node2vec import Node2Vec

In [83]:
hashtag_cooccurence_df = pd.read_csv("../data/instagram_hashtag_posts.csv")

In [84]:
# Drop rows with null values
hashtag_cooccurence_df = hashtag_cooccurence_df[hashtag_cooccurence_df['hashtags'].notna()]
hashtag_cooccurence_df.head()

Unnamed: 0.1,Unnamed: 0,post_url,hashtags
0,0,https://www.instagram.com/p/CQ--UmMH_i8/,"#sgfood, #sg, #sgfoodies, #hawkerculturesg, #k..."
1,1,https://www.instagram.com/p/CUhn24DKP0A/,"#lino, #linocutprint, #linocut, #linoprint, #l..."
2,2,https://www.instagram.com/p/CNUz7jPnpJK/,"#chinhogaixiao, #chinhojiak, #甄好吃, #甄好介绍, #sup..."
3,3,https://www.instagram.com/p/CZlBMnEPFcy/,"#hawkerculturesg, #coffeeshopsg"
4,4,https://www.instagram.com/p/COcNod6nOhv/,"#affordable, #awesomefood, #delicious, #explor..."


## Create Adjacency Matrix

In [85]:
occurence = hashtag_cooccurence_df["hashtags"].tolist()

In [86]:
# Create set of all unique hashtags

unique_hashtags = set()
for combination in occurence:
    combination = combination.split(", ")
    for hashtag in combination:
        unique_hashtags.add(hashtag)

In [87]:
unique_hashtags = list(unique_hashtags)
unique_hashtags

['#kasohsg',
 '#蛋香味',
 '#lazarusisland',
 '#coffeebrew',
 '#teamfnbsg',
 '#gluteniousrice',
 '#kopisg',
 '#usk',
 '#wontonmee',
 '#hawkercentre',
 '#成記肉骨茶',
 '#muralartist',
 '#bakersim',
 '#abcfood',
 '#hawkersg',
 '#eatingfortheinsta',
 '#sgfoodphotography',
 '#sengkee',
 '#BCM',
 '#singaporefoodporn',
 '#the1950scoffee',
 '#gracefulleeTanjongPagar',
 '#01-33,',
 '#angmokio',
 '#healthyeating',
 '#friedcarrotcake',
 '#jixiangangkukueh',
 '#yuanyangpeng',
 '#charsiew',
 '#印度煎饼',
 '#caifan',
 '#wearethestreets',
 '#SingaporeFood',
 '#covidlifestyle',
 '#caipng',
 '#amoy',
 '#tebangardensfoodcentre',
 '#watercolorsketchbook',
 '#agaragar',
 '#mlxg',
 '#eatbooksg',
 '#anoodlestory',
 '#linocutting',
 '#abcbrickworksfoodcentre',
 '#shiokfoodsg',
 '#discoverunder10k.',
 '#SupportLocalSg',
 '#CKT',
 '#abcmarket',
 '#japan',
 '#foodie',
 '#laupasat',
 '#thefoodieinitiative',
 '#cheecheongfun',
 '#instamagazine',
 '#sgbreakfast',
 '#whattoeat',
 '#壁アート探し',
 '#ondehondehcake',
 '#singaporelife

In [88]:
# Create adjecency dictionary, where
# adjacency_dict = k(#n, #n+1) => v: int(number of co-occurence across all posts)
adjacency_dict = {}
for hashtag in unique_hashtags:
     for hashtag2 in unique_hashtags:
            if (hashtag,hashtag2) not in adjacency_dict:
                adjacency_dict[(hashtag,hashtag2)] = 0

In [89]:
adjacency_dict

{('#kasohsg', '#kasohsg'): 0,
 ('#kasohsg', '#蛋香味'): 0,
 ('#kasohsg', '#lazarusisland'): 0,
 ('#kasohsg', '#coffeebrew'): 0,
 ('#kasohsg', '#teamfnbsg'): 0,
 ('#kasohsg', '#gluteniousrice'): 0,
 ('#kasohsg', '#kopisg'): 0,
 ('#kasohsg', '#usk'): 0,
 ('#kasohsg', '#wontonmee'): 0,
 ('#kasohsg', '#hawkercentre'): 0,
 ('#kasohsg', '#成記肉骨茶'): 0,
 ('#kasohsg', '#muralartist'): 0,
 ('#kasohsg', '#bakersim'): 0,
 ('#kasohsg', '#abcfood'): 0,
 ('#kasohsg', '#hawkersg'): 0,
 ('#kasohsg', '#eatingfortheinsta'): 0,
 ('#kasohsg', '#sgfoodphotography'): 0,
 ('#kasohsg', '#sengkee'): 0,
 ('#kasohsg', '#BCM'): 0,
 ('#kasohsg', '#singaporefoodporn'): 0,
 ('#kasohsg', '#the1950scoffee'): 0,
 ('#kasohsg', '#gracefulleeTanjongPagar'): 0,
 ('#kasohsg', '#01-33,'): 0,
 ('#kasohsg', '#angmokio'): 0,
 ('#kasohsg', '#healthyeating'): 0,
 ('#kasohsg', '#friedcarrotcake'): 0,
 ('#kasohsg', '#jixiangangkukueh'): 0,
 ('#kasohsg', '#yuanyangpeng'): 0,
 ('#kasohsg', '#charsiew'): 0,
 ('#kasohsg', '#印度煎饼'): 0,
 ('#k

In [90]:
occurence

['#sgfood, #sg, #sgfoodies, #hawkerculturesg, #koreanfoodsg, #singapore, #korean, #korea, #koreanfood, #koreanfoodlovers, #8dayseat, #food, #foodie, #foodporn, #foodstagram, #foodblogger, #foodie, #foodstylist, #foodpics, #foodprep, #foodblogger, #foodart, #foodgram, #travel, #travelphotography, #trending, #travelblogger, #travelgram, #training, #traveladdict, #travelpics',
 '#lino, #linocutprint, #linocut, #linoprint, #linoleumprint, #linocutting, #linoprinting, #print, #prints, #printmaking, #printstudio, #printmaker, #printmakingart, #printmakingstudio, #singapore, #singaporefood, #hawkerfood, #hawkerfoodsg, #hawkerculturesg, #sghawkerfood, #sghawkerculture, #sghawkers',
 '#chinhogaixiao, #chinhojiak, #甄好吃, #甄好介绍, #supporthawker, #supportlocal, #hawkerfoodsg, #hawkerfood, #hawkersg, #hawkerculturesg, #hawkercentre, #hawker, #hawkerculture, #ourhawkerculture, #singaporefood, #singaporefoodie, #singaporehawkerfood, #sgfoodie, #sgfood, #sghawker, #sghawkerculture, #singaporefoodculture

In [91]:
for combination in occurence:
    combination = combination.split(", ")
    for hashtag in combination:
        for hashtag2 in combination:
            if hashtag2!=hashtag:
                adjacency_dict[(hashtag,hashtag2)] += 1

In [92]:
# Remove records where 0 co occurence
temp_adjacency_dict = {}
for k,v in adjacency_dict.items():
    if v != 0:
        temp_adjacency_dict[k]=v
        
# Remove duplicates
# e.g. (hashtag1, hashtag2) is duplicate of (hashtag2, hashtag1)
cleaned_adjacency_dict = {}
for k,v in temp_adjacency_dict.items():
    if (k[1],k[0]) not in cleaned_adjacency_dict:
        cleaned_adjacency_dict[k] = v

In [93]:
cleaned_adjacency_dict

{('#kasohsg', '#teamfnbsg'): 1,
 ('#kasohsg', '#sgrestaurants'): 1,
 ('#kasohsg', '#sgeats'): 1,
 ('#kasohsg', '#dapaonation'): 1,
 ('#kasohsg', '#zicharnation'): 1,
 ('#kasohsg', '#sgfoodie'): 1,
 ('#kasohsg', '#savefnbsg'): 1,
 ('#kasohsg', '#sgfood'): 1,
 ('#kasohsg', '#hawker'): 1,
 ('#kasohsg', '#ourhawkerculture'): 1,
 ('#kasohsg', '#fnbsg'): 1,
 ('#kasohsg', '#sgheritage'): 1,
 ('#kasohsg', '#hawkerculturesg'): 1,
 ('#kasohsg', '#supportlocalsg'): 1,
 ('#kasohsg', '#sghistory'): 1,
 ('#kasohsg', '#sgfoodies'): 1,
 ('#蛋香味', '#hawkersg'): 1,
 ('#蛋香味', '#igsgfoodies'): 1,
 ('#蛋香味', '#sgfoodporn'): 1,
 ('#蛋香味', '#carrotcake'): 1,
 ('#蛋香味', '#sgigfoodies'): 1,
 ('#蛋香味', '#instafood_sg'): 1,
 ('#蛋香味', '#sgfoodblogger'): 1,
 ('#蛋香味', '#igsgfood'): 1,
 ('#蛋香味', '#sgigfood'): 1,
 ('#蛋香味', '#p2ha'): 1,
 ('#蛋香味', '#sgfoodtrend'): 1,
 ('#蛋香味', '#supporthawkersg'): 1,
 ('#蛋香味', '#eggy'): 1,
 ('#蛋香味', '#好吃'): 1,
 ('#蛋香味', '#whati8today'): 1,
 ('#蛋香味', '#wokhei'): 1,
 ('#蛋香味', '#foodstagram'):

In [115]:
source = []
target = []
edge = []
for k,v in cleaned_adjacency_dict.items():
    edge.append(v)
    source.append(k[0])
    target.append(k[1])

In [116]:
df = pd.DataFrame(list(zip(source, target, edge)),
               columns =['source', 'target', 'edge'])

In [124]:
adjacency_matrix = nx.from_pandas_edgelist(df, edge_attr=True)

## Simulate walks with node2vec

In [138]:
node2vec = Node2Vec(adjacency_matrix, dimensions=30, walk_length=5, num_walks=200, workers=4)
model = node2vec.fit(window=10, min_count=1)

Computing transition probabilities:   0%|          | 0/1247 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 50/50 [00:04<00:00, 11.28it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:03<00:00, 14.13it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:05<00:00,  9.65it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:04<00:00, 11.96it/s]


In [139]:
vocab, vectors =  model.wv.key_to_index, model.wv.vectors

# get node name and embedding vector index.
name_index = np.array([(v[0], v[1]) for v in vocab.items()])

# init dataframe using embedding vectors and set index as node name
node2vec_output = pd.DataFrame(vectors[name_index[:,1].astype(int)])
node2vec_output.index = name_index[:,0]

In [140]:
node2vec_output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
#hawkerculturesg,-0.204844,0.270728,0.244294,-0.296544,0.158725,0.063700,0.047933,-0.299630,-0.578772,-0.338073,...,0.149182,0.351860,0.326264,0.228629,0.066685,0.018542,-0.111635,0.361855,0.380692,0.049929
#hawkerfood,-0.299551,0.185702,0.887134,-0.646640,0.018703,-0.089562,-0.061510,-0.364893,-1.098514,-0.199707,...,0.150654,-0.116567,-0.322258,0.468862,0.142534,0.134794,0.135924,0.600070,-0.260466,0.102501
#sgfood,-0.139456,-0.041303,0.851110,0.291800,0.515831,0.278948,-0.429143,0.261057,-1.212820,-0.330664,...,0.477293,-0.250924,-0.253094,0.676679,0.360398,0.198562,-0.525799,-0.043093,0.490954,0.080965
#hawkerfoodsg,-0.913714,0.264768,1.271017,0.095255,0.331039,-0.173593,-0.103294,-0.430525,-0.841618,-0.208013,...,0.146499,-0.424013,0.264196,0.621345,0.301998,0.711786,-0.131675,0.365870,0.332542,0.211643
#hawkerculture,0.031048,-0.069613,1.094995,-0.176134,0.153722,-0.419647,-0.099322,-0.342333,-0.361356,0.513227,...,1.102867,0.214964,0.154081,0.750166,-0.173208,0.196903,0.390355,0.430910,-0.149107,0.054479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
#veg_littledragon,-0.157626,-0.054043,-0.003150,0.307282,0.029861,0.270337,-0.459179,-0.469106,-0.675568,-0.129649,...,0.100427,-0.444460,0.221923,0.357956,-0.579807,-0.045898,-0.526492,0.409518,-0.167484,-0.025009
#donburi,0.291313,0.045218,0.135236,0.330100,-0.176356,0.288729,0.048892,0.136298,-0.368244,-0.244438,...,0.211356,-0.033635,0.024387,0.772804,-0.308160,0.271309,-0.794378,-0.082874,0.205658,-0.260805
#boonlayplace,-0.106644,-0.295728,0.451058,0.053661,0.174091,-0.157648,0.323508,-0.417935,-0.066104,-0.411618,...,0.012117,0.306540,-0.145931,0.435973,0.094409,0.302921,0.066376,-0.398631,-0.171816,-0.738419
#jurongpoint,-0.182637,-0.159962,0.144775,0.534656,0.198151,-0.066275,0.372510,-0.163962,-0.185658,-0.277456,...,-0.120523,0.539699,-0.035967,0.728591,0.193685,0.273445,0.004951,-0.197641,-0.224097,-0.413017
