In [1]:
import networkx as nx
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import random
import colorlover as cl
from IPython.display import HTML
from collections import Counter
import matplotlib.pyplot as plt
import operator
import re
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.data import find
random.seed(1)
init_notebook_mode(connected=True)

# Hashtag: "#facebook"

### Loading Twitter Data

In [2]:
!pwd

/d/Dropbox/Dropbox/#my-notebook/MIE1513/Lab7


In [3]:
df = pd.read_csv("tweets2009-06-0115.csv.zip", sep='\t', compression='zip')

In [4]:
print("Num of rows:", df.shape[0])

Num of rows: 3437690


In [5]:
df.head()

Unnamed: 0,date,user,tweet
0,2009-06-01 21:43:59,burtonator,No Post Title
1,2009-06-01 21:47:23,burtonator,No Post Title
2,2009-06-02 01:15:44,burtonator,No Post Title
3,2009-06-02 05:17:52,burtonator,No Post Title
4,2009-06-02 23:58:25,burtonator,No Post Title


## Quesiton 1 - #facebook
- Finding most common hash tags, and select #facebook to use
- Buide a table contianing the selected hashtag
- Add a column of comment relationship to the table

In [6]:
allTweets = df["tweet"].str.cat(sep=' ')
tweetWords = [word.strip(""" ,.:'\";""").lower() for word in allTweets.split()]
hashTags = [word for word in tweetWords if word.startswith("#")]
hashTagsCounter = Counter(hashTags)

In [7]:
hashTagsCounter.most_common(20)

[('#iranelection', 26853),
 ('#followfriday', 16400),
 ('#jobs', 13322),
 ('#iremember', 11057),
 ('#spymaster', 10587),
 ('#ff', 10446),
 ('#squarespace', 9198),
 ('#tcot', 7691),
 ('#fb', 6107),
 ('#cnnfail', 4451),
 ('#11thcommandment', 3429),
 ('#jtv', 3317),
 ('#140mafia', 3144),
 ('#iran', 2935),
 ('#', 2895),
 ('#news', 2837),
 ('#quote', 2750),
 ('#vampirebite', 2634),
 ('#1', 2587),
 ('#bsb', 2433)]

In [8]:
fbTag = df[df["tweet"].str.lower().str.contains("#facebook", na=False)].copy()

In [9]:
fbTag[fbTag['user']=='rondostar']

Unnamed: 0,date,user,tweet
1636186,2009-06-13 04:00:25,rondostar,RT @cymberly: 1 minute! #facebook
1658018,2009-06-13 04:22:54,rondostar,http://www.facebook.com/THE_REAL_SHAQ is still...
1675724,2009-06-13 04:42:28,rondostar,http://www.facebook.com/3OH!3 is still availab...
1689450,2009-06-13 04:58:50,rondostar,Common? Come on Common!?!? http://www.facebook...
1691176,2009-06-13 05:01:10,rondostar,RT @LevelTen_Colin: wondering if #facebook has...
1694020,2009-06-13 05:04:27,rondostar,Bloc Party still available - http://www.facebo...
1701632,2009-06-13 05:14:35,rondostar,T.O. http://www.facebook.com/terrellowens #fac...
1705039,2009-06-13 05:19:10,rondostar,RT @LevelTen_Colin: http://www.facebook.com/ke...
1709783,2009-06-13 05:24:43,rondostar,wow RT @LevelTen_Colin: http://www.facebook.co...
1709784,2009-06-13 05:24:43,rondostar,wow RT @LevelTen_Colin: http://www.facebook.co...


In [10]:
fbTag[fbTag['user']=='levelten_colin']

Unnamed: 0,date,user,tweet
1694377,2009-06-13 05:05:06,levelten_colin,http://www.facebook.com/dolphlundgren - still ...
1697541,2009-06-13 05:09:13,levelten_colin,my hometown - still available http://www.faceb...
1704195,2009-06-13 05:17:57,levelten_colin,http://www.facebook.com/kevinbacon - still ava...
1707199,2009-06-13 05:21:39,levelten_colin,http://www.facebook.com/chucknorris - still av...
1711127,2009-06-13 05:26:07,levelten_colin,RT @rondostar: Bruce Lee - http://www.facebook...
1711128,2009-06-13 05:26:07,levelten_colin,RT @rondostar: Bruce Lee - http://www.facebook...


In [11]:
def addMentionedColumn(df):
    
    def mentionsList(txt):
        allWords = [word.strip(""" ,.:'\";""").lower() for word in txt.split()]
        allNames = [word.strip("@") for word in allWords if word.startswith("@")]
        uniqueNames = list(set(allNames))
        return allNames
    
    df["mentioned"] = df["tweet"].apply(mentionsList)

In [12]:
addMentionedColumn(fbTag)

In [13]:
fbTag.head()

Unnamed: 0,date,user,tweet,mentioned
5639,2009-06-11 17:08:04,kwech,You have one and a half days to choose your Fa...,[]
21157,2009-06-11 17:26:38,vjt,"~ #EFF Launches TOSBack, a ""Terms of Service"" ...","[velenux, googlepolicyit]"
25643,2009-06-11 17:34:56,studiomds,@garyvee facebook vanity url alternative for t...,[garyvee]
28973,2009-06-11 17:41:25,nextmoon,"RT @StefanW: ""Social Media Addiction"" by @ jfo...","[stefanw, ]"
40133,2009-06-11 17:59:19,mikemdm,http://tr.im/oazp #facebook marketing book (fr...,[]


### Question 2 
- Build a mention graph

In [14]:
def mentionGraph(df):
    g = nx.DiGraph()
    
    for (index, date, user, tweet, mentionedUsers) in df.itertuples():
        for mentionedUser in mentionedUsers:
            if (user in g) and (mentionedUser in g[user]):
                g[user][mentionedUser]["numberMentions"] += 1
            else:
                g.add_edge(user, mentionedUser, {'numberMentions': 1})
    
    return g

In [15]:
fbGraph = mentionGraph(fbTag)

#### Generate random positions for nodes and store them at property 'pos'

In [16]:
def addRandomPositions(graph):
    
    posDict = dict((node,(random.gauss(0,20),random.gauss(0,2))) for node in graph.nodes())
    nx.set_node_attributes(graph,"pos", posDict)

In [17]:
addRandomPositions(fbGraph)

### Q2.(a)
- Show nodes and edges count in the mention graph
- nodes: 746
- edges: 542

In [18]:
# show # nodes and edges
print("# nodes:", len(fbGraph.nodes()))
print("# edges:", len(fbGraph.edges()))

# nodes: 746
# edges: 545


### Q2.(b) Graph nodes' degree histogram
- Among the 746 data, majority of the nodes have very small number of degree.i.e 573/746 have degree 1, 112/746 have degree 2. Only 13 nodes have degree higher than 5. The highest degree is 29.

In [19]:
# extract degree list of all nodes
degree_sequence = sorted(nx.degree(fbGraph).values(), reverse=True)

def degree_histo(degree_list):
    degree_sequence = degree_list
    trace1  = {"x":degree_sequence,"type":'histogram','opacity':0.75,'xbins':{'end':746.5,'start':-0.5}}
    degree_data = Data([trace1])
    layout = Layout(title='Directed Graph Degree Distribution',xaxis=dict(title='Node Degree'),yaxis=dict(type='log',autorange=True,title='Count (Log Scale)'))
    fig = Figure(data=degree_data,layout=layout)
    iplot(fig)
degree_histo(degree_sequence)

### Q2.(c) Top 4 edges with highest weights

In [20]:
def weight_dict(G):
    weight_dict = {}
    for (u,v) in G.edges():
        edgeWidth = G[u][v]['numberMentions']
        weight_dict[(u,v)] = edgeWidth
    return weight_dict

fb_edge_dict = weight_dict(fbGraph)

In [21]:
top_5_edge = sorted(fb_edge_dict.items(), key=lambda x: -x[1])[:5]

In [22]:
top_5_edge

[(('bullconsulting', 'pachanyc'), 5),
 (('rondostar', 'levelten_colin'), 4),
 (('babysherlene', 'laibcoms'), 4),
 (('thomasbregulla', 'photoshoptips'), 4),
 (('mayhemstudios', 'spndrm'), 4)]

In [23]:
fbGraph['rondostar']

{'cymberly': {'numberMentions': 1}, 'levelten_colin': {'numberMentions': 4}}

In [24]:
fbGraph['levelten_colin']

{'rondostar': {'numberMentions': 2}}

### Q2.(d) Visualization of the mention 


In [25]:
# map purd color scale to 300 cells
purd = cl.scales['9']['seq']['PuRd']
purd20 = cl.interp(purd, 20)
HTML(cl.to_html(purd20))

In [26]:
def plotNetworkWidthColor(graph):
    
    maxMention = max(weight_dict(graph).values())
    minMention = min(weight_dict(graph).values())
    
    scatters=[]

    for (node1, node2) in graph.edges():
        x0, y0 = graph.node[node1]['pos']
        x1, y1 = graph.node[node2]['pos']
        edgeWidth = graph[node1][node2]['numberMentions']
        edgeColor = int(19*(edgeWidth-minMention)/(maxMention-minMention))
        s = Scatter(
                x=[x0, x1],
                y=[y0, y1],
                hoverinfo='text',
                text="Edge: %s\Weight: %f" % ((node1,node2), edgeWidth),
                mode='lines', 
                line=Line(width=edgeWidth ,color=purd20[edgeColor]))
        scatters.append(s)



    for node in graph.nodes():
        xPos, yPos = graph.node[node]['pos']
        s = Scatter(
                x=[xPos], 
                y=[yPos], 
                hoverinfo='none',
                mode='marker', 
                marker=dict(
                    color="#888", 
#                     size=nx.degree(graph,node)*2,         
                    size=2,
                    line=dict(width=2)))
        scatters.append(s)
    
    layout = Layout(showlegend=False)
    fig = Figure(data=scatters, layout=layout)
    iplot(fig, show_link=False)

In [27]:
def applyLayout(graph, layoutFunc):
    posDict = layoutFunc(graph) 
    nx.set_node_attributes(graph, "pos", posDict)

In [28]:
fbGraphSpring_2d = fbGraph.copy()
applyLayout(fbGraphSpring_2d, nx.spring_layout)
plotNetworkWidthColor(fbGraphSpring_2d)

### Q3.(a)
- It seems the top tweet for hashtag #facebook are advertisements that asking people to follow them on their facebook pages.

In [29]:
fbTag['tweet'][:10]


5639     You have one and a half days to choose your Fa...
21157    ~ #EFF Launches TOSBack, a "Terms of Service" ...
25643    @garyvee facebook vanity url alternative for t...
28973    RT @StefanW: "Social Media Addiction" by @ jfo...
40133    http://tr.im/oazp #facebook marketing book (fr...
40735    Soon you'll have your custom #Facebook URL for...
40895    seesmic #twitter and #facebook evolved : http:...
66111    Oh, did I mention that #Mizzou has a fan page ...
82502    Just added myself to the http://wefollow.com t...
86858    you can pick a #FaceBook user name?! What the ...
Name: tweet, dtype: object

In [30]:
def getTopK(df, k, value_column='tweet', min_length = 2):
    stop = set(stopwords.words('english'))
    counter = Counter()
    for twt in df[value_column]:
        counter.update([word.lower() 
                        for word 
                        in tokenize.regexp.regexp_tokenize(twt, pattern='\w+') 
                        if word.lower() not in stop and word.isalpha() and len(word)>=min_length])
    topk = counter.most_common(k)
    return topk

In [31]:
top150 = getTopK(fbTag, 150)

In [32]:
top150

[('facebook', 2083),
 ('http', 1177),
 ('com', 773),
 ('rt', 473),
 ('www', 457),
 ('username', 302),
 ('ly', 297),
 ('bit', 273),
 ('url', 197),
 ('vanity', 162),
 ('twitter', 140),
 ('usernames', 103),
 ('get', 93),
 ('got', 90),
 ('fb', 89),
 ('name', 83),
 ('tinyurl', 63),
 ('new', 62),
 ('im', 55),
 ('page', 53),
 ('de', 49),
 ('available', 43),
 ('urls', 42),
 ('mashable', 41),
 ('socialmedia', 39),
 ('first', 39),
 ('fan', 38),
 ('minutes', 38),
 ('vanityurl', 34),
 ('via', 34),
 ('gd', 33),
 ('grab', 33),
 ('google', 32),
 ('tr', 32),
 ('go', 32),
 ('social', 31),
 ('like', 31),
 ('tonight', 30),
 ('time', 29),
 ('facebookvanity', 29),
 ('mine', 28),
 ('friends', 28),
 ('ich', 28),
 ('good', 27),
 ('bio', 26),
 ('last', 25),
 ('also', 25),
 ('ow', 24),
 ('media', 23),
 ('please', 23),
 ('auch', 23),
 ('profile', 22),
 ('facebooklandgrab', 22),
 ('work', 22),
 ('still', 22),
 ('one', 21),
 ('myspace', 21),
 ('bei', 21),
 ('added', 20),
 ('wefollow', 20),
 ('directory', 20),
 ('f

### Q3.(2)

In [33]:
def getTopK_user(df, k, label_value, label_column='user', operation=operator.eq, value_column='tweet',min_length=2):
    # return top k words for tweet of each user
    stop = set(stopwords.words('english'))
    counter = Counter()
    for twt in df.loc[operation(df[label_column],label_value)][value_column]:
            counter.update([word.lower() 
                            for word 
                            in tokenize.regexp.regexp_tokenize(twt, pattern='\w+') 
                            if word.lower() not in stop and len(word)>=min_length and word.isalpha()])
    topk = counter.most_common(k)
    return topk

In [34]:
def plotNetworkWidthColor_Top3(graph):
    
    maxMention = max(weight_dict(graph).values())
    minMention = min(weight_dict(graph).values())
    
    scatters=[]

    for (node1, node2) in graph.edges():
        x0, y0 = graph.node[node1]['pos']
        x1, y1 = graph.node[node2]['pos']
        edgeWidth = graph[node1][node2]['numberMentions']
        edgeColor = int(19*(edgeWidth-minMention)/(maxMention-minMention))
        s = Scatter(
                x=[x0, x1],
                y=[y0, y1],
                hoverinfo='text',
                text="Edge: %s\Weight: %f" % ((node1,node2), edgeWidth),
                mode='lines', 
                line=Line(width=edgeWidth ,color=purd20[edgeColor]))
        scatters.append(s)



    for node in graph.nodes():
        xPos, yPos = graph.node[node]['pos']
        top3 = getTopK_user(fbTag, k=3, label_value=node)
        s = Scatter(
                x=[xPos], 
                y=[yPos], 
                hoverinfo='text',
                text= 'Node: %s,Top words: %s' % (node,top3),
                mode='marker', 
                marker=dict(
                    color="#888", 
                    size=2,
                    line=dict(width=2)))
        scatters.append(s)
    
    layout = Layout(showlegend=False)
    fig = Figure(data=scatters, layout=layout)
    iplot(fig, show_link=False)

**Mention graph with 3 most common words for each user as hover text**

In [35]:
fbGraph_top3 = fbGraph.copy()
applyLayout(fbGraph_top3, nx.spring_layout)
plotNetworkWidthColor_Top3(fbGraph_top3)

### Q4.(a) Centrality Analysis
- Degree centrality
- PageRank centrality (optimized version of katz)


**Degree Centrality**

In [36]:
degree_centr = nx.degree_centrality(fbGraph)

** PageRank useage**

In [37]:
page_centr = nx.pagerank(fbGraph)

### Q4.(b) Visualizations
-** Visualize Degree Centrality**

In [38]:
purd300 = cl.interp(purd, 300)
HTML(cl.to_html(purd300))

In [39]:
def plotNetworkSizeColor(graph, centrality):
    maxCentr = max(centrality.values())
    minCentr = min(centrality.values())
    
    scatters=[]

    for (node1, node2) in graph.edges():
        x0, y0 = graph.node[node1]['pos']
        x1, y1 = graph.node[node2]['pos']
        edgeWidth = graph[node1][node2]['numberMentions']
        s = Scatter(
                x=[x0, x1],
                y=[y0, y1],
                hoverinfo='none',
                mode='lines', 
                line=Line(width=edgeWidth ,color='#888'))
        scatters.append(s)



    for node in graph.nodes():
        nodeCentr = centrality[node]
        nodeColor = int(299*(nodeCentr-minCentr)/(maxCentr-minCentr))
        xPos, yPos = graph.node[node]['pos']
        s = Scatter(
                x=[xPos], 
                y=[yPos], 
                text="User: %s\nCentrality: %.3f" % (node, nodeCentr),
                hoverinfo='text',
                mode='marker', 
                marker=dict(
                    color=purd300[nodeColor], 
                    size=nx.degree(graph,node)*1.5,         
                    line=dict(width=2)))
        scatters.append(s)
    
    layout = Layout(showlegend=False)
    fig = Figure(data=scatters, layout=layout)
    iplot(fig, show_link=False)

In [40]:
fbGraph_4b1 = fbGraph.copy()
applyLayout(fbGraph_4b1, nx.spring_layout)
plotNetworkSizeColor(fbGraph_4b1, degree_centr)

- **Visualize PageRank Centrality**

In [41]:
fbGraph_4b2 = fbGraph.copy()
applyLayout(fbGraph_4b2, nx.spring_layout)
plotNetworkSizeColor(fbGraph_4b2, page_centr)

### Q4.(c)
Description:
In both of the above centrality graphs, the darkness of node color is proportional to the centrality. For the purpose of comparison, the node size is propotional to the Degree_Centrality.
1. It can be shown that even the node with top2 centrality with 2 methods are same, but most of the results are different.
 * Reason: Compare to the baseline of degree centrality, which   only consider the number of direct neighbors, the PageRank centrality incorporate the idea of markov chain. It not only consider the direct neighbors, but also the other reachable nodes, and gives a measure of importance of the node.

2. PageRank producees more meaningful interpretation. Compares to degree centrality. PageRank gives a probability distribution that can be used to represent the importance of a certain node, after considering all the nodes in the graph.

### Q5.(a)

In [42]:
def plot_histo(degree_list, title,xtitle,ytitle,ylabel=None):
    degree_sequence = degree_list
    trace1  = {"x":degree_sequence,"type":'histogram','opacity':0.75,'xbins':{'end':746.5,'start':-0.5}}
    degree_data = Data([trace1])
    if ylabel == 'log':
        layout = Layout(title=title,xaxis=dict(title=xtitle),yaxis=dict(type='log',autorange=True,title=ytitle))
    else:
        layout = Layout(title=title,xaxis=dict(title=xtitle),yaxis=dict(autorange=True,title=ytitle))
    fig = Figure(data=degree_data,layout=layout)
    iplot(fig)

In [43]:
def mentionGraph_ud(df):
    g = nx.Graph()
    
    for (index, date, user, tweet, mentionedUsers) in df.itertuples():
        for mentionedUser in mentionedUsers:
            if (user in g) and (mentionedUser in g[user]):
                g[user][mentionedUser]["numberMentions"] += 1
            else:
                g.add_edge(user, mentionedUser, {'numberMentions': 1})
    
    return g

In [44]:
fbGraph_ud = mentionGraph_ud(fbTag)

In [45]:
fbGraph_5 = fbGraph_ud.copy()
applyLayout(fbGraph_5, nx.spring_layout)
plotNetworkWidthColor(fbGraph_5)

1) number of maximal cliques

In [46]:
num_maxCliques = nx.graph_number_of_cliques(fbGraph_ud)
num_maxCliques

502

2) The graphs's clique number

In [47]:
graph_clique = nx.graph_clique_number(fbGraph_ud)
graph_clique

3

3) Number of maximal cliques for each node

In [48]:
num_cliques = nx.number_of_cliques(fbGraph_ud)
plot_histo(list(num_cliques.values()), title='Unidirected Node Num of Maximal Cliques Distribution', xtitle='Number of Maximal Cliques',ytitle='Count (Log Scale)',ylabel='log')

4) Size of the largest maximal clique containing each given node

In [49]:
cliques_number = nx.node_clique_number(fbGraph_ud)
cliques_number
sorted(cliques_number.items(), key=lambda x: -x[1])[:10]

[('mmidas', 3),
 ('mashable', 3),
 ('hashsocial', 3),
 ('scottmonty', 3),
 ('cspenn', 3),
 ('the_beth', 3),
 ('adityarao310', 3),
 ('cincyrecruiter', 3),
 ('susanbeebe', 3),
 ('rmercader', 3)]

In [50]:
plot_histo(list(cliques_number.values()), title='Unidirected Node Clique Number Distribution', xtitle='Node Clique Number',ytitle='Count')

In [51]:
sum(x==3 for x in cliques_number.values())

36

In [52]:
sum(x==2 for x in cliques_number.values())

705

In [53]:
sum(x==1 for x in cliques_number.values())

5

### Q5.(b) Insights on the connectivity patterns
* To summarize the clique data in the first part. 
    * In my graph of 746 nodes and 542 edges. 
    * We have 502 distinct maximal cliques of size 1 or 2 or 3. 
    * Most of the nodes 705/746 have the largest clique size of 2, while only 36/746 have size of 3. And 1/746 has clique size of 1, meaning it's isolated from all other nodes/users. 
    * 606/746 nodes have only 1 maximal cliques, 96/746 have 2, and only 3 nodes have more than 10 maximal cliques.
* Insights
    * That means user in this hashtag netword is not strongly connected. 606 Nodes having 1 maximal cliques meaning that each of them only connects to another user, so that to form multiple isolated user pairs. There are 3 users have more tha n10 maximal cliques. That means they might be the centrer of certain topics related to this hashtag.
    * 36 nodes are in the maximal cliques of 3, that means the largest component after betweeness clustering will be 3. That is: the maximum size of group that knows and interact with each other is 3.