# Scalable Social Network Analysis

## Preparation: Run ONCE

```
activate py35 && pip install sparkmagic==0.11.2 && pip install colorlover &&  jupyter nbextension enable --py --sys-prefix widgetsnbextension
```

## Working with Spark

First, load the sparkmagic extension:

In [None]:
%load_ext sparkmagic.magics

To connect to your HDInsight Spark cluster:
* run "%manage_spark" to get the management panel
* Add Endpoint: your cluster address (format: "https://[myclustername].azurehdinsight.net/livy") and your chosen user/password
* Create Session: with language "python"
* Wait until your session is ready

In [None]:
%manage_spark

## Every spark cell has to start with %%spark

In [None]:
%%spark
sc

In [None]:
%%spark

# First, we load the csv into an RDD named tweetsCSV
tweetsCSV = sc.textFile("wasb://mie451datasets@mie451files.blob.core.windows.net/tweets2009-06-0115.csv")

In [None]:
%%spark

# Print to validate tweetsCSV
tweetsCSV

In [None]:
%%spark

# "take" the first 5 items
tweetsCSV.take(5)

In [None]:
%%spark

def isEnglish(s):
    """Tell whether the string ``s`` consists solely of ASCII characters.

    The notebook uses "ASCII-only" as a rough stand-in for "English text".

    Returns True when ``s`` can be encoded as ASCII, False when encoding
    raises UnicodeEncodeError.
    """
    try:
        s.encode('ascii')
        return True
    except UnicodeEncodeError:
        return False

# RDD transformations: parse the raw lines of tweetsCSV into 3-tuples of strings.
#   1. keep only the lines made of ASCII characters
#   2. split each line on tabs
#   3. keep rows with exactly three fields whose first field is not "date"
#   4. convert the three fields to str and pack them into a tuple
tweets = (
    tweetsCSV
    .filter(isEnglish)
    .map(lambda line: line.split("\t"))
    .filter(lambda fields: len(fields) == 3 and fields[0] != "date")
    .map(lambda fields: tuple(str(field) for field in fields))
)

In [None]:
%%spark
from pyspark.sql.types import *

# Schema for the dataframe: three string columns, each declared non-nullable
tweetsSchema = StructType(
    [StructField(columnName, StringType(), False) for columnName in ("date", "user", "tweet")]
)

# Build the data frame from the parsed tweets RDD using that schema
tweetsDF = sqlContext.createDataFrame(tweets, tweetsSchema)

In [None]:
%%spark

# Select-Where query (selecting the dates of all the tweets by user "burtonator")
tweetsDF.where(tweetsDF.user=="burtonator").select(tweetsDF.date).collect()

In [None]:
%%spark

# Register as a table to allow direct SQL queries
tweetsDF.registerTempTable("tweets")

### Use option "-c sql" to run sql queries on registered tables
### Use option "--maxrows X" to limit the results to X records

In [None]:
%%spark -c sql --maxrows 10
SELECT * FROM tweets

### Use option "-o dfname" to save the results into a Pandas dataframe named dfname to allow further processing in the notebook (e.g., with networkx)

In [None]:
%%spark -c sql -o firstTen --maxrows 10
SELECT * FROM tweets

In [None]:
firstTen

# Introduction to NetworkX

In [None]:
import networkx as nx

## Creating a graph

In [None]:
G = nx.Graph()

In [None]:
G.add_node(234)
G.add_node("hello")
G.add_edge(234,"hello")

In [None]:
print("Nodes:", G.nodes())
print("Edges:", G.edges())

In [None]:
G[234]

In [None]:
G['hello']

## Properties on edges

In [None]:
# Edge attributes are passed as keyword arguments: this form is accepted by
# both NetworkX 1.x and 2.x, whereas a positional attribute dict is rejected by 2.x.
G.add_edge('Alice', 'Bob', know=10, friends=5)

In [None]:
print("Nodes:", G.nodes())
print("Edges:", G.edges())

In [None]:
G['Bob']

In [None]:
G['Alice']

In [None]:
G['Bob']['Alice']['know'] += 1

In [None]:
G['Alice']

## Analyzing graphs

In [None]:
G.add_edge('Alice', 'Carlos')
G.add_edge('Carlos', 'Dave')
G.add_edge('Dave', 'Bob')
G.add_edge('Alice', 'Eve')

In [None]:
components = nx.connected_components(G)
list(components)

In [None]:
nx.degree(G)

In [None]:
nx.degree(G,'Bob')

In [None]:
nx.has_path(G, 'Alice', 'Dave')

In [None]:
nx.has_path(G, 'Alice', 'hello')

In [None]:
nx.shortest_path(G, 'Alice', 'Dave')

## Centrality

In [None]:
nx.degree_centrality(G)

In [None]:
nx.betweenness_centrality(G)

# Analyzing Twitter Data

## Finding most common hash tags (RDD)

In [None]:
%%spark

import re
# Words are separated by a space, comma, period or semicolon
wordTokenizerRegex = re.compile("[ ,.;]")

# flatMap splits the third field of every record into words and flattens the
# result into one big RDD of words; the filter then keeps only the hashtags
# (the words starting with "#").
hashTags = (
    tweets
    .flatMap(lambda record: wordTokenizerRegex.split(record[2]))
    .filter(lambda token: token.startswith("#"))
)

In [None]:
%%spark

# how many hashtags did we find
print(hashTags.count())

# examine the first 100
print(hashTags.take(100))

In [None]:
%%spark

# Pair every hashtag with 1, sum the ones per hashtag to get its number of
# occurrences, then sort the (hashtag, count) pairs by count, largest first.
countedHashTags = (
    hashTags
    .map(lambda tag: (tag, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)

In [None]:
%%spark

# show the top 100 hashtags (most frequent first)
countedHashTags.take(100)

## Filtering tweets with selected hashtag #redsox (DataFrame)

We use SQL queries to explore different tags and export the selected subset to a pandas dataframe (in the following case we choose "redsox")

In [None]:
%%spark -c sql --maxrows 10
SELECT * FROM tweets WHERE LOWER(tweet) LIKE "%#redsox%" 

In [None]:
%%spark -c sql -o onlyRedsox
SELECT * FROM tweets WHERE LOWER(tweet) LIKE "%#redsox%" 

In [None]:
# Verify pandas dataframe
onlyRedsox.head()

## IMPORTANT
Once you've got the required dataframe - save it (to avoid having to re-run spark again next time):

In [None]:
onlyRedsox.to_csv("backup-df-redsox.csv")

## IMPORTANT
After you save your df you can always reload it, instead of re-creating it using spark.
If you want to get the saved dataframe, just load it from disk as follows:

In [None]:
import pandas as pd

# DataFrame.from_csv is deprecated (and removed in pandas 1.0). read_csv with
# index_col=0 and parse_dates=True is its documented equivalent: the first
# column of the file is read back as the index.
redsox = pd.read_csv("backup-df-redsox.csv", index_col=0, parse_dates=True)

In [None]:
# Verify the dataframe that was just loaded from disk (redsox), rather than
# the onlyRedsox frame exported from Spark
redsox.head()

Now, we move to build the mention graph

In [None]:
def addMentionedColumn(df):
    """Add a "mentioned" column listing the users @-mentioned in each tweet.

    Reads df["tweet"] and stores the result in df["mentioned"]; the dataframe
    is modified in place and nothing is returned.

    For every tweet the text is split on whitespace, each word is stripped of
    surrounding punctuation and lower-cased, and the words starting with "@"
    are kept with the "@" signs removed from both ends. Each user is listed at
    most once per tweet, in order of first appearance, and a bare "@" (which
    would give an empty name) is ignored.
    """

    def mentionsList(txt):
        allWords = [word.strip(""" ,.:'\";""").lower() for word in txt.split()]
        allNames = [word.strip("@") for word in allWords if word.startswith("@")]

        # De-duplicate while keeping first-seen order (a plain set() would
        # make the order of the result arbitrary).
        uniqueNames = []
        seen = set()
        for name in allNames:
            if name and name not in seen:
                seen.add(name)
                uniqueNames.append(name)
        return uniqueNames

    df["mentioned"] = df["tweet"].apply(mentionsList)

In [None]:
addMentionedColumn(onlyRedsox)

In [None]:
onlyRedsox.head(10)

In [None]:
def mentionGraph(df):
    """Build an undirected graph linking each user to the users they mention.

    df must provide a "user" column and a "mentioned" column whose values are
    iterables of user names. Every (user, mentioned user) pair becomes an edge
    whose "numberMentions" attribute counts how many times that pair was seen.

    Returns the resulting networkx Graph.
    """
    g = nx.Graph()

    # Read the two needed columns by name rather than unpacking whole rows, so
    # extra or re-ordered columns in df do not break the loop.
    for user, mentionedUsers in zip(df["user"], df["mentioned"]):
        for mentionedUser in mentionedUsers:
            if g.has_edge(user, mentionedUser):
                g[user][mentionedUser]["numberMentions"] += 1
            else:
                # Keyword attributes are accepted by NetworkX 1.x and 2.x;
                # a positional attribute dict is rejected by 2.x.
                g.add_edge(user, mentionedUser, numberMentions=1)

    return g

In [None]:
redsoxGraph = mentionGraph(onlyRedsox)

In [None]:
print("# nodes:", len(redsoxGraph.nodes()))
print("# edges:", len(redsoxGraph.edges()))

In [None]:
redsoxGraph['shelley1005']

# Visualize Mention Graph

In [None]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
init_notebook_mode(connected=True)

### Generate random positions for nodes and store them at property "pos"

In [None]:
import random
def addRandomPositions(graph):
    """Give every node a random 2-D position stored in its "pos" attribute.

    Both coordinates are drawn independently from a Gaussian distribution with
    mean 0 and standard deviation 10. The graph is modified in place and
    nothing is returned.
    """
    posDict = {node: (random.gauss(0, 10), random.gauss(0, 10)) for node in graph.nodes()}
    # Keyword arguments on purpose: the positional order of `name` and
    # `values` was swapped between NetworkX 1.x and 2.x.
    nx.set_node_attributes(graph, name="pos", values=posDict)

In [None]:
addRandomPositions(redsoxGraph)

In [None]:
nx.get_node_attributes(redsoxGraph, 'pos')['shelley1005']

### Visualize using Plot.ly scatter plots

In [None]:
def plotNetwork(graph):
    """Draw the graph inline with Plotly: grey lines for edges, grey dots for nodes.

    Every node must carry a "pos" attribute holding its (x, y) coordinates.
    The figure is rendered with iplot; nothing is returned.
    """
    # get_node_attributes works on NetworkX 1.x and 2.x alike
    # (the graph.node accessor was removed in NetworkX 2.4).
    positions = nx.get_node_attributes(graph, 'pos')
    scatters = []

    # One line trace per edge, all drawn with the same width.
    for (node1, node2) in graph.edges():
        x0, y0 = positions[node1]
        x1, y1 = positions[node2]
        s = Scatter(
                x=[x0, x1],
                y=[y0, y1],
                hoverinfo='none',
                mode='lines',
                line=dict(width=1, color='#888'))
        scatters.append(s)

    # One marker trace per node.
    for node in graph.nodes():
        xPos, yPos = positions[node]
        s = Scatter(
                x=[xPos],
                y=[yPos],
                hoverinfo='none',
                # 'markers' is the valid scatter mode name ('marker' is not)
                mode='markers',
                marker=dict(
                    color="#888",
                    size=10,
                    line=dict(width=2)))
        scatters.append(s)

    layout = Layout(showlegend=False)
    fig = Figure(data=scatters, layout=layout)
    iplot(fig, show_link=False)

In [None]:
plotNetwork(redsoxGraph)

### Visualize using node size and edge width

In [None]:
def plotNetworkSize(graph):
    """Draw the graph with edge width and node size reflecting the data.

    Each edge is drawn with a width equal to its "numberMentions" attribute,
    and each node with a marker size of twice its degree. Every node must
    carry a "pos" attribute holding its (x, y) coordinates, and every edge a
    "numberMentions" attribute. The figure is rendered with iplot; nothing is
    returned.
    """
    # get_node_attributes works on NetworkX 1.x and 2.x alike
    # (the graph.node accessor was removed in NetworkX 2.4).
    positions = nx.get_node_attributes(graph, 'pos')
    scatters = []

    # One line trace per edge, because the line width is set per trace.
    for (node1, node2) in graph.edges():
        x0, y0 = positions[node1]
        x1, y1 = positions[node2]
        edgeWidth = graph[node1][node2]['numberMentions']
        s = Scatter(
                x=[x0, x1],
                y=[y0, y1],
                hoverinfo='none',
                mode='lines',
                line=dict(width=edgeWidth, color='#888'))
        scatters.append(s)

    # One marker trace per node.
    for node in graph.nodes():
        xPos, yPos = positions[node]
        s = Scatter(
                x=[xPos],
                y=[yPos],
                hoverinfo='none',
                # 'markers' is the valid scatter mode name ('marker' is not)
                mode='markers',
                marker=dict(
                    color="#888",
                    size=nx.degree(graph, node)*2,
                    line=dict(width=2)))
        scatters.append(s)

    layout = Layout(showlegend=False)
    fig = Figure(data=scatters, layout=layout)
    iplot(fig, show_link=False)

In [None]:
plotNetworkSize(redsoxGraph)

## Using *Colorlover* for colors

In [None]:
import colorlover as cl
from IPython.display import HTML

In [None]:
HTML(cl.to_html( cl.scales['9'] ))

In [None]:
# map purd color scale to 300 cells:
# take the 'PuRd' entry of the 9-color sequential scales, interpolate it to
# 300 colors with cl.interp, and display the interpolated scale as HTML
purd = cl.scales['9']['seq']['PuRd']
purd300 = cl.interp(purd, 300)
HTML(cl.to_html(purd300))

## Adding color and text based on centrality

In [None]:
def plotNetworkSizeColor(graph):
    """Draw the graph with node color and hover text based on closeness centrality.

    Each edge is drawn with a width equal to its "numberMentions" attribute.
    Each node is drawn with a marker size of twice its degree and a color
    picked from the module-level purd300 color scale: the node with the
    lowest closeness centrality gets the first color, the one with the highest
    gets the last. Hovering over a node shows its name and closeness value.

    Every node must carry a "pos" attribute holding its (x, y) coordinates,
    and every edge a "numberMentions" attribute. The figure is rendered with
    iplot; nothing is returned.
    """
    # Centrality of the graph being plotted (the argument, not a global graph).
    closenessCentr = nx.closeness_centrality(graph)
    # An empty graph has no centrality values; fall back to 0 so max()/min()
    # are not called on an empty sequence.
    maxCentr = max(closenessCentr.values()) if closenessCentr else 0.0
    minCentr = min(closenessCentr.values()) if closenessCentr else 0.0
    centrSpan = maxCentr - minCentr
    lastColor = len(purd300) - 1

    # get_node_attributes works on NetworkX 1.x and 2.x alike
    # (the graph.node accessor was removed in NetworkX 2.4).
    positions = nx.get_node_attributes(graph, 'pos')
    scatters = []

    # One line trace per edge, because the line width is set per trace.
    for (node1, node2) in graph.edges():
        x0, y0 = positions[node1]
        x1, y1 = positions[node2]
        edgeWidth = graph[node1][node2]['numberMentions']
        s = Scatter(
                x=[x0, x1],
                y=[y0, y1],
                hoverinfo='none',
                mode='lines',
                line=dict(width=edgeWidth, color='#888'))
        scatters.append(s)

    # One marker trace per node.
    for node in graph.nodes():
        nodeCentr = closenessCentr[node]
        if centrSpan > 0:
            nodeColor = int(lastColor*(nodeCentr-minCentr)/centrSpan)
        else:
            # All nodes have the same centrality: avoid dividing by zero
            nodeColor = 0
        xPos, yPos = positions[node]
        s = Scatter(
                x=[xPos],
                y=[yPos],
                # Plotly renders "<br>" as a line break in hover text; "\n" is not
                text="User: %s<br>Closeness: %.3f" % (node, nodeCentr),
                hoverinfo='text',
                # 'markers' is the valid scatter mode name ('marker' is not)
                mode='markers',
                marker=dict(
                    color=purd300[nodeColor],
                    size=nx.degree(graph, node)*2,
                    line=dict(width=2)))
        scatters.append(s)

    layout = Layout(showlegend=False)
    fig = Figure(data=scatters, layout=layout)
    iplot(fig, show_link=False)

In [None]:
plotNetworkSizeColor(redsoxGraph)

## Using NetworkX layouts

In [None]:
def applyLayout(graph, layoutFunc):
    """Compute node positions with layoutFunc and store them as the "pos" attribute.

    layoutFunc is called with the graph and must return a dict mapping each
    node to its position (e.g. nx.spring_layout). The graph is modified in
    place and nothing is returned.
    """
    posDict = layoutFunc(graph)
    # Keyword arguments on purpose: the positional order of `name` and
    # `values` was swapped between NetworkX 1.x and 2.x.
    nx.set_node_attributes(graph, name="pos", values=posDict)

### Spring layout

In [None]:
redsoxGraphSpring = redsoxGraph.copy()
applyLayout(redsoxGraphSpring, nx.spring_layout)
plotNetworkSizeColor(redsoxGraphSpring)

### Random layout

In [None]:
redsoxGraphRandom = redsoxGraph.copy()
applyLayout(redsoxGraphRandom, nx.random_layout)
plotNetworkSizeColor(redsoxGraphRandom)

### Circular layout

In [None]:
redsoxGraphCircular = redsoxGraph.copy()
applyLayout(redsoxGraphCircular, nx.circular_layout)
plotNetworkSizeColor(redsoxGraphCircular)

### Spectral layout

In [None]:
redsoxGraphSpectral = redsoxGraph.copy()
applyLayout(redsoxGraphSpectral, nx.spectral_layout)
plotNetworkSizeColor(redsoxGraphSpectral)