# Mapping the brain(hack)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from preprocess_data import *

	"grid.linewidth: white
"
	in file "/home/mboos/.config/matplotlib/stylelib/mb.mplstyle"
	Key grid.linewidth: Could not convert "white" to float
  styles = read_style_directory(stylelib_path)


## Data loading
Our data comes as a dictionary: each key is the username of a follower of the Brainhack Warsaw account, and the corresponding value is a list of that user's own followers.
As the simplest basis for a similarity measure between Twitter users, we compute a matrix of binary indicators: each row corresponds to a user, each column corresponds to a potential _follower_, and an entry indicates whether the user in that row is followed by the follower in that column.

In [2]:
import json  # made explicit: otherwise `json` is only in scope via the star import from preprocess_data

# Read the follower data collected so far (a JSON object keyed by username).
with open('../data/data_so_far.json', 'r') as fl:
    raw_follower_data = json.load(fl)

# Keep only users whose value is not null and convert every follower id to str.
# The ids are materialised as lists: on Python 3 a bare map() is a one-shot
# iterator, which would be silently empty if the helper below (or any later
# cell) iterated over the same entry a second time.
data_dict = {
    username: [str(follower) for follower in followers]
    for username, followers in raw_follower_data.items()
    if followers is not None
}

# compute_sparse_matrix_of_followers comes from preprocess_data; the cells
# below rely on it returning a sparse user-by-follower indicator matrix.
sparse_mat, vocabulary = compute_sparse_matrix_of_followers(data_dict)

## The simplest similarity
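We now compute how many followers are shared between any two users by a simple matrix multiplication. To make the counts comparable between users with many and few followers, each count is then divided by the smaller of the two users' follower counts.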

In [3]:
# Entry (i, j) is the dot product of the follower rows of users i and j, i.e.
# the number of followers they share when sparse_mat holds binary indicators.
# The diagonal therefore holds each user's own follower count.
# .toarray() yields a plain ndarray instead of the legacy np.matrix that
# .todense() returns; np.diag(shared_followers) in later cells works on both.
shared_followers = sparse_mat.dot(sparse_mat.T).toarray()

# Per-user follower counts, extracted once instead of calling np.diag inside
# a double Python loop.
follower_counts = shared_followers.diagonal()

# NOTE: despite its name, max_followers[i, j] is the *smaller* of the two
# users' follower counts (the largest overlap the pair could possibly have).
# The name is kept so the notebook's variable names stay unchanged.
max_followers = np.minimum.outer(follower_counts, follower_counts)

# Fraction of the smaller follower set that is shared. Pairs where one user
# has no followers at all would divide by zero; they get similarity 0 instead
# of NaN.
normalized_followers = np.divide(
    shared_followers,
    max_followers,
    out=np.zeros(shared_followers.shape, dtype=float),
    where=max_followers > 0,
)

## Embedding it in a space
We now embed all followers of Brainhack Warsaw in a two dimensional space.
For this we use dimensionality reduction to project individual users in a space that groups users with a high number of shared followers closer together than users with a low number.

In [4]:
import umap
# normalized_followers is a *similarity* (1 on the diagonal, larger = more
# shared followers), but metric='precomputed' makes UMAP interpret its input
# as pairwise *distances*. Passing the similarity directly would place users
# with many shared followers far apart, so convert it to a distance first.
follower_distance = 1.0 - np.asarray(normalized_followers, dtype=float)
# Undefined similarities (NaN from a zero follower count) count as maximally distant.
follower_distance = np.where(np.isnan(follower_distance), 1.0, follower_distance)
follower_distance = np.clip(follower_distance, 0.0, 1.0)
np.fill_diagonal(follower_distance, 0.0)  # every user is at distance 0 from itself

# random_state fixes UMAP's stochastic initialisation/optimisation so that
# re-running the notebook reproduces the same layout.
shared_embedding = umap.UMAP(
    n_components=2,
    min_dist=0.0,
    metric='precomputed',
    n_neighbors=10,
    random_state=42,
).fit_transform(follower_distance)

## Visualizing the space
We now visualize it.

In [7]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import minmax_scale


from bokeh.io import output_file, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import plasma
from bokeh.plotting import figure
from bokeh.transform import transform

# Hover labels: the usernames in sorted order.
# NOTE(review): this assumes the rows of the embedding follow the sorted
# usernames -- confirm against compute_sparse_matrix_of_followers.
labels = sorted(data_dict.keys())

# Marker size encodes each user's follower count (diagonal of the
# shared-follower matrix), rescaled to the range 5..30.
marker_sizes = minmax_scale(np.diag(shared_followers), (5, 30))
embedding_x = shared_embedding[:, 0]
embedding_y = shared_embedding[:, 1]

source = ColumnDataSource(data={
    'x': embedding_x,
    'y': embedding_y,
    'z': marker_sizes,
    'desc': labels,
})

# Show the username when hovering over a point.
hover = HoverTool(tooltips=[('User', '@desc')])

# Fill colour follows the vertical embedding coordinate.
mapper = LinearColorMapper(
    palette=plasma(256),
    low=embedding_y.min(),
    high=embedding_y.max(),
)

p = figure(plot_width=700, plot_height=500, tools=[hover], title="Shared followers")
p.circle('x', 'y', size='z', source=source, fill_color=transform('y', mapper))

output_notebook()
show(p)



## How much do Brainhackers talk about/with each other on Twitter?

We now want to analyze some preliminary tweet data.

### TODO

explain how to download the data

In [56]:
import glob
import pandas as pd
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of the DataFrame (and everything derived from it, such as the
# order of unique accounts) stable between runs.
tweet_files = sorted(glob.glob('../data/tweets_*.json'))
if not tweet_files:
    # Without this check an empty match only surfaces further down as an
    # opaque KeyError on the 'text' column.
    raise FileNotFoundError("No tweet files found matching '../data/tweets_*.json'")

# format_tweets (from preprocess_data) yields a sequence of records per file;
# flatten them into one list of records.
filtered_tweet_data = [
    tweet
    for tweet_file in tweet_files
    for tweet in format_tweets(tweet_file)
]
all_tweets_df = pd.DataFrame(filtered_tweet_data)

# Add a column with the mentions that parse_mentions extracts from each tweet text.
all_tweets_df['mentions'] = all_tweets_df['text'].apply(parse_mentions)

## Extracting the number of mentions between users

In [17]:
# no_mentions[i, j] = how many times tweets collected for account i mention
# account j. Mentions of anyone who is not one of the collected accounts are
# ignored.
unique_users = all_tweets_df['account'].unique()

# Position of every account in unique_users, so rows/columns can be looked up
# directly instead of re-scanning the whole DataFrame once per user and
# rebuilding indices with np.where/np.squeeze.
user_index = {user: idx for idx, user in enumerate(unique_users)}

no_mentions = np.zeros((unique_users.shape[0], unique_users.shape[0]))
for account, mentions in zip(all_tweets_df['account'], all_tweets_df['mentions']):
    row = user_index.get(account)
    if row is None:
        continue
    # Each entry of the 'mentions' column is iterated as a collection of
    # mentioned usernames (as produced by parse_mentions).
    for mentioned in mentions:
        col = user_index.get(mentioned)
        if col is not None:
            no_mentions[row, col] += 1

## Visualizing the graph structure of the number of mentions

In [20]:
import umap
# no_mentions holds mention *counts* (larger = more interaction, 0 = none) and
# is not symmetric, but metric='precomputed' makes UMAP treat its input as
# pairwise *distances*. Fed directly, users who never mention each other would
# look identical (distance 0). Convert the counts to a symmetric distance:
# interaction in either direction brings two users closer together.
mention_strength = no_mentions + no_mentions.T
mention_distance = 1.0 / (1.0 + mention_strength)  # 1 for no interaction, -> 0 for many mentions
np.fill_diagonal(mention_distance, 0.0)  # every user is at distance 0 from itself

# random_state makes the stochastic layout reproducible across runs.
embedding = umap.UMAP(
    n_components=2,
    min_dist=0.001,
    metric='precomputed',
    n_neighbors=5,
    random_state=42,
).fit_transform(mention_distance)

import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import minmax_scale


from bokeh.io import output_file, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import plasma
from bokeh.plotting import figure
from bokeh.transform import transform

# Scatter plot of the mention-based embedding; hovering shows the username.
labels = unique_users
source = ColumnDataSource(data=dict(x=embedding[:,0], y=embedding[:,1], desc=labels))
hover = HoverTool(tooltips=[
    ('User', '@desc'),
])
# Fill colour follows the vertical embedding coordinate.
mapper = LinearColorMapper(palette=plasma(256),
                           low=embedding[:,1].min(), high=embedding[:,1].max())

# The title previously read "Shared followers" (copied from the follower plot),
# although this figure shows the embedding computed from mention counts.
p = figure(plot_width=700, plot_height=500, tools=[hover], title="Mentions between Brainhackers")
p.circle('x', 'y', source=source,
         fill_color=transform('y', mapper))

output_notebook()
show(p)

## Extracting the number of retweets

In [57]:
# no_of_retweets[i, j] = number of rows collected for account i whose 'user'
# value is account j. Rows whose 'user' is not one of the collected accounts
# are ignored.
# NOTE(review): 'user' is presumably the original author of the (re)tweet --
# confirm against format_tweets in preprocess_data.
unique_users = all_tweets_df['account'].unique()

# Direct lookup of each account's matrix position; replaces one full-DataFrame
# scan per user plus np.unique, which has to sort the 'user' column.
user_index = {user: idx for idx, user in enumerate(unique_users)}

no_of_retweets = np.zeros((unique_users.shape[0], unique_users.shape[0]))
for account, author in zip(all_tweets_df['account'], all_tweets_df['user']):
    row = user_index.get(account)
    col = user_index.get(author)
    if row is None or col is None:
        continue
    no_of_retweets[row, col] += 1

In [68]:
# no_of_retweeted[i, j] = number of rows collected for account j whose 'user'
# value is account i, counting only rows where account and 'user' differ
# (so the diagonal stays zero).
# NOTE(review): 'user' is presumably the original author of the (re)tweet --
# confirm against format_tweets in preprocess_data.
unique_users = all_tweets_df['account'].unique()

# Direct lookup of each account's matrix position instead of one
# full-DataFrame scan per user.
user_index = {user: idx for idx, user in enumerate(unique_users)}

no_of_retweeted = np.zeros((unique_users.shape[0], unique_users.shape[0]))
for account, author in zip(all_tweets_df['account'], all_tweets_df['user']):
    if account == author:
        continue  # a user's own tweets are not retweets by someone else
    row = user_index.get(author)
    col = user_index.get(account)
    if row is None or col is None:
        continue
    no_of_retweeted[row, col] += 1

In [None]:
import networkx as nx
import umap
import plotly.plotly as py
import plotly.graph_objs as go

# Build an undirected graph with one node per account; every non-zero entry of
# no_of_retweeted becomes a weighted edge. from_numpy_array is used because
# from_numpy_matrix was removed in networkx 3.0.
G = nx.from_numpy_array(no_of_retweeted)

# no_of_retweeted holds retweet *counts* (larger = more interaction) and is
# not symmetric, but metric='precomputed' makes UMAP treat its input as
# pairwise *distances*. Convert the counts into a symmetric distance so that
# accounts that retweet each other a lot end up close together.
retweet_strength = no_of_retweeted + no_of_retweeted.T
retweet_distance = 1.0 / (1.0 + retweet_strength)  # 1 for no interaction, -> 0 for many retweets
np.fill_diagonal(retweet_distance, 0.0)  # every account is at distance 0 from itself

# random_state makes the stochastic layout reproducible across runs.
embedding = umap.UMAP(
    n_components=2,
    min_dist=0.001,
    metric='precomputed',
    n_neighbors=25,
    random_state=42,
).fit_transform(retweet_distance)

# Collect the line-segment coordinates for all edges first and build the trace
# once. Growing edge_trace['x'] / ['y'] by tuple concatenation re-creates the
# whole tuple for every edge.
edge_x = []
edge_y = []
for start_node, end_node in G.edges():
    x0, y0 = embedding[start_node]
    x1, y1 = embedding[end_node]
    # The trailing None separates consecutive segments so they are not joined.
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5,color='#888'),
    hoverinfo='none',
    mode='lines')

labels = unique_users

# Marker colour: number of neighbours of each node in G. Computed up front
# instead of appending to node_trace['marker']['color'] one tuple at a time.
neighbour_counts = [len(neighbours) for _, neighbours in G.adjacency()]

# NOTE(review): marker size is taken from the diagonal of no_of_retweets (rows
# where 'account' and 'user' coincide), not from no_of_retweeted, which defines
# the graph -- confirm this is intended.
marker_sizes = minmax_scale(np.diag(no_of_retweets), (5,20))

node_trace = go.Scatter(
    x=embedding[:,0],
    y=embedding[:,1],
    text=labels,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='Bluered',
        reversescale=True,
        color=neighbour_counts,
        size=marker_sizes,
        colorbar=dict(
            thickness=15,
            title='Retweeted by',
            xanchor='left',
            titleside='right'
        ),
        line=dict(width=2)))

In [None]:
# Axis styling shared by both axes: no grid, no zero line, no tick labels.
hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)

# Placeholder annotation anchored to the paper coordinates of the figure.
corner_annotation = dict(
    showarrow=False,
    xref="paper", yref="paper",
    x=0.005, y=-0.002,
)

layout = go.Layout(
    title='<br>How much do Brainhackers retweet each other?',
    titlefont=dict(size=16),
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20, l=5, r=5, t=40),
    annotations=[corner_annotation],
    xaxis=dict(hidden_axis),
    yaxis=dict(hidden_axis),
)

# Edges are drawn first so the node markers sit on top of them.
fig = go.Figure(data=[edge_trace, node_trace], layout=layout)

py.iplot(fig, filename='retweets')

In [25]:
import umap
# no_of_retweets holds *counts* (larger = more interaction, 0 = none) and is
# not symmetric, but metric='precomputed' makes UMAP treat its input as
# pairwise *distances*. Fed directly, accounts that never interact would look
# identical (distance 0). Convert the counts to a symmetric distance so that
# interaction in either direction brings two accounts closer together.
interaction_strength = no_of_retweets + no_of_retweets.T
interaction_distance = 1.0 / (1.0 + interaction_strength)  # 1 for no interaction, -> 0 for many
np.fill_diagonal(interaction_distance, 0.0)  # every account is at distance 0 from itself

# random_state makes the stochastic layout reproducible across runs.
embedding = umap.UMAP(
    n_components=2,
    min_dist=0.001,
    metric='precomputed',
    n_neighbors=5,
    random_state=42,
).fit_transform(interaction_distance)

import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import minmax_scale


from bokeh.io import output_file, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import plasma
from bokeh.plotting import figure
from bokeh.transform import transform

# Hover labels: one username per embedded point.
labels = unique_users

# Marker size encodes the diagonal of the retweet matrix, rescaled to 5..30.
point_sizes = minmax_scale(np.diag(no_of_retweets), (5, 30))
coord_x = embedding[:, 0]
coord_y = embedding[:, 1]

source = ColumnDataSource(data={
    'x': coord_x,
    'y': coord_y,
    'z': point_sizes,
    'desc': labels,
})

# Show the username when hovering over a point.
hover = HoverTool(tooltips=[('User', '@desc')])

# Fill colour follows the vertical embedding coordinate.
mapper = LinearColorMapper(
    palette=plasma(256),
    low=coord_y.min(),
    high=coord_y.max(),
)

p = figure(plot_width=700, plot_height=500, tools=[hover], title="How connected are brainhackers?")
p.circle('x', 'y', size='z', source=source, fill_color=transform('y', mapper))

output_notebook()
show(p)