# Twitter dataset analysis with Infomap

## Run pipelines

In [1]:
# IMPORTS
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import matplotlib.pyplot as plt
import seaborn as sns
from orchestrator import Orchestrator
from analysis_helper import AnalysisHelper

INFO - datasources.tw.tw - load config file
INFO - datasources.tw.proxy_provider - getting proxy list
INFO - datasources.tw.proxy_provider - reading proxy list json file
DEBUG - datasources.tw.tw_api - INIT Tw api
DEBUG - datasources.tw.tw_premium_api - INIT Tw premium api
INFO - datasources.tw.tw - INIT Tw
DEBUG - matplotlib.pyplot - Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [2]:
# LOGGING LEVEL
# Raise the threshold of the root logger to CRITICAL so that lower-severity records
# (like the INFO/DEBUG lines printed while the imports ran) stay out of the cell outputs.
import logging
logger = logging.getLogger()  # no name passed: this is the root logger
logger.setLevel(logging.CRITICAL)

In [3]:
# PATHS
# Input and output folders are resolved against the directory the notebook runs in.
PROJECT_NAME = 'uk_healthcare_infomap_small'
PROJECT_PATH = os.path.abspath('')
INPUT_PATH, OUTPUT_PATH = (
    os.path.join(PROJECT_PATH, folder) for folder in ('input/', 'output/')
)

In [4]:
# PIPELINE
o = Orchestrator(PROJECT_NAME, INPUT_PATH, OUTPUT_PATH)
o.execute()
ah = AnalysisHelper(o.datasources)

## Analysis

### Context and graph analysis


#### Context summary

In [5]:
# One row per context: date range, location and tracked hashtags.
display(ah.get_contexts())

Unnamed: 0_level_0,start_date,end_date,location,hashtags
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
16-days-of-action-2018,2018-11-25,2018-12-10,United Kingdom,#16days #16daysofaction #16daysofactiontoolkit
elf-day,2018-12-03,2018-12-12,United Kingdom,#elfday #elfday2018


#### Context's graphs

In [6]:
# Graph-level metrics, one row per context.
graphs = ah.get_graphs()
display(graphs.describe())  # summary statistics across the contexts
display(graphs)             # the per-context values themselves

Unnamed: 0,no_nodes,no_edges,avg_degree,avg_weighted_degree,density,strongly_conn_components,avg_clustering,assortativity,scc_over_nodes
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,380.5,392.5,2.076,2.204,0.0025,375.5,0.055,-0.157,0.9865
std,21.92031,61.51829,0.442649,0.417193,0.000707,26.162951,0.06364,0.035355,0.012021
min,365.0,349.0,1.763,1.909,0.002,357.0,0.01,-0.182,0.978
25%,372.75,370.75,1.9195,2.0565,0.00225,366.25,0.0325,-0.1695,0.98225
50%,380.5,392.5,2.076,2.204,0.0025,375.5,0.055,-0.157,0.9865
75%,388.25,414.25,2.2325,2.3515,0.00275,384.75,0.0775,-0.1445,0.99075
max,396.0,436.0,2.389,2.499,0.003,394.0,0.1,-0.132,0.995


Unnamed: 0_level_0,no_nodes,no_edges,avg_degree,avg_weighted_degree,density,connected,strongly_conn_components,avg_clustering,assortativity,scc_over_nodes
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
16-days-of-action-2018,396,349,1.763,1.909,0.002,False,394.0,0.01,-0.132,0.995
elf-day,365,436,2.389,2.499,0.003,False,357.0,0.1,-0.182,0.978


* *# nodes*: number of users.
* *# edges*: number of relations between users.
* *avg degree*: average number of edges per node.
* *avg weighted degree*: average sum of edge weights per node.
* *density*: expresses how sparse the adjacency matrix is (i.e. how likely it is that a node has edges). 0 for a graph without edges and 1 for a complete graph.

\begin{align}
&\begin{aligned}
d = \frac{m}{n(n-1)}
\end{aligned}\\
&\begin{aligned}
m = \text{# edges}
\end{aligned}\\
&\begin{aligned}
n = \text{# nodes}
\end{aligned}
\end{align}

* *connected*: for each pair of nodes there exists a path that connects them.
* *strongly connected components*: groups of nodes in which, for each pair of nodes, there exists a path that connects them. Can be broadly related to the concept of community.
* *avg clustering*: average for each node of the fraction of possible triangles through that node that exist.
* *assortativity*: measures how much nodes are likely to connect to nodes with the same degree (>0) or with a different degree (<0).

### Community Detection analysis

#### Partitions summary

In [35]:
# For every context show the per-community metrics followed by their summary statistics.
for context_name, partition in ah.get_partitions():
    print(f'context "{context_name}":')
    display(partition)
    display(partition.describe())

context "16-days-of-action-2018":


Unnamed: 0_level_0,no_nodes,no_edges,avg_degree,avg_weighted_degree,density,connected,strongly_conn_components,avg_clustering,assortativity
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,14,16,2.2857,2.2857,0.0879,True,14,0.0,-0.5494
1,12,23,3.8333,4.8333,0.1742,True,12,0.0,-0.1245
2,4,3,1.5,1.5,0.25,True,4,0.0,
3,4,3,1.5,1.5,0.25,True,4,0.0,
4,25,37,2.96,3.2,0.0617,True,25,0.0,-0.4423
5,15,14,1.8667,1.8667,0.0667,True,15,0.0,
6,4,4,2.0,3.5,0.3333,True,4,0.0,
7,10,9,1.8,1.8,0.1,True,10,0.0,
8,8,8,2.0,2.0,0.1429,True,8,0.0,
9,7,6,1.7143,1.7143,0.1429,True,7,0.0,


Unnamed: 0,no_nodes,no_edges,avg_degree,avg_weighted_degree,density,strongly_conn_components,avg_clustering,assortativity
count,29.0,29.0,29.0,29.0,29.0,29.0,29.0,6.0
mean,7.206897,7.724138,1.992683,2.262697,0.211152,7.172414,0.014369,-0.439417
std,4.61658,7.170856,0.527777,0.73091,0.101574,4.598672,0.077379,0.318518
min,4.0,3.0,1.5,1.5,0.0617,4.0,0.0,-1.0
25%,4.0,4.0,1.6,1.7778,0.1429,4.0,0.0,-0.522625
50%,5.0,6.0,1.8667,2.0,0.2,5.0,0.0,-0.39795
75%,8.0,8.0,2.0,2.5,0.25,8.0,0.0,-0.213425
max,25.0,37.0,3.8333,4.8333,0.5,25.0,0.4167,-0.1245


context "elf-day":


Unnamed: 0_level_0,no_nodes,no_edges,avg_degree,avg_weighted_degree,density,connected,strongly_conn_components,avg_clustering,assortativity
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,17,24,2.8235,2.9412,0.0882,True,15,0.1213,-0.7309
1,27,74,5.4815,5.4815,0.1054,True,27,0.302,-0.1155
2,30,38,2.5333,2.8,0.0437,True,27,0.2233,-0.7593
3,5,4,1.6,1.6,0.2,True,5,0.0,
4,5,4,1.6,3.2,0.2,True,5,0.0,-0.5774
5,4,3,1.5,1.5,0.25,True,4,0.0,
6,10,12,2.4,2.4,0.1333,True,9,0.2445,-0.4082
7,5,4,1.6,1.6,0.2,True,5,0.0,-0.5774
8,8,9,2.25,2.25,0.1607,True,8,0.1458,-0.6248
9,4,4,2.0,2.0,0.3333,True,4,0.0,-0.5774


Unnamed: 0,no_nodes,no_edges,avg_degree,avg_weighted_degree,density,strongly_conn_components,avg_clustering,assortativity
count,29.0,29.0,29.0,29.0,29.0,29.0,29.0,14.0
mean,7.862069,9.965517,2.081186,2.149614,0.217762,7.62069,0.077917,-0.509821
std,7.351988,14.811874,0.973151,0.998248,0.091767,6.961187,0.14489,0.175608
min,4.0,3.0,1.5,1.5,0.0385,4.0,0.0,-0.7593
25%,4.0,3.0,1.5,1.5,0.1667,4.0,0.0,-0.5774
50%,5.0,4.0,1.6,1.6667,0.25,5.0,0.0,-0.5774
75%,7.0,9.0,2.25,2.4,0.25,7.0,0.1213,-0.43115
max,30.0,74.0,5.4815,5.4815,0.4667,27.0,0.531,-0.1155


For each context provide all the communities and the related metrics.

**important**: if the community detection algorithm fails to provide a community, the whole network is considered as a single community.

In [36]:
# Per-metric aggregate (count, max, mean, min, std) of the community metrics shown above.
ah.get_partitions_aggregated()

Unnamed: 0_level_0,count,max,mean,min,std
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
assortativity,10.0,-0.12,-0.47,-1.0,0.25
avg_clustering,29.0,0.53,0.05,0.0,0.11
avg_degree,29.0,5.48,2.04,1.5,0.75
avg_weighted_degree,29.0,5.48,2.21,1.5,0.86
density,29.0,0.5,0.21,0.04,0.1
no_edges,29.0,74.0,8.84,3.0,10.99
no_nodes,29.0,30.0,7.53,4.0,5.98
strongly_conn_components,29.0,27.0,7.4,4.0,5.78


Aggregation of the partition metrics presented above.

The average number of communities per context is given by the count column (ignore the assortativity row: NaN values are not counted, which alters its figures).

In [39]:
# Ratio of communities to nodes for each context, then its summary statistics
# rounded to two decimals.
community_over_nodes_ratio = ah.community_over_nodes_ratio()
display(community_over_nodes_ratio)
display(community_over_nodes_ratio.describe().round(decimals=2))

Unnamed: 0_level_0,community/no_nodes ratio
name,Unnamed: 1_level_1
16-days-of-action-2018,0.14
elf-day,0.13


Unnamed: 0,community/no_nodes ratio
count,2.0
mean,0.14
std,0.01
min,0.13
25%,0.13
50%,0.14
75%,0.14
max,0.14


For each context determine the ratio of found communities over the number of nodes chosen to be part of a community.

#### Partition quality metrics

In [41]:
# For every context show min/max/avg/std of each partition quality measure.
for context_name, pquality in ah.get_pquality():
    print(f'context "{context_name}":')
    display(pquality)

context "16-days-of-action-2018":


Unnamed: 0_level_0,min,max,avg,std
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
internal_density,0.030833,0.25,0.105576,0.04991
edges_inside,3.0,37.0,7.724138,7.046136
normalized_cut,0.0,0.113328,0.015863,0.035748
avg_degree,1.5,3.833333,1.992682,0.518605
fomd,0.0,0.5,0.190142,0.150454
expansion,0.0,0.25,0.030022,0.068706
cut_ratio,0.0,0.00122,0.000148,0.000337
conductance,0.0,0.111111,0.015479,0.034931
max_odf,0.0,2.0,0.344828,0.543035
avg_odf,0.0,0.25,0.051746,0.083929


context "elf-day":


Unnamed: 0_level_0,min,max,avg,std
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
internal_density,0.019231,0.233333,0.108881,0.045087
edges_inside,3.0,74.0,9.965517,14.554257
normalized_cut,0.0,0.437946,0.082049,0.118848
avg_degree,1.5,5.481482,2.081182,0.956222
fomd,0.038462,0.5,0.308601,0.129298
expansion,0.0,1.2,0.208169,0.32268
cut_ratio,0.0,0.005381,0.000935,0.001448
conductance,0.0,0.428571,0.080302,0.116491
max_odf,0.0,31.0,1.551724,5.593014
avg_odf,0.0,1.2,0.244951,0.355935


Survey of community measures: https://github.com/Lab41/survey-community-detection 
* *Internal density*: number of edges (ms) in subset S divided by the total number of possible edges between all nodes (ns(ns-1)/2). Higher the better
* *Average degree*: average internal degree across all nodes (ns) in subset S. Higher the better.
* *FOMD (Fraction over median degree)*: Determines the number of nodes that have an internal degree greater than the median degree of nodes in Subset S. Higher the better.
* *Expansion*: can be thought as “External Degree”. Measure of separability. Lower the better.
* *Cut Ratio*: This metric is a measure of separability and can be thought of as "External Density". Lower the better.
* *Conductance*: Ratio of edges inside the cluster to the number of edges leaving the cluster (captures surface area to volume). Higher the better.
* *Normalized Cut*: Represents how well subset S is separated from graph G. Combines Conductance with the fraction of external edges over all non-community edges. Higher the better.
* *Maximum ODF (Out Degree Fraction)*: Fraction of external connections to internal connections for each node (ns) in S. It then returns the fraction with the highest value. Lower the better.
* *Average ODF*: same as Maximum ODF but takes the average. Lower the better.
* *Flake-ODF*: Fraction of the number of nodes that have fewer internal connections than external connections to the number of nodes (ns) in subset S. Higher the better.

#### Cumulative sum of degree distribution

In [None]:
# NOTE(review): `results` is not assigned in any cell visible above (the executed cells go
# through the `ah` instance instead) -- confirm its source before a Restart & Run All.
AnalysisHelper.plot_compare_cumsum_deg_dist(results);

Cumulative sum of degree distribution shows how the degree is distributed with respect to the number of nodes.

* Nodes with a lower degree (left on the plot) are typically way more than the ones with a high degree (right on the plot).

* Above the dotted line the number of nodes is above the mean (and, conversely, below the mean under the line), as per z-score.

* Z-score normalization for the number of nodes has been chosen because it preserves the range (maximum and minimum) and introduces the dispersion of the series (standard deviation / variance).

\begin{align}
P(k) = \frac{\text{# nodes with degree >= k}}{\text{# nodes}}
\end{align}

The cumulative sum of the degree distribution is more robust than the plain degree distribution because it is less sensitive to small counts.

#### Summary stats for communities

In [None]:
# NOTE(review): depends on `results`, which no visible cell defines -- confirm where it is
# produced, otherwise this cell fails on a fresh kernel.
AnalysisHelper.communities_summary_stats(results)

Measures to assess the goodness of the partitions:
* *Degenerated context ratio*:
\begin{align}
\text{degenerated_context_ratio} = \frac{\text{# degenerated contexts}}{\text{# contexts}}
\end{align}
* *Good context ratio*: which is the reverse of the degenerated context ratio.
\begin{align}
\text{good_context_ratio} = \frac{\text{# good contexts}}{\text{# contexts}}
\end{align}
* *Average communities per context*: the average number of communities for number of contexts. Only considers good contexts.
\begin{align}
\text{avg_communities_per_good_context} = \frac{\sum{\text{# good communities}}}{\text{# good contexts}}
\end{align}
* *Average sociable users ratio*: the average, over the contexts, of the ratio between the users that belong to a community and all the users found in the context. Only considers good contexts.
\begin{align}
\text{avg_sociable_users_ratio} =
\frac{\sum{\frac{\text{# users in communities in context}}{\text{# all users in context}}}}
{\text{# good contexts}}
\end{align}

*note*: a "degenerated context" is a context for which the community detection algorithm hasn't found any community. A degenerated context therefore yields a single large community which includes all the nodes of the network graph. A "good context" is the reverse: any context for which the community detection algorithm has found at least one community.

## Shared users analysis

#### Shared users and community detection

In [None]:
# Users shared between contexts, with the count of each `is_present` value shown first.
# NOTE(review): `results` is not defined in any visible cell; also, the `shared_nodes` name
# is re-bound to a different frame by the "Shared users" cell further down.
shared_nodes = AnalysisHelper.compare_common_nodes(results)
display(shared_nodes.is_present.value_counts())
display(shared_nodes)

Community detection process filters out users not belonging to any community.

This table lists all the users, across all the contexts, that belong to more than one context.

* *is_present* column: describes whether a user has survived the community detection process.
* *no_participations* column: counts how many contexts a user has been in.

**important**: no_participations below may differ because a user may have survived in one context but not in the others.

#### Shared users

In [None]:
# NOTE(review): `results` is not defined in any visible cell -- confirm its source.
shared_nodes = AnalysisHelper.get_common_nodes(results)
print(f'There are {len(shared_nodes.index)} shared nodes.')
# Count of non-null `name` values for each value of no_participations.
display(shared_nodes.groupby('no_participations').count().name.to_frame().rename(columns={'name': 'count'}))
# Full listing, highest participation count first and follower rank as tie-breaker.
display(shared_nodes.sort_values(by=['no_participations', 'follower_rank'], ascending=False).round(decimals=2))

Number of appearances of nodes across different events, together with their Twitter information.

#### Shared users and events

In [None]:
# NOTE(review): `results` is not defined in any visible cell -- confirm its source.
AnalysisHelper.plot_events_with_common_nodes(results, 'community_detection', 'nodes');

Number of users per event that appear in more than one event.

## Ranking

In [None]:
# First ten rows of the R1 ranking (formula given below).
AnalysisHelper.rank_1().head(10)

Ranking function that takes into account:
* *Inverse in-degree*: In the considered graphs the hubs in the communities seem to be mostly related to associations and organizations. The smaller ones are more likely to be individuals.
* *Topical focus*: related to the interest the users have on the topic.

Formula:
\begin{align}
\mathit{R1}(u) & = \frac{1}{\sum_{u \in C} \mathit{IC}(u) + 1} \cdot \sum_{u \in C} \mathit{TF}(u)
\end{align}

In [None]:
# First ten rows of the R2 ranking (formula given below).
AnalysisHelper.rank_2().head(10)

Formula:

\begin{align}
\mathit{R2}(u) & = \lvert \mathit{FR}(u) - 1 \rvert \cdot \left(\sum_{u \in C} \mathit{TA}(u) + \sum_{u \in C} \mathit{IC}(u)\right)
\end{align}

In [None]:
# First ten rows of the R3 ranking (formula given below).
AnalysisHelper.rank_3().head(10)

Formula:

\begin{align}
\mathit{R3}(u) & = \lvert \mathit{FR}(u) - 1 \rvert \cdot \left(\sum_{u \in C} \mathit{TA}(u) + \frac{1}{\sum_{u \in C} \mathit{IC}(u) + 1}\right)
\end{align}

# Tables for the paper

## Table 1

In [None]:
# Table 1: one row per context with its 2018 period and headline graph metrics.
# NOTE(review): `results` and `graph_summaries` are not assigned in any visible cell --
# confirm where they come from before a Restart & Run All.

def _abbreviate_hashtags(hashtags):
    """Return the first hashtag lower-cased, followed by ', ...' when there are more.

    Assumes `hashtags` is a sequence of strings (it is indexed with [0]) -- TODO confirm.
    An empty sequence yields an empty string instead of raising IndexError.
    """
    if len(hashtags) == 0:
        return ''
    first = hashtags[0].lower()
    # The ellipsis marks that further hashtags were cut; a lone hashtag is left as is.
    return first + ', ...' if len(hashtags) > 1 else first


table_1 = (AnalysisHelper.get_single_summary('event_detection', 'event', results)
           [['start_date', 'end_date', 'hashtags']]
           .merge(graph_summaries, left_index=True, right_index=True)
           .reset_index())

table_1['hashtags'] = table_1['hashtags'].apply(_abbreviate_hashtags)
table_1['name'] = table_1['name'].apply(lambda x: x.replace('-2018', '').replace('-', ' ').capitalize())
# Dates are joined as strings, with the year stripped because it is stated in the header.
table_1['period (2018)'] = table_1[['start_date', 'end_date']].apply(lambda x: ' / '.join(x).replace('2018-', ''), axis=1)

table_1[['assortativity', 'avg_degree']] = table_1[['assortativity', 'avg_degree']].round(decimals=1)
table_1['density'] = table_1['density'].round(decimals=3)

# rename() without inplace returns a fresh frame, so the column subset is never mutated
# through a view (which pandas reports as SettingWithCopyWarning).
table_1 = (table_1[['name', 'period (2018)', 'no_nodes', 'no_edges', 'density', 'avg_degree', 'assortativity']]
           .rename(columns={'name': 'context name',
                            'avg_degree': 'avg degree',
                            'no_nodes': 'nodes',
                            'no_edges': 'edges'}))
table_1.columns = [c.capitalize() for c in table_1.columns]

# to_csv does not create missing directories.
os.makedirs('tables', exist_ok=True)
table_1.to_csv('tables/table_1.csv', index=False)
display(table_1)

## Table 3

In [None]:
# Table 3: shared users with their participation count and follower rank.
# NOTE(review): `results` is not assigned in any visible cell -- confirm its source.
# NOTE(review): head(11) is taken BEFORE sorting, so the table holds the first eleven rows
# returned by the helper rather than the eleven best-ranked ones -- confirm this is intended.
table_3 = (AnalysisHelper.get_common_nodes(results)
           .reset_index()
           .head(11)
           .round(decimals=2)
           .sort_values(by=['no_participations', 'follower_rank'], ascending=False)
           .rename(columns={'index': 'username'})
           .drop(columns=['url', 'bio', 'location'])
           .rename(columns={'follower_rank': 'follower rank',
                            'no_participations': 'participations'}))
table_3.columns = [c.capitalize() for c in table_3.columns]

# Keep the first ten rows after sorting. With eleven rows this drops exactly the row the
# former `drop(table_3.index[10])` hack removed; with fewer rows it keeps them all instead
# of raising IndexError.
table_3 = table_3.iloc[:10]

# to_csv does not create missing directories.
os.makedirs('tables', exist_ok=True)
table_3.to_csv('tables/table_3.csv', index=False)
display(table_3)

## Table 4

In [None]:
# Side-by-side listing of the first 100 user names of each of the three rankings.
# NOTE(review): this section is headed "Table 4" while the frame and the CSV are named
# table_6 -- confirm which number is current.
# NOTE(review): rank_1 is merged without reset_index() while rank_2 / rank_3 are reset
# first; the index-based merges below only line up if rank_1 already carries a 0..n-1
# index -- confirm.
rank_1 = AnalysisHelper.rank_1().head(100).drop(columns=['name', 'location', 'rank']).rename(columns={'user_name': 'Rank 1'})
rank_2 = AnalysisHelper.rank_2().head(100).reset_index().drop(columns='rank').rename(columns={'user_name': 'Rank 2'})
rank_3 = AnalysisHelper.rank_3().head(100).reset_index().drop(columns='rank').rename(columns={'user_name': 'Rank 3'})

table_6 = rank_1.merge(rank_2, left_index=True, right_index=True)
table_6 = table_6.merge(rank_3, left_index=True, right_index=True)
table_6['#'] = table_6.index + 1  # position column: index plus one
table_6 = table_6[['#', 'Rank 1', 'Rank 2', 'Rank 3']]

# to_csv does not create missing directories.
os.makedirs('tables', exist_ok=True)
table_6.to_csv('tables/table_6.csv', index=False)
display(table_6)

# Interpret rankings

__Important: Needs manual labelling!__

In [None]:
import pandas as pd

# Every ranking (1, 2, 3) carries the same four integer label columns in the labelled CSV.
label_dtypes = {
    f'{label} {rank_number}': int
    for rank_number in (1, 2, 3)
    for label in ('Individual', 'Professional', 'Association', 'Intopic')
}
rankings = pd.read_csv('tables/table_6_labelled.csv',
                       dtype={'#': 'uint16', **label_dtypes})

def get_single_rank(rankings, rank_number):
    """Select the '#' column plus the four label columns of one ranking.

    Args:
        rankings: frame holding '#' and, for each ranking, the columns
            'Individual N', 'Professional N', 'Association N' and 'Intopic N'.
        rank_number: the N identifying the ranking whose columns are wanted.

    Returns:
        The sub-frame with '#' first, followed by the four label columns in the order above.
    """
    selected = ['#']
    for user_type in ('Individual', 'Professional', 'Association', 'Intopic'):
        selected.append(f'{user_type} {rank_number}')
    return rankings[selected]

def cum_sum_plot(rank):
    """Plot, per label column, the running total of labelled users over buckets of ten rows.

    Args:
        rank: frame whose first column is '#' and whose remaining columns hold the
            per-user label counts (as returned by get_single_rank).
    """
    fig, ax = plt.subplots(figsize=(15, 8))

    bucket_size = 10
    label_columns = list(rank.columns[1:])

    # Rows are bucketed by their index label; each bucket keeps its last '#' value
    # and the sum of every label column.
    aggregations = {'#': lambda positions: positions.iloc[-1]}
    aggregations.update({column: 'sum' for column in label_columns})
    buckets = rank.groupby(by=lambda row_label: int(row_label / bucket_size)).agg(aggregations)

    for column in label_columns:
        buckets[column] = buckets[column].cumsum()
        # The legend shows only the label name, without the ranking number.
        ax = sns.lineplot(x="#", y=column, data=buckets, label=column.split()[0])

    ax.set_xlabel('Users buckets')
    ax.set_ylabel('Cumulated number of users per type')
    plt.xticks([step * bucket_size for step in range(1, len(buckets.index) + 1)])
    plt.title('Cumulated number of users per type')
    ax.grid(True)
    plt.show()

In [None]:
# One cumulative plot per ranking.
for rank_number in (1, 2, 3):
    cum_sum_plot(get_single_rank(rankings, rank_number))

In [None]:
def stacked_bar_plot(rank):
    """Draw stacked bars of label counts per bucket of ten rows.

    Args:
        rank: frame whose first column is '#'; the columns between the first and the
            last one are stacked (the last column is left out of the plot).
    """
    plt.subplots(figsize=(15, 8))
    barWidth = 1
    bucket_size = 10
    stacked_columns = list(rank.columns[1:-1])

    aggregations = {'#': lambda positions: positions.iloc[-1]}
    aggregations.update({column: 'sum' for column in stacked_columns})
    buckets = rank.groupby(by=lambda row_label: int(row_label / bucket_size)).agg(aggregations)
    positions = buckets.index.tolist()

    # Each series is drawn on top of the running total of the ones already drawn.
    bottom = 0
    for column in stacked_columns:
        plt.bar(positions, buckets[column], bottom=bottom, edgecolor='white', width=barWidth)
        bottom = bottom + buckets[column]

    plt.legend(stacked_columns)
    plt.xticks(buckets.index.tolist(),
               [step * bucket_size for step in range(1, len(buckets.index) + 1)])
    plt.xlabel('Number of users')
    plt.ylabel('Number of users per type')
    plt.title('Distribution of users type over top 100 for Rank')
    plt.show()

In [None]:
# One stacked-bar plot per ranking.
for rank_number in (1, 2, 3):
    stacked_bar_plot(get_single_rank(rankings, rank_number))

In [None]:
# Stacked bars: labelled user types per bucket of ten positions for Rank 2, saved as PDF.
# The label columns ('Individual 2', ...) are read from the labelled CSV into `rankings`;
# `rank_2` as assigned in the Table 4 cell is the 100-row ranking itself, so it can neither
# supply those columns nor match ten tick labels.
bucket_size = 10
rank_2_buckets = (get_single_rank(rankings, 2)
                  .drop(columns='#')
                  .groupby(by=lambda x: int(x / bucket_size))
                  .sum())

plt.figure(figsize=(8,6))
barWidth = 1
r = rank_2_buckets.index.tolist()

plt.bar(r, rank_2_buckets['Individual 2'], edgecolor='white', width=barWidth)
plt.bar(r, rank_2_buckets['Professional 2'], bottom=rank_2_buckets['Individual 2'],
        edgecolor='white', width=barWidth)
plt.bar(r, rank_2_buckets['Association 2'],
        bottom=rank_2_buckets['Individual 2'] + rank_2_buckets['Professional 2'],
        edgecolor='white', width=barWidth)

plt.legend(['Individual', 'Professional', 'Association'])
# Tick labels are derived from the bucket count so they always match the tick positions.
plt.xticks(r, [(i + 1) * bucket_size for i in range(len(r))])
plt.yticks(list(range(0,11)))
plt.xlabel('Number of users')
plt.ylabel('Number of users per type')
plt.title('Distribution of users type over top 100 for Rank 2')

plt.savefig("rank2-distribution.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Stacked bars: labelled user types per bucket of ten positions for Rank 3, saved as PDF.
# The label columns ('Individual 3', ...) are read from the labelled CSV into `rankings`;
# `rank_3` as assigned in the Table 4 cell is the 100-row ranking itself, so it can neither
# supply those columns nor match ten tick labels.
bucket_size = 10
rank_3_buckets = (get_single_rank(rankings, 3)
                  .drop(columns='#')
                  .groupby(by=lambda x: int(x / bucket_size))
                  .sum())

plt.figure(figsize=(8,6))
barWidth = 1
r = rank_3_buckets.index.tolist()

plt.bar(r, rank_3_buckets['Individual 3'], edgecolor='white', width=barWidth)
plt.bar(r, rank_3_buckets['Professional 3'], bottom=rank_3_buckets['Individual 3'],
        edgecolor='white', width=barWidth)
plt.bar(r, rank_3_buckets['Association 3'],
        bottom=rank_3_buckets['Individual 3'] + rank_3_buckets['Professional 3'],
        edgecolor='white', width=barWidth)

plt.legend(['Individual', 'Professional', 'Association'])
# Tick labels are derived from the bucket count so they always match the tick positions.
plt.xticks(r, [(i + 1) * bucket_size for i in range(len(r))])
plt.xlabel('Number of users')
plt.ylabel('Number of users per type')
plt.title('Distribution of users type over top 100 for Rank 3')

plt.savefig("rank3-distribution.pdf", bbox_inches='tight')
plt.show()

In [None]:
def grouped_bar_plot(rank):
    """Draw side-by-side bars of label counts per bucket of ten rows.

    Args:
        rank: frame whose first column is '#'; the columns between the first and the
            last one are plotted as one bar each per bucket (the last column is left out).
    """
    fig, ax = plt.subplots(figsize=(15, 8))
    barWidth = 0.25
    bucket_size = 10
    columns = rank.columns[1:-1].tolist()
    rank = rank.groupby(by=lambda x: int(x/bucket_size)).agg({
        '#': lambda x: x.iloc[-1], **{c: 'sum' for c in columns}})
    r = rank.index.tolist()

    # Shift every series by one bar width so the bars of a bucket sit next to each other.
    for i, c in enumerate(columns):
        positions = [x + i * barWidth for x in r]
        plt.bar(positions, rank[c], width=barWidth, edgecolor='white', label=c)

    plt.legend()
    # Ticks are centred under each group of bars.
    group_centres = [x + barWidth * (len(columns) - 1) / 2 for x in r]
    plt.xticks(group_centres, [i * bucket_size for i in range(1, len(rank.index)+1)])
    plt.xlabel('Number of users')
    plt.ylabel('Number of users per type')
    plt.title('Distribution of users type over top 100 for Rank')
    plt.show()

In [None]:
import numpy as np

# Grouped bars: labelled user types per bucket of ten positions for Rank 1.
# The label columns ('Individual 1', ...) are read from the labelled CSV into `rankings`;
# `rank_1` as assigned in the Table 4 cell is the 100-row ranking itself, so it can neither
# supply those columns nor match ten tick labels.
bucket_size = 10
rank_1_buckets = (get_single_rank(rankings, 1)
                  .drop(columns='#')
                  .groupby(by=lambda x: int(x / bucket_size))
                  .sum())

plt.figure(figsize=(15,8))

# set width of bar
barWidth = 0.25

# set height of bar
bars1 = rank_1_buckets['Individual 1']
bars2 = rank_1_buckets['Professional 1']
bars3 = rank_1_buckets['Association 1']

# Set position of bar on X axis
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]

# Make the plot
plt.bar(r1, bars1, width=barWidth, edgecolor='white', label='Individual')
plt.bar(r2, bars2, width=barWidth, edgecolor='white', label='Professional')
plt.bar(r3, bars3, width=barWidth, edgecolor='white', label='Association')

# Add xticks on the middle of the group bars (r2 is the centre bar of each group)
plt.xlabel('group', fontweight='bold')
plt.xticks(r2, [(i + 1) * bucket_size for i in range(len(bars1))])
plt.ylabel('Number of users per type')
plt.title('Distribution of users type over top 100 for Rank 1')

# Create legend & Show graphic
plt.legend();

In [None]:
import numpy as np

# Grouped bars: labelled user types per bucket of ten positions for Rank 2.
# The label columns ('Individual 2', ...) are read from the labelled CSV into `rankings`;
# `rank_2` as assigned in the Table 4 cell is the 100-row ranking itself, so it can neither
# supply those columns nor match ten tick labels.
bucket_size = 10
rank_2_buckets = (get_single_rank(rankings, 2)
                  .drop(columns='#')
                  .groupby(by=lambda x: int(x / bucket_size))
                  .sum())

plt.figure(figsize=(15,8))

# set width of bar
barWidth = 0.25

# set height of bar
bars1 = rank_2_buckets['Individual 2']
bars2 = rank_2_buckets['Professional 2']
bars3 = rank_2_buckets['Association 2']

# Set position of bar on X axis
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]

# Make the plot
plt.bar(r1, bars1, width=barWidth, edgecolor='white', label='Individual')
plt.bar(r2, bars2, width=barWidth, edgecolor='white', label='Professional')
plt.bar(r3, bars3, width=barWidth, edgecolor='white', label='Association')

# Add xticks on the middle of the group bars (r2 is the centre bar of each group);
# positions come from this plot's own data rather than from the Rank 1 frame.
plt.xlabel('group', fontweight='bold')
plt.xticks(r2, [(i + 1) * bucket_size for i in range(len(bars1))])
plt.ylabel('Number of users per type')
plt.title('Distribution of users type over top 100 for Rank 2')

# Create legend & Show graphic
plt.legend();