In [None]:
!pip install matplotlib

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Region to analyze: passed to get_network_stats() and embedded in the names of
# the pickle and CSV files written further down.
COUNTRY = 'netherlands'

In [None]:
import operator
from numpy import median

import pandas as pd
import networkx as nx

from utilities import load_graph_from_files

def get_network_stats(region, no_of_accounts=8, k=1500, seed=None):
    """Compute centrality statistics for a region's graph and return its top accounts.

    The graph is loaded with ``load_graph_from_files(region)``. Out-degree,
    eigenvector and betweenness centrality are computed for every node, a few
    summary values are printed, and the rows belonging to the union of the
    ``no_of_accounts`` highest-scoring nodes under each measure are returned.

    Parameters
    ----------
    region
        Passed unchanged to ``load_graph_from_files``; also used in the log line.
    no_of_accounts : int, default 8
        How many top nodes to keep per centrality measure.
    k : int or None, default 1500
        Number of source nodes sampled for the betweenness approximation.
        Clamped to the number of nodes in the graph, because networkx samples
        without replacement and raises when ``k`` exceeds the node count.
        ``None`` requests the exact computation.
    seed : optional
        Forwarded to ``nx.betweenness_centrality`` so the sampled approximation
        can be made reproducible. The default (``None``) keeps it unseeded.

    Returns
    -------
    pandas.DataFrame
        One row per selected node with the columns ``id_str``, ``out_degree``,
        ``degree_rank``, ``eigenvector``, ``eigen_rank``, ``betweenness`` and
        ``betw_rank``. An independent copy is returned so callers can add
        columns without triggering pandas' SettingWithCopyWarning.

    Raises
    ------
    ValueError
        If the loaded graph has no nodes.
    """
    graph, _ = load_graph_from_files(region)
    node_count = graph.order()
    print("*** {} - Analyzing network of {} nodes and {} edges".format(region, node_count, graph.size()))

    if node_count == 0:
        # Without this guard the averages below fail with a bare ZeroDivisionError.
        raise ValueError("Graph for region {!r} has no nodes; nothing to analyze".format(region))

    def _mean(scores):
        # Arithmetic mean of a node -> score mapping.
        return sum(scores.values()) / len(scores)

    def _ranked(scores):
        # (node, score) pairs ordered from highest to lowest score.
        return sorted(scores.items(), key=operator.itemgetter(1), reverse=True)

    dc = nx.out_degree_centrality(graph)
    dc_top = _ranked(dc)
    print("DC avg: {}".format(_mean(dc)))
    print("DC median: {}".format(median(list(dc.values()))))

    # https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.eigenvector_centrality.html#networkx.algorithms.centrality.eigenvector_centrality
    # For directed graphs networkx scores nodes by their in-edges; reversing the
    # graph first makes the measure follow out-edges instead.
    ec = nx.eigenvector_centrality(graph.reverse())
    ec_top = _ranked(ec)
    print("EC avg: {}".format(_mean(ec)))

    # Never ask for more sample nodes than the graph contains.
    sample_size = None if k is None else min(k, node_count)
    print("k value: {}".format(sample_size))
    bc = nx.betweenness_centrality(graph, k=sample_size, seed=seed)
    bc_top = _ranked(bc)
    print("BC avg: {}".format(_mean(bc)))

    # One frame per measure: the raw score plus its rank (1 = highest score).
    frames = []
    for ranked, value_col, rank_col in (
        (dc_top, 'out_degree', 'degree_rank'),
        (ec_top, 'eigenvector', 'eigen_rank'),
        (bc_top, 'betweenness', 'betw_rank'),
    ):
        frame = pd.DataFrame(ranked, columns=['id_str', value_col]).set_index('id_str')
        frame[rank_col] = frame[value_col].rank(ascending=False)
        frames.append(frame)

    # Union of the top nodes under each of the three measures.
    accounts = {
        node
        for ranked in (dc_top, ec_top, bc_top)
        for node, _score in ranked[:no_of_accounts]
    }

    big = pd.concat(frames, axis=1, sort=False).reset_index().rename(columns={'index': 'id_str'})
    # .copy() detaches the result from `big`, so later column assignments are safe.
    return big[big.id_str.isin(accounts)].copy()
    

In [None]:
# Run the analysis for the configured country and store the resulting table on disk.
df = get_network_stats(COUNTRY)
stats_pickle_path = 'network_stats_{}'.format(COUNTRY)
df.to_pickle(stats_pickle_path)

In [None]:
# look up screen name

from utilities import db_to_pandas

def get_account(id_str):
    """Return the distinct screen names stored in the ``tweet`` table for a user id.

    The result is the ``screen_name`` column of whatever ``db_to_pandas``
    returns for the query (presumably a pandas Series - confirm against
    ``utilities``). It can hold several entries when the same id appears with
    more than one screen name, and none when the id is not in the table.
    """
    # NOTE(review): the id is interpolated straight into the SQL text. That is
    # only safe while ids come from our own graph data; switch to bound
    # parameters if db_to_pandas supports them.
    sql_query="""
    SELECT DISTINCT tweet_body->'user'->>'screen_name' as screen_name
    FROM tweet
    WHERE tweet_body->'user'->>'id_str' = '{}';
    """.format(id_str)

    return db_to_pandas(sql_query)['screen_name']


def _first_screen_name(id_str):
    """Return a single screen name for ``id_str``, or None when none is found."""
    names = get_account(id_str)
    # The query has no ORDER BY, so for renamed accounts this picks an arbitrary name.
    return names.iloc[0] if len(names) else None

# Map each id to one scalar. Applying get_account directly would return a Series
# per row, which pandas expands into a DataFrame that cannot be assigned to a
# single column as soon as any id has more than one screen name.
df['screen_name'] = df['id_str'].apply(_first_screen_name)

In [None]:
# look up the highest follower count observed for each account

from utilities import db_to_pandas

def get_max_followers_count(id_str):
    """Return the largest ``followers_count`` stored in the ``tweet`` table for a user id.

    The ``->>`` operator yields text, so the values are converted to numbers
    before taking the maximum; comparing the raw strings would rank '999'
    above '10000'. The result is NaN when the query returns no usable value.
    """
    sql_query="""
    SELECT tweet_body->'user'->>'followers_count' as followers_count
    FROM tweet
    WHERE tweet_body->'user'->>'id_str' = '{}';
    """.format(id_str)
    raw_counts = db_to_pandas(sql_query)['followers_count']
    # errors='coerce' turns unparsable entries into NaN, which max() skips.
    return pd.to_numeric(raw_counts, errors='coerce').max()

df['followers'] = df['id_str'].apply(get_max_followers_count)

In [None]:
# Convert the follower counts to numbers, then list the biggest accounts first.
followers_as_numbers = pd.to_numeric(df['followers'])
df['followers'] = followers_as_numbers
df = df.sort_values('followers', ascending=False)

df

In [None]:
# Limit the three centrality scores to four decimal places; other columns are untouched.
score_precision = dict.fromkeys(('out_degree', 'eigenvector', 'betweenness'), 4)
df = df.round(score_precision)

In [None]:
# Hand-written description for each screen name that appears in the table.
descript = {
"MinPres": "Prime minister",
"geertwilderspvv": "Politician",
"thierrybaudet": "Politician",
"rivm": "National Institute for Public Health",
"MinVWS": "Ministry of health",
"fvdemocratie": "Political party",
"VogelvrijeHArts": "Meme account or person",
"Hannesz1956": "Person",
"lewinskylou2": "Person",
"Sashaexposed": "Person",
"bruno_bruins": "Minister for Medical Care",
"FritsRosendaal": "Professor of Clinical Epidemiology",
"MIsBack8": "Person",
"Erna_art": "Person",
"arzubarsk": "Person",
"holadiejee": "Person",
"MedicijnNL": "Medicine evaluation company"
}

# The former `df.set_index(['screen_name', 'id_str'])` call was removed: its
# result was discarded, so it never changed df, and both columns must stay
# regular columns for the lookup below and the later export.
# Screen names missing from `descript` get NaN as their description.
df['description'] = df['screen_name'].map(descript)

df

In [None]:
import os

# The raw account id is dropped from the exported tables.
nice = df.drop('id_str', axis=1)
nice.to_csv('account_stats_{}.csv'.format(COUNTRY), index=False)

# Human-readable column headers for the LaTeX table.
tex = nice.rename(columns={
    'out_degree': 'out-degree',
    'degree_rank': 'out-degree rank',
    'eigen_rank': 'eigenvector rank',
    'betw_rank': 'betweenness rank',
    'screen_name': 'user'
})
# Fix the column order of the table; each score is followed by its rank.
tex = tex.reindex(columns=[
    'followers', 'user', 'description', 'out-degree', 'out-degree rank', 
    'betweenness', 'betweenness rank', 'eigenvector', 'eigenvector rank'])

# to_latex does not create missing directories, so make sure the target exists.
os.makedirs('outputs', exist_ok=True)
tex.to_latex('outputs/accounts_table.tex', index=False)
tex.head()