In [1]:
!pip install matplotlib



In [2]:
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import csv

def load_from_file(region):
    """Build a directed graph for ``region`` from its node-list and edge-list CSV files.

    Reads ``nodelist_<region>.csv`` and ``edgelist_<region>.csv`` relative to the
    current working directory. Both files must have a header row; the edge list
    must provide ``Source`` and ``Target`` columns.

    Args:
        region: Name inserted into the two file names.

    Returns:
        networkx.DiGraph containing one node per node-list row and one edge per
        edge-list row (endpoints missing from the node list are added by networkx).

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If the edge list lacks a ``Source`` or ``Target`` column.
    """
    g = nx.DiGraph()

    # newline='' is what the csv module expects so quoted embedded newlines parse correctly.
    with open('nodelist_{}.csv'.format(region), 'r', newline='') as f:
        reader = csv.DictReader(f)
        # Iterating a DictReader row yields its *keys*, so passing the row to
        # add_nodes_from() added the column names as nodes. Add the identifier value instead.
        # NOTE(review): the first column is assumed to hold the node id - confirm against the CSV header.
        id_column = reader.fieldnames[0] if reader.fieldnames else None
        g.add_nodes_from(row[id_column] for row in reader)

    with open('edgelist_{}.csv'.format(region), 'r', newline='') as f:
        reader = csv.DictReader(f)
        g.add_edges_from((row['Source'], row['Target']) for row in reader)

    return g

In [4]:
import os

# Connection settings for the PostgreSQL database holding the tweets.
# Every value can be overridden with an environment variable of the same name,
# so credentials no longer have to live in the notebook.
# NOTE(review): the literal fallbacks (including the password) are kept only so the
# notebook keeps running unchanged; rotate the password and drop the fallbacks.
DB_HOST = os.environ.get('DB_HOST', '89.145.163.87')
DB_PORT = int(os.environ.get('DB_PORT', 5432))  # environment values are strings; keep the port an int
DB_USERNAME = os.environ.get('DB_USERNAME', 'group11')
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'tsw2020')
DB_NAME = os.environ.get('DB_NAME', 'tweets')

In [2]:
import psycopg2
import pandas as pd

def db_to_pandas(query, params=None):
    """Run ``query`` against the tweets database and return the result as a DataFrame.

    A new connection is opened for each call, using the module-level ``DB_*``
    settings, and is always closed again, even when the query fails.

    Args:
        query: SQL text to execute.
        params: Optional parameters forwarded to ``pandas.read_sql_query`` so that
            values can be bound by the driver instead of being formatted into
            ``query``. Defaults to ``None`` (no parameters).

    Returns:
        pandas.DataFrame with one row per result row.
    """
    conn = psycopg2.connect(host=DB_HOST, port=DB_PORT, database=DB_NAME, user=DB_USERNAME, password=DB_PASSWORD)
    try:
        # read_sql_query manages its own cursor, so none is opened here.
        return pd.read_sql_query(query, conn, params=params)
    finally:
        conn.close()



In [13]:
import operator
import pandas as pd
import networkx as nx

from utilities import load_graph_from_files

def _rank_and_report(label, scores):
    """Print the mean of a centrality mapping and return its items sorted by score, highest first."""
    average = sum(scores.values()) / len(scores.values())
    print("{} avg: {}".format(label, average))
    return sorted(scores.items(), key=operator.itemgetter(1), reverse=True)


def get_network_stats(region, seed=None):
    """Compute centrality measures for a region's network and return its top accounts.

    The graph is loaded with ``load_graph_from_files(region)``. Out-degree,
    eigenvector (on the reversed graph), closeness and sampled betweenness
    centrality are computed; the average of each measure is printed along the way.

    Args:
        region: Region name passed to ``load_graph_from_files``.
        seed: Optional seed forwarded to ``nx.betweenness_centrality`` so the
            sampled betweenness values are reproducible between runs. ``None``
            (the default) keeps the sampling unseeded.

    Returns:
        pandas.DataFrame with the columns ``id_str``, ``out_degree``,
        ``eigenvector``, ``closeness`` and ``betweenness``, restricted to the
        union of the five highest-ranked accounts of each measure. The frame is
        an independent copy, so callers may add columns to it.
    """
    graph, _ = load_graph_from_files(region)
    print("*** {} - Analyzing network of {} nodes and {} edges".format(region, graph.order(), graph.size()))

    no_of_accounts = 5  # accounts taken from the top of each ranking

    dc_top = _rank_and_report("DC", nx.out_degree_centrality(graph))

    # https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.eigenvector_centrality.html#networkx.algorithms.centrality.eigenvector_centrality
    # The graph is reversed before the computation; see the page above for how
    # networkx treats edge direction.
    ec_top = _rank_and_report("EC", nx.eigenvector_centrality(graph.reverse()))

    cc_top = _rank_and_report("CC", nx.closeness_centrality(graph, wf_improved=True))

    # Betweenness is approximated from k sampled nodes, with k scaled by graph size.
    # k must stay within [1, number of nodes]: 0 is not a usable sample size and a
    # value above the node count cannot be sampled.
    node_count = graph.order()
    k = int(float(node_count) * float(graph.size()) / 3000000)
    k = max(1, min(k, node_count))
    print("k value: {}".format(k))
    bc_top = _rank_and_report("BC", nx.betweenness_centrality(graph, k=k, seed=seed))

    rankings = [
        ('out_degree', dc_top),
        ('eigenvector', ec_top),
        ('closeness', cc_top),
        ('betweenness', bc_top),
    ]
    frames = [
        pd.DataFrame(ranking, columns=['id_str', column]).set_index('id_str')
        for column, ranking in rankings
    ]

    # Union of the leaders of every ranking.
    accounts = {
        node
        for _, ranking in rankings
        for node, _score in ranking[:no_of_accounts]
    }

    print(accounts)

    # Depending on the pandas version the concatenated index is named 'id_str' or
    # comes back unnamed (as 'index'); the rename covers the latter case.
    big = pd.concat(frames, axis=1, sort=False).reset_index().rename(columns={'index': 'id_str'})
    # .copy() hands back a standalone frame instead of a slice of `big`, so later
    # column assignments do not trigger SettingWithCopyWarning.
    only_top_accounts = big[big.id_str.isin(accounts)].copy()
    return only_top_accounts
    

In [14]:
from utilities import db_to_pandas

def get_account(id_str):
    """Fetch the distinct screen names stored for one user id.

    Args:
        id_str: User id matched against ``tweet_body->'user'->>'id_str'``.

    Returns:
        The ``screen_name`` column of the query result returned by ``db_to_pandas``.
    """
    template = """
    SELECT DISTINCT tweet_body->'user'->>'screen_name' as screen_name
    FROM tweet
    WHERE tweet_body->'user'->>'id_str' = '{}';
    """
    # NOTE(review): the id is formatted straight into the SQL text; only pass
    # trusted ids until the query is bound through driver parameters.
    result = db_to_pandas(template.format(id_str))
    return result['screen_name']
    

In [15]:
# Compute the centrality table for the 'netherlands' region and display it.
# Later cells add columns to `df`, so re-run this cell first when starting over.
df = get_network_stats('netherlands')
df

*** netherlands - Analyzing network of 24496 nodes and 31531 edges
DC avg: 5.2549081251866707e-05
EC avg: 0.00045372776089488146
CC avg: 0.003157426931334164
k value: 257
BC avg: 9.281163689974418e-06
{124280192, 1024256932834631680, 1172123420559065090, 107209093, 1340408646, 199397015, 367703310, 41778159, 883813550, 135512400, 4181682291, 190648628, 35527415, 15581273, 214046074, 610558969}


Unnamed: 0,id_str,out_degree,eigenvector,closeness,betweenness
639,15581273,0.009757,0.1517707,0.008107,0.001104
2500,35527415,0.137008,0.5938578,0.009277,0.027407
2800,41778159,0.054297,0.1340974,0.007337,0.002736
5222,107209093,0.0,2.879208e-14,0.014319,0.0
5872,124280192,0.032619,0.07597392,0.006501,0.00126
6211,135512400,4.1e-05,0.03625782,0.010819,0.007367
7511,190648628,0.0,2.879208e-14,0.013572,0.0
7745,199397015,0.001796,0.176255,0.010967,0.024264
8053,214046074,0.0,2.879208e-14,0.013831,0.0
10852,367703310,0.033435,0.05377339,0.0,0.0


In [16]:
# Look up the screen name of every account.
# get_account returns the whole `screen_name` column of its query result, which
# may hold no rows or several distinct names. Keep one scalar per account so the
# new column is always a single, well-defined column.
# NOTE(review): with several distinct names the first row returned is used; the
# query has no ORDER BY, so which one comes first is up to the database.
def _first_screen_name(id_str):
    names = get_account(id_str)
    return names.iloc[0] if len(names) else None

df['screen_name'] = df['id_str'].apply(_first_screen_name)

In [26]:
from utilities import db_to_pandas

def get_max_followers_count(id_str):
    """Return the highest follower count recorded for a user across their tweets.

    The ``->>`` operator returns the count as text, so the values are converted
    to numbers before taking the maximum; comparing the raw strings would rank
    them lexicographically (e.g. '999' above '10000').

    Args:
        id_str: User id matched against ``tweet_body->'user'->>'id_str'``.

    Returns:
        The maximum follower count as an ``int``, or NaN when the query returns
        no usable value.

    Raises:
        ValueError: If a stored follower count is not numeric.
    """
    import pandas as pd  # local import: this cell itself only imports db_to_pandas

    # NOTE(review): the id is formatted straight into the SQL text; only pass trusted ids.
    sql_query="""
    SELECT tweet_body->'user'->>'followers_count' as followers_count
    FROM tweet
    WHERE tweet_body->'user'->>'id_str' = '{}';
    """.format(id_str)
    counts = pd.to_numeric(db_to_pandas(sql_query)['followers_count'])
    highest = counts.max()
    # Missing values turn the column into floats; hand back a plain int whenever there is a maximum.
    return highest if pd.isna(highest) else int(highest)

In [27]:
# Attach the result of get_max_followers_count for every account id.
df['followers_count'] = df['id_str'].apply(get_max_followers_count)

35527415 69983
41778159 813969
367703310 208671
124280192 13252
1024256932834631680 18042
1172123420559065090 3017
15581273 74537
1340408646 9872
199397015 1194
883813550 769
610558969 2117
4181682291 245
135512400 2397
214046074 982
107209093 4030
190648628 1661


In [28]:
# Round the four centrality columns to five decimal places, then display the table.
df = df.round({
    column: 5
    for column in ('out_degree', 'eigenvector', 'closeness', 'betweenness')
})

df

Unnamed: 0,id_str,out_degree,eigenvector,closeness,betweenness,screen_name,followers_count
0,35527415,0.15418,0.59377,0.01044,0.01904,rivm,69983
1,41778159,0.0611,0.13376,0.00826,0.00198,geertwilderspvv,813969
2,367703310,0.03763,0.05362,0.0,0.0,thierrybaudet,208671
3,124280192,0.03671,0.07584,0.00732,0.00097,VogelvrijeHArts,13252
4,1024256932834631680,0.02026,0.0,0.0,0.0,RebeccaH2020,18042
10,1172123420559065090,0.01704,0.27305,0.00966,0.00554,bruno_bruins,3017
19,15581273,0.01098,0.15172,0.00912,0.00132,MinVWS,74537
20,1340408646,0.01024,0.0405,0.01481,0.00877,Hannesz1956,9872
103,199397015,0.00202,0.17641,0.01234,0.01239,arzubarsk,1194
154,883813550,0.00119,0.15939,0.0,0.0,MedicijnNL,769


In [30]:
# Write the table to CSV without the id column and without the row index.
nice = df.drop(columns='id_str')
nice.to_csv('account_stats.csv', index=False)
# Alternative output, currently disabled: nice.to_latex()