In [None]:
!pip install matplotlib

In [1]:
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import csv

def load_from_file(region):
    g = nx.DiGraph()
    
    with open('nodelist_{}.csv'.format(region), 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            g.add_nodes_from(row)
    
    edges = list()
    with open('edgelist_{}.csv'.format(region), 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            edges.append((row['Source'], row['Target']))
    g.add_edges_from(edges)
    
    return g

In [52]:
DB_HOST = '89.145.163.87'
DB_PORT = 5432
DB_USERNAME = 'group11'
DB_PASSWORD = 'tsw2020'
DB_NAME = 'tweets'

In [88]:
import psycopg2
import pandas as pd

def db_to_pandas(query):
    """ Query database to a Pandas DF"""
    conn = psycopg2.connect(host=DB_HOST, port=DB_PORT, database=DB_NAME, user=DB_USERNAME, password=DB_PASSWORD)
    cur = conn.cursor()
    df = pd.read_sql_query(query, conn)    
    cur.close()
    conn.close()
    return df



In [161]:
import operator
import pandas as pd

def get_network_stats(region):
    graph = load_from_file(region)
    print("*** {} - Analyzing network of {} nodes and {} edges".format(region, graph.order(), graph.size()))
   
    dc = nx.degree_centrality(graph)
    dc_avg = sum(dc.values()) / len(dc.values())
    dc_top = sorted(dc.items(), key=operator.itemgetter(1), reverse=True)[:10]
    print("DC avg: {}".format(dc_avg))
    
    
    ec = nx.eigenvector_centrality(graph)
    ec_avg = sum(ec.values()) / len(ec.values())
    ec_top = sorted(ec.items(), key=operator.itemgetter(1), reverse=True)[:10]
    print("EC avg: {}".format(ec_avg))
    
    
    cc = nx.closeness_centrality(graph)
    cc_avg = sum(cc.values()) / len(cc.values())
    cc_top = sorted(cc.items(), key=operator.itemgetter(1), reverse=True)[:10]
    print("CC avg: {}".format(cc_avg))

    k = int(float(graph.order())*float(graph.size())/3000000)
    print(k)
    bc = nx.betweenness_centrality(graph, k=k)
    bc_avg = sum(bc.values()) / len(bc.values())
    bc_top = sorted(bc.items(), key=operator.itemgetter(1), reverse=True)[:10]
    print("BC avg: {}".format(bc_avg))
    
    dc_df = pd.DataFrame(dc_top, columns=['id_str', 'degree_centrality']).set_index('id_str')
    ec_df = pd.DataFrame(ec_top, columns=['id_str', 'eigenvector_centrality']).set_index('id_str')
    cc_df = pd.DataFrame(cc_top, columns=['id_str', 'closeness_centrality']).set_index('id_str')
    bc_df = pd.DataFrame(bc_top, columns=['id_str', 'betweenness_centrality']).set_index('id_str')


    return pd.concat([dc_df, ec_df, cc_df, bc_df], axis=1, sort=False)\
.reset_index().rename(columns={'index': 'id_str'})
    

In [134]:
def get_account(id_str):
    sql_query="""
    SELECT DISTINCT tweet_body->'user'->>'screen_name' as screen_name
    FROM tweet
    WHERE tweet_body->'user'->>'id_str' = '{}';
    """.format(id_str)
    
    q = db_to_pandas(sql_query)['screen_name']
    return q
    

In [162]:
df = get_network_stats('netherlands')
df

*** netherlands - Analyzing network of 21768 nodes and 31531 edges
DC avg: 0.00013309159634562021
EC avg: 0.002083930536593513
CC avg: 0.003998423765220624
228
BC avg: 9.000892122832056e-06


Unnamed: 0,id_str,degree_centrality,eigenvector_centrality,closeness_centrality,betweenness_centrality
0,35527415,0.155051,,,0.020247
1,41778159,0.061239,,,
2,367703310,0.037626,,,
3,124280192,0.036753,,,
4,1024256932834631680,0.02026,,,
5,4715788289,0.019479,,,
6,3233835926,0.018606,,,
7,1085897362550202369,0.018331,,,
8,194774819,0.018101,,,
9,15537007,0.017687,,,


In [164]:
# look up screen name
df['screen_name'] = df['id_str'].apply(lambda x: get_account(x))

# round
df.round({
    'degree_centrality': 5, 
    'eigenvector_centrality': 5, 
    'closeness_centrality': 5, 
    'betweenness_centrality': 5
})

df

Unnamed: 0,id_str,degree_centrality,eigenvector_centrality,closeness_centrality,betweenness_centrality,screen_name
0,35527415,0.155051,,,0.020247,rivm
1,41778159,0.061239,,,,geertwilderspvv
2,367703310,0.037626,,,,thierrybaudet
3,124280192,0.036753,,,,VogelvrijeHArts
4,1024256932834631680,0.02026,,,,RebeccaH2020
5,4715788289,0.019479,,,,FHvanLeeuwen
6,3233835926,0.018606,,,,jodigraphics15
7,1085897362550202369,0.018331,,,,Sashaexposed
8,194774819,0.018101,,,,EricArends
9,15537007,0.017687,,,,martinenserink


In [110]:
df.to_csv('account_stats.csv')