In [1]:
import pandas as pd
import numpy as np

from pandas.io.json import json_normalize

from IPython.core.display import display, HTML
import locale
locale.setlocale(locale.LC_ALL, 'en_US')

from concurrent.futures import ProcessPoolExecutor
import multiprocessing
from tqdm import tqdm_notebook as tqdm

# Time budget, in seconds, handed to the graph-building step for each page.
timeout = 3600
# One worker process per CPU reported by the operating system.
num_processes = multiprocessing.cpu_count()

# Persistent identifier of the Dataverse dataset and the API endpoints derived from it.
DATA_DOI = 'doi:10.7910/DVN/DCBDEP'
DATA_FILE = ('https://dataverse.harvard.edu/api/access/datafile/:persistentId/'
             '?persistentId={persistent_id}')
CONTENTS_URI = ('https://dataverse.harvard.edu/api/datasets/:persistentId/versions/1/files'
                '?persistentId={}').format(DATA_DOI)


In [2]:
import time
import graph_tool as gt
from graph_tool.stats import vertex_average

def buildGraph(df, debug=False, timeout=900):
    """Build the bipartite post/user graph of `df` and return its user projection.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions with `post` and `user` columns. Every row becomes one
        undirected edge between the row's post id and user id.
    debug : bool
        When true, print size, average degree and elapsed time for both the
        bipartite graph and the projection.
    timeout : number
        Time budget in seconds, forwarded unchanged to `projected_graph`.

    Returns
    -------
    graph_tool.Graph
        Whatever `projected_graph` returns for the users that occur in at
        least two rows of `df`.
    """
    tic = time.time()
    b = gt.Graph(directed=False)
    b.add_edge_list(df[['post', 'user']].values)
    if debug:
        # The second figure is the edge count (it used to be labelled "vertices").
        print("Bipartite nodes: %d, edges: %d" % (b.num_vertices(), b.num_edges()))
        print("Average degree:", vertex_average(b, "total"))
        print("Time", time.time() - tic)
        tic = time.time()

    # Only users with two or more interactions are used as projection seeds.
    user_count = df.user.value_counts()
    p = projected_graph(b, user_count[user_count >= 2].index, timeout=timeout)
    if debug:
        print("Projection nodes: %d, edges: %d" % (p.num_vertices(), p.num_edges()))
        print("Average degree:", vertex_average(p, "total"))
        print("Time", time.time() - tic)

    return p


def projected_graph(b, nodes, timeout=900):
    """Project the bipartite graph `b` onto `nodes`, within a time budget.

    For every `u` in `nodes`, an edge `(u, n)` is added to the result for each
    `n` reached by walking two hops from `u` in `b`.

    Parameters
    ----------
    b : graph_tool.Graph
        The bipartite graph to project.
    nodes : iterable
        Vertex identifiers of `b` used as starting points.
    timeout : number
        Maximum number of seconds the projection may take.

    Returns
    -------
    graph_tool.Graph
        The projection, or an EMPTY undirected graph when the time budget is
        exceeded or the projection fails. Callers treat an empty graph as
        "no result".

    NOTE(review): `u` is itself among the neighbours of its own neighbours, so
    every seed also receives edges to itself, and two seeds sharing a
    neighbour are linked once from each side. Confirm this is intended before
    interpreting `num_edges()` of the result as a simple-graph edge count.
    """
    import warnings

    # A monotonic clock keeps the budget immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    p = gt.Graph(directed=False)
    try:
        for u in nodes:
            nbrs2 = [n for nbr in b.vertex(u).all_neighbours()
                     for n in b.vertex(nbr).all_neighbours()]
            p.add_edge_list((u, n) for n in nbrs2)
            if time.monotonic() > deadline:
                # Budget exhausted: discard the partial projection.
                return gt.Graph(directed=False)
    except Exception as exc:
        # Best effort, as before, but KeyboardInterrupt/SystemExit now propagate
        # and the failure is no longer completely silent.
        warnings.warn("projected_graph failed: %r" % (exc,))
        return gt.Graph(directed=False)
    return p


def extractUserInteractions(uri):
    """Load the comment interactions of one page and renumber users and posts.

    Parameters
    ----------
    uri : str or file-like
        Anything `pandas.read_csv` accepts; the CSV must provide the columns
        `post`, `user` and `type`.

    Returns
    -------
    pandas.DataFrame
        The rows whose `type` is ``'C'`` and that have no missing value, with

        * `user` replaced by consecutive ids 0..U-1 in order of first
          appearance, and
        * `post` replaced by consecutive ids starting at U + 10, so that post
          ids never collide with user ids,

        both stored as ``uint32``. The original row index is kept.
    """
    df = pd.read_csv(uri, index_col=False, engine='c',
                     dtype={'post': np.int64, 'user': np.float64, 'type': str})
    # Work on a fresh frame instead of mutating a filtered slice in place.
    comments = df[df['type'] == 'C'].dropna()

    # factorize() numbers values in order of first appearance, which is exactly
    # the drop_duplicates()/reset_index() mapping used previously.
    user_codes, user_uniques = pd.factorize(comments['user'])
    post_codes, _ = pd.factorize(comments['post'])
    # Shift the post ids past the user id range (plus a gap of 10).
    post_codes = post_codes + len(user_uniques) + 10

    # Series.astype() no longer accepts extra keyword arguments such as
    # `casting=`, so the conversion is done with a plain dtype argument.
    return comments.assign(user=user_codes.astype(np.uint32),
                           post=post_codes.astype(np.uint32))

In [3]:
def _filename_from_content_disposition(header):
    """Return the bare file name found in a Content-Disposition value, or None."""
    import os.path

    if not header:
        return None
    # Extended form: filename*=UTF-8''name ; plain form: filename="name".
    apostrophe_parts = header.split("'")
    if len(apostrophe_parts) > 2:
        name = apostrophe_parts[2]
    else:
        quote_parts = header.split('"')
        if len(quote_parts) < 2:
            return None
        name = quote_parts[1]
    # The header is server-controlled: never let it point outside `data/`.
    name = os.path.basename(name)
    return name or None


def download_datafile(dataFile_id, overwrite=True):
    """Download one Dataverse data file, in its original format, into `data/`.

    Parameters
    ----------
    dataFile_id
        Identifier of the file, inserted into the Dataverse access URL.
    overwrite : bool
        When false, an already existing `data/<filename>` is left untouched.

    Returns
    -------
    tuple
        ``(filename, dataFile_id)`` on success and ``(None, dataFile_id)`` on
        any failure, so the caller can retry the ids that came back with None.
        A partially written file is removed before a failure is reported.
    """
    import os
    import shutil
    import requests

    url = 'https://dataverse.harvard.edu/api/access/datafile/{id}?format=original'.format(id=dataFile_id)
    try:
        r = requests.get(url, stream=True, allow_redirects=True, timeout=1)
    except Exception:
        return (None, dataFile_id)

    try:
        # Do not store an HTTP error page as if it were data.
        r.raise_for_status()
        filename = _filename_from_content_disposition(r.headers.get('content-disposition'))
        if filename is None:
            return (None, dataFile_id)

        path = 'data/{}'.format(filename)
        if overwrite or not os.path.isfile(path):
            os.makedirs('data', exist_ok=True)
            try:
                with open(path, 'wb') as fd:
                    shutil.copyfileobj(r.raw, fd)
            except Exception:
                # Remove the truncated file so a later retry starts clean.
                if os.path.isfile(path):
                    os.remove(path)
                return (None, dataFile_id)
        return (filename, dataFile_id)
    except Exception:
        # Best effort: the caller retries failed ids.
        return (None, dataFile_id)
    finally:
        # Streamed responses keep the connection open until closed explicitly.
        r.close()


def process_file(filename):
    """Compute the summary statistics of one downloaded page file.

    Parameters
    ----------
    filename : str
        Name of a CSV file inside `data/` with `post`, `user` and `type`
        columns.

    Returns
    -------
    dict
        ``{page: values}`` where `page` is `filename` up to its first dot and
        `values` is ``[unique posts, unique users, rows of type 'C',
        rows of type 'L', projection edges, projection vertices]``. The last
        two are NaN when the projection is empty. `values` is an empty list
        for the combined all-pages file and for any file that fails to
        process.
    """
    import warnings

    page = filename.split('.')[0]
    # The combined file duplicates every page; skip it explicitly.
    if filename in ['00__combinedPageInteractions.csv']:
        return {page: []}

    try:
        df = pd.read_csv('data/' + filename, index_col=False,
                         dtype={'post': np.int64, 'user': np.float64, 'type': str})
        types = df.type.value_counts()
        graph = buildGraph(df[df['type'] == 'C'].dropna(), debug=False, timeout=timeout)
        return {page: [len(df.post.unique()), len(df.user.unique()),
                       types['C'] if 'C' in types.index else 0,
                       types['L'] if 'L' in types.index else 0,
                       # An empty projection means "not computed" (e.g. timeout).
                       graph.num_edges() or np.nan,
                       graph.num_vertices() or np.nan]}
    except Exception as exc:
        # Keep the batch running, but make the failure visible.
        warnings.warn("process_file(%r) failed: %r" % (filename, exc))
        return {page: []}

In [4]:
# Fetch the dataset's file listing and index it by file label.
files = json_normalize(data=pd.read_json(CONTENTS_URI)['data']).set_index('label')
file_ids = files['dataFile.id']
filenames = set()

# At most five download rounds; each round only retries the ids that failed before.
for _ in range(5):
    with ProcessPoolExecutor(num_processes) as pool:
        fn = list(tqdm(pool.map(download_datafile, file_ids, chunksize=1), total=len(file_ids)))
    # A result is (filename, id); filename is falsy when the download failed.
    filenames = filenames | {name for name, _file_id in fn if name}
    file_ids = [file_id for name, file_id in fn if not name]
    if not file_ids:
        break








In [5]:
# Number of distinct file names collected by the download rounds.
len(filenames)

161

In [6]:
# Compute the per-page statistics in eight worker processes, one file per task.
with ProcessPoolExecutor(8) as pool:
    pages = list(tqdm(pool.map(process_file, filenames, chunksize=1), total=len(filenames)))

# Every worker result is a small {page: values} dict; merge them into one table
# with one row per page.
stats = pd.DataFrame.from_dict(
    {page: values for result in pages for page, values in result.items()},
    orient='index',
    columns=["Posts", "Users", "Comments", "Likes", 'Edges', 'Nodes'])





In [7]:
# Persist the statistics table as a pickle file, relative to the working directory.
stats.to_pickle('stats_01.pkl')

In [8]:
# IPython.display is the public import location; IPython.core.display is deprecated.
from IPython.display import HTML, Latex, display

# Row count of the raw table, including the row of the combined file if present.
print(len(stats))

# Describe and sum the SAME frame, so the combined all-pages file can never be
# counted in `Sum` while being excluded from the other metrics.
page_stats = stats.drop(['00__combinedPageInteractions'], errors='ignore')
s = page_stats.describe().T[['mean', 'std', 'min', '25%', '50%', '75%', 'max']]
s['Sum'] = page_stats.sum()
s = s.rename(index={'Number of edges': 'Edges', 'Number of nodes': 'Nodes'},
             columns={'mean': "Mean", 'std': 'Std.', 'min': 'Min', '25%': '$Q1$',
                      '50%': 'Median', '75%': '$Q3$', 'max': 'Max'})
s.columns.name = 'Metric'
s.index.name = None

# locale.format() was removed in Python 3.12; locale.format_string() takes the
# same arguments and produces the same grouped integers.
s.to_latex('article/pageStats.tex', bold_rows=True, escape=False,
           float_format=lambda x: "$%s$" % locale.format_string("%d", x, grouping=True))

display(HTML(s.to_html(float_format=lambda x: locale.format_string("%d", x, grouping=True))))


161


Metric,Mean,Std.,Min,$Q1$,Median,$Q3$,Max,Sum
Posts,11950,30472,4,874,3088,9588,236499,1912016
Users,2544019,11408810,182,26589,180314,897564,113374887,407043116
Comments,603743,2580485,37,10638,45592,230205,27550352,96599002
Likes,7557290,33899018,384,54923,442424,2589165,308495988,1209166428
Edges,26598994,56070956,41,163362,1775807,16519553,274925107,3750458258
Nodes,636588,1447806,48,19984,117437,617935,9333469,89758965


In [9]:
# Page id -> display name lookup, indexed by the `Page` column.
names = pd.read_csv("data/names.csv", dtype={0: str, 1: str}).set_index('Page')

# Per-page table without the combined all-pages file; drop() already returns a
# new frame, so `stats` itself is not modified.
s = stats.drop(['00__combinedPageInteractions'], errors='ignore')
s['Id'] = s.index
s = s.rename(index=str, columns={'Number of edges': 'Edges', 'Number of nodes': 'Nodes'})

# A zero count means the projection was not computed; store it as missing.
# (`pd.np` was removed from pandas, so the masking no longer relies on it.)
for column in ('Edges', 'Nodes'):
    s[column] = s[column].mask(s[column] == 0.0)

print(len(s))

def name_mapping(x, latex_escape=False):
    """Return the display name for one row of the per-page table.

    Parameters
    ----------
    x : pandas.Series
        A row providing `Id` (looked up in the module-level `names` table)
        and `Edges`.
    latex_escape : bool
        Escape LaTeX special characters in the name.

    Returns
    -------
    str
        The page name, or the page id when the page has no name (missing from
        `names`, or its name is a float, i.e. NaN). A double-dagger marker is
        appended when `Edges` is missing.
    """
    # Look the name up once; unknown pages fall back to their id instead of
    # raising KeyError.
    name = names.loc[x.Id].Name if x.Id in names.index else None

    if name is None or isinstance(name, float):
        ret = str(x.Id)
    elif latex_escape:
        # Backslashes must be replaced first, otherwise the escapes added
        # below would be escaped again.
        ret = (str(name).replace('\\', '\\textbackslash ')
                        .replace('_', '\\_')
                        .replace('%', '\\%').replace('$', '\\$')
                        .replace('#', '\\#').replace('{', '\\{')
                        .replace('}', '\\}').replace('~', '\\textasciitilde ')
                        .replace('^', '\\textasciicircum ')
                        .replace('&', '\\&'))
    else:
        ret = str(name)

    if pd.isna(x['Edges']):
        # Raw string: "\d" is not a valid escape sequence in a normal literal.
        ret = ret + r"$^\ddagger$"

    return ret

"""Save table as latex file."""
# LaTeX export: names are LaTeX-escaped. Missing counts are turned into +inf so
# those pages sort last, and +inf is then rendered as an empty cell.
# locale.format() was removed in Python 3.12; locale.format_string() is the
# drop-in replacement.
s['Name'] = s.apply(lambda x: name_mapping(x, True), axis=1)
s.fillna(np.inf).sort_values(['Edges', 'Nodes', 'Users']).reset_index().to_latex(
        'article/allPages.tex',
        escape=False, longtable=True, index=False,
        columns=['Name', 'Id', 'Posts', 'Users', 'Comments', 'Likes', 'Edges', 'Nodes'],
        float_format=lambda x: "" if x == np.inf else locale.format_string("%d", x, grouping=True))

# Inline HTML rendering of the same table, this time with unescaped names.
# (`pd.np` was removed from pandas; numpy is used directly.)
s['Name'] = s.apply(name_mapping, axis=1)
display(HTML(s.fillna(np.inf).sort_values(['Edges', 'Nodes', 'Users'])
             .to_html(index=False,
                      columns=['Name', 'Id', 'Posts', 'Users', 'Comments', 'Likes', 'Edges', 'Nodes'],
                      float_format=lambda x: "" if x == np.inf else locale.format_string("%d", x, grouping=True))))



160


Name,Id,Posts,Users,Comments,Likes,Edges,Nodes
Sustainable Development Policy & Practice / Po...,126385310775420,297,273,37,384,41.0,170.0
Ski-Akademie Schladming,266340220123141,191,949,64,1773,50.0,242.0
Royal Club Consulting,203841476411447,445,182,38,1300,52.0,48.0
Chaddsford Winery,493075535051,126,480,160,814,295.0,473.0
Say NO to Bullying,246881265345291,299,469,129,791,305.0,440.0
488878434469215,488878434469215,492,1889,224,4872,350.0,1855.0
Posthotel Schladming,184073274591,142,600,162,1806,426.0,588.0
"Play Station, Nitendo, X Boxs E.t.c",10237302714,86,390,143,385,1002.0,401.0
Energy Saver,121219973056,669,2909,644,6397,1104.0,2904.0
Dourakis Winery,106096217726,253,3048,400,7042,1585.0,2784.0
