In [1]:
%matplotlib inline
import sys
import pandas as pd
import matplotlib.pylab as plt
import re
import numpy as np
import pickle
import datetime as dt # for ticking POSIX clock see http://www.ucolick.org/~sla/leapsecs/epochtime.html
import time
import math
from collections import Counter
import networkx as nx
import graphviz
import pygraphviz

In [3]:
# ----------------- #
## Handle raw data ##
# ----------------- #

def datetime_to_posix(year,month,day,hour=0,minute=0,second=0):
    """Convert calendar fields to a POSIX timestamp.

    The fields are handed to ``time.mktime``, so they are interpreted in the
    local timezone of the machine running the notebook. The result is a float
    number of seconds.
    """
    moment = dt.datetime(year, month, day, hour, minute, second)
    return time.mktime(moment.timetuple())
                       
                       
def time_bin_network(df, window=3600):
    """Return iterator to partition network in 'window'-sized bins.

    Bin ``i`` holds every row whose timestamp lies in the half-open interval
    ``[min_t + i*window, min_t + (i+1)*window)``, where ``min_t`` is the
    smallest timestamp in ``df``. Every row therefore lands in exactly one
    bin, including the rows carrying the earliest and the latest timestamp.

    Parameters
    ----------
    df : network dataframe
        Pandas dataframe with a numeric "timestamp" column.
    window : bin-size number
        Size of network bins in seconds. This number controls
        the amount of bins. Must be at least 300.

    Returns
    -------
    df_slice : iterator
        Iterator object that yields one bin (a dataframe) at a time, in
        chronological order. Bins that contain no rows are still yielded
        (as empty dataframes), so a bin's position encodes its time offset.
        An empty ``df`` yields no bins at all.

    Raises
    ------
    AssertionError
        If ``window`` is smaller than 300. Because this is a generator, the
        check runs when the first bin is requested.
    """

    # Raised explicitly (rather than via `assert`) so the check survives `python -O`.
    if not window >= 300:
        raise AssertionError("'window' must be greater than, or equal to 300.")

    col_t = df["timestamp"]
    if len(col_t) == 0:
        return

    min_t = col_t.min()
    max_t = col_t.max()

    # floor(...) + 1 guarantees that the bin starting at or before max_t exists,
    # even when the timespan is an exact multiple of the window.
    n_splits = int(math.floor((max_t - min_t) / float(window))) + 1

    for i in range(n_splits):
        # Both edges come from the same formula, so the upper edge of bin i is
        # exactly the lower edge of bin i+1: no gaps and no overlaps.
        lower_edge = min_t + i * window
        upper_edge = min_t + (i + 1) * window
        in_bin = (col_t >= lower_edge) & (col_t < upper_edge)
        yield df[in_bin]
        
        
def dump_binned_network(df, binsize, filename, kind):
    """Bin a network with time_bin_network and pickle the list of bins to disk

    Parameters
    ----------
    df : network dataframe
        Pandas dataframe with network
    binsize : int
        Size of bins (in seconds) in resulting network
    filename : str
        Name of resulting file, without the '.pickle' extension
    kind : str
        Name of the sub-directory of
        '../Data/processed_data/binned_networks/' the file is written to.
        The directory must already exist.
    """

    binned_network = list(time_bin_network(df, window=binsize))

    # Pickle data is binary, so the file has to be opened in binary mode;
    # text mode fails on Python 3 and can corrupt the data on Windows.
    with open('../Data/processed_data/binned_networks/'+kind+'/'+filename+'.pickle', 'wb') as outfile:
        pickle.dump(binned_network, outfile)
        
            
def load_binned_network(kind,filename):
    """Load a pickled binned network from disk

    Parameters
    ----------
    kind : str
        Name of the sub-directory of
        '../Data/processed_data/binned_networks/' the file lives in.
    filename : str
        Name of the file, without the '.pickle' extension

    Returns
    -------
    The unpickled object stored in the file.
    """
    # Pickle data is binary, so the file has to be opened in binary mode.
    # NOTE: unpickling executes arbitrary code - only load files you created yourself.
    with open('../Data/processed_data/binned_networks/'+kind+'/'+filename+'.pickle', 'rb') as infile:
        return pickle.load(infile)

In [4]:
# ----------------------- #
## Reformat layered data ##
# ----------------------- #

def network_reformat_multiplex(layers, halflife=-1, w5min=1, norm_intra="False", expmult=1, laydiff=10 ):
    """Return multiplex representation of a layered network

    Parameters
    ----------
    layers : sequence of pandas df formatted layers
        One dataframe per layer, each with "user1", "user2" and "timestamp"
        columns. Layers without rows contribute no edges.
    halflife : number
        Halflife in seconds of relax-rate decay between layers.
        Defaults to -1, which stands for an infinite halflife; in that case
        no inter-layer edges are written.
    w5min : number
        Weight used instead of 1 for intra-layer edges that occur exactly
        once in a layer.
    norm_intra : truthy or falsy value
        When truthy, intra-layer weights are divided by the largest edge
        count of their layer. NOTE: the default is the *string* "False",
        which is truthy, so normalisation is ON unless a falsy value (e.g.
        the boolean False) is passed explicitly. The default is kept as is
        so existing callers get unchanged results.
    expmult : number
        Multiplier applied to the inter-layer decay weight.
    laydiff : number
        Largest distance in layer index for which inter-layer edges are written.

    Returns
    -------
    net_file : string
        A network string in multiplex format
    int_to_hash : dict
        Key-value pairs of node integer id and original hash id (as str)
    """

    # Materialise, because the layers are traversed several times below.
    layers = list(layers)

    # Infomap will only work with node ids as indices.

    # Get all node ids (original md5 hash values), per layer and overall
    layer_nodes = [set(df["user1"]) | set(df["user2"]) for df in layers]
    nodes = set().union(*layer_nodes)

    ##########################
    ## Add vertices to file ##
    ##########################

    # Lines are collected in a list and joined once at the end; repeatedly
    # extending one big string is much slower for large networks.
    out_lines = ["*Vertices %d" % len(nodes)]

    # Node name book-keeping, and adding to file. Sorting makes the integer
    # ids reproducible from run to run (plain set order is arbitrary).
    hashid_to_intid = {}
    intid_to_hashid = {}
    for intid, n in enumerate(sorted(nodes, key=str), start=1):
        hashid = str(n)
        out_lines.append('%d "Node %s" 1.0' % (intid, hashid))
        hashid_to_intid[hashid] = intid
        intid_to_hashid[intid] = hashid

    #############################
    ## Add Intra-edges to file ##
    #############################

    out_lines.append("*Multiplex")
    out_lines.append("# Intra edges: layer node layer node weight")

    for l, df in enumerate(layers):
        # Edge weight = number of times the (user1, user2) pair occurs in the layer.
        # REDUNDANT FOR 5MINS TIMESLICES BECAUSE PPL ONLY MEET ONCE HERE.
        edge_counts = Counter(zip(df["user1"], df["user2"]))
        if not edge_counts:
            continue  # empty layer: nothing to write, and no maximum to take

        maxw = max(edge_counts.values())

        for (i, j), w in edge_counts.items():

            if w == 1:
                w = w5min  # reduce infomap confusion by random encounters

            if norm_intra:
                w = w / float(maxw)

            # The id map is keyed by str(node), so look nodes up the same way.
            # +1 because 1 is first layer index
            out_lines.append('%d %s %d %s %f' % (l+1, hashid_to_intid[str(i)],
                                                 l+1, hashid_to_intid[str(j)], w))

    #############################
    ## Add Inter-edges to file ##
    #############################

    out_lines.append("# Inter edges: layer node layer node weight")

    # Infinite halflife (represented as -1)
    if halflife == -1:
        return "\n".join(out_lines), intid_to_hashid

    # Relax decay function
    tau = halflife / np.log(2)

    def N(t):
        return expmult * np.exp(-t / float(tau))

    for l1, df1 in enumerate(layers):
        if len(df1) == 0:
            continue
        t1 = df1['timestamp'].values[0]

        # Only later layers that are at most `laydiff` layers away are linked.
        for l2 in range(l1 + 1, len(layers)):
            if l2 - l1 > laydiff:
                break
            df2 = layers[l2]
            if len(df2) == 0:
                continue

            common_nodes = layer_nodes[l1] & layer_nodes[l2]
            if not common_nodes:
                continue

            # The time of a layer is taken to be the timestamp of its first row.
            time_diff = df2['timestamp'].values[0] - t1
            weight = N(time_diff)

            for n in sorted(common_nodes, key=str):
                intid = hashid_to_intid[str(n)]
                out_lines.append('%d %s %d %s %f' % (l1+1, intid, l2+1, intid, weight))

    return "\n".join(out_lines), intid_to_hashid


def network_reformat_uniplex(layers):
    """Build one undirected, weighted networkx graph per timeslice-layer

    Parameters
    ----------
    layers : pandas df formatted timeslice-layers
        Each layer needs a "user1" and a "user2" column.

    Returns
    -------
    graphlist : list of networkx.classes.graph objects
        One graph per layer, in layer order. Only nodes that take part in
        at least one edge appear in a graph.
    """

    graphlist = []

    for df in layers:
        # Weight of an edge = how often the (user1, user2) pair occurs in the layer.
        # REDUNDANT FOR 5MINS TIMESLICES BECAUSE PPL ONLY MEET ONCE HERE.
        # NOTE(review): (a, b) and (b, a) are counted as different pairs here;
        # presumably pairs are stored in one canonical order - confirm.
        pair_counts = Counter(zip(df["user1"], df["user2"]))
        weighted_edges = [(pair[0], pair[1], count) for pair, count in pair_counts.items()]

        layer_graph = nx.Graph()
        layer_graph.add_weighted_edges_from(weighted_edges)
        graphlist.append(layer_graph)

    return graphlist

In [5]:
### Load raw data and cut a specific month out
#df_blue = pd.read_csv('../Data/raw_data/all.csv', sep=" ").loc[:,['#','user1','user2','timestamp']]
df_blue = pd.read_csv('../Data/raw_data/short.csv', sep=" ").loc[:,['#','user1','user2','timestamp']]
# The four selected columns are relabelled; presumably the header line of the
# file starts with a '#' token, which shifts every parsed label by one - confirm.
df_blue.columns = ['user1','user2','timestamp','duration']

# feb14 starts 2014-02-01 00:00:00 and ends 2014-02-28 23:59:59 (local time, both ends included)
startdate = datetime_to_posix(2014,2,1)
enddate = datetime_to_posix(2014,2,28,23,59,59)
# One combined mask with inclusive bounds: records stamped exactly at the start
# or end of the month are kept, and the mask always matches df_blue's index
# (df[a][b] would apply the second mask to an already filtered frame).
in_feb14 = (df_blue['timestamp'] >= startdate) & (df_blue['timestamp'] <= enddate)
feb14 = df_blue[in_feb14]



In [7]:
# Show the first rows of df_blue as a quick sanity check of the loaded table.
df_blue.head()

Unnamed: 0,user1,user2,timestamp,duration
0,51bbc59b59c49e7af848d5172ede97,a86f60d62eaad69ef35e36650fc10b,1391209200,300
1,2b8ead75b7a5ab690c49be08382131,3e003ad85a725fca24569f18f7fdb5,1391209200,300
2,a94b118a62ebe7883a42d8966003b5,d224183095b2fab5f490b9c567a8fb,1391209200,300
3,a5a21108856c86bbee025c40bdf05a,c71b8920ce278a5ad6d6da2b0f8df8,1391209200,300
4,688c883dc4b386437861c4333ecaa6,c71b8920ce278a5ad6d6da2b0f8df8,1391209200,300


In [5]:
### Bin the selected month into timeslices of various lengths, and pickle for future use

# Labels used in the file names, and the matching bin widths in seconds (same order).
bin_names = ['5mins', '15mins', '30mins', 'hourly', '4hourly', 'daily', 'weekly']
bins = [300, 900, 1800, 3600, 14400, 86400, 604800]

for bs,bn in zip(bins,bin_names):
    target_name = 'feb14' + '_' + bn
    dump_binned_network(
        feb14,
        bs,
        target_name,
        'bluetooth',
    )
    print(bn)  # progress marker: one line per finished bin size

5mins
15mins
30mins
hourly
4hourly
daily
weekly


In [4]:
# Unpickling the pickles: one list of bins per bin size, loaded in the order listed.
(feb14_5mins,
 feb14_15mins,
 feb14_30mins,
 feb14_hourly,
 feb14_4hourly,
 feb14_daily,
 feb14_weekly) = tuple(
    load_binned_network('bluetooth', stem)
    for stem in ('feb14_5mins',
                 'feb14_15mins',
                 'feb14_30mins',
                 'feb14_hourly',
                 'feb14_4hourly',
                 'feb14_daily',
                 'feb14_weekly')
)

In [6]:
### Cut out a specific run of days for binsizes < 1 day

# Number of bins that make up one day, per bin size
bins_pr_day = {'5mins': 288, '15mins': 96, '30mins': 48, 'hourly': 24, '4hourly': 6 }
weekstart = 9 # index of the first day to keep (monday = 2)
datadays = 7 # how many days of data

# Day indices of the slice: [day_lo, day_hi)
day_lo = weekstart
day_hi = weekstart + datadays

week1_5mins = feb14_5mins[bins_pr_day['5mins'] * day_lo : bins_pr_day['5mins'] * day_hi]
week1_15mins = feb14_15mins[bins_pr_day['15mins'] * day_lo : bins_pr_day['15mins'] * day_hi]
week1_30mins = feb14_30mins[bins_pr_day['30mins'] * day_lo : bins_pr_day['30mins'] * day_hi]
week1_hourly = feb14_hourly[bins_pr_day['hourly'] * day_lo : bins_pr_day['hourly'] * day_hi]
week1_4hourly = feb14_4hourly[bins_pr_day['4hourly'] * day_lo : bins_pr_day['4hourly'] * day_hi]
week1_daily = feb14_daily[day_lo : day_hi]

# Sanity check: ISO weekday of the earliest record in the first daily bin (1 = Monday)
first_timestamp = min(week1_daily[0]["timestamp"])
dt.datetime.fromtimestamp(first_timestamp).isoweekday()

1

In [11]:
# Cut a single day out of the 5-minute bins.
# NOTE: this rebinds bins_pr_day, weekstart and datadays for every cell run afterwards.
bins_pr_day = {'5mins': 288, '15mins': 96, '30mins': 48, 'hourly': 24, '4hourly': 6 }
weekstart = 2 # index of the first day to keep (monday = 2)
datadays = 1 # how many days of data

per_day = bins_pr_day['5mins']
mon1_5mins = feb14_5mins[per_day * weekstart : per_day * (weekstart + datadays)]

# Sanity check: ISO weekday of the earliest record in the first bin (1 = Monday)
first_timestamp = min(mon1_5mins[0]["timestamp"])
dt.datetime.fromtimestamp(first_timestamp).isoweekday()

1

In [12]:
# Store the single-day 5-minute bins next to the other binned networks.
# Pickle data is binary, so the file is opened in binary mode ('wb').
with open('../Data/processed_data/binned_networks/bluetooth/mon1_5mins.pickle', 'wb') as outfile:
    pickle.dump(mon1_5mins, outfile)

In [8]:
### Store the binned data as lists of networkx graphs to be visualized
# File names and layer lists are paired position-by-position.
uniplex_network_filenames = ["feb14_5min", "feb14_15min", "feb14_30min",
                             "feb14_hourly", "feb14_4hourly", "feb14_daily", "feb14_weekly"]
layers_list = [feb14_5mins , feb14_15mins, feb14_30mins, feb14_hourly, feb14_4hourly, feb14_daily, feb14_weekly]

#uniplex_network_filenames = ["feb14_monday1_5min", "feb14_monday1_15min", "feb14_monday1_30min",
#                             "feb14_monday1_hourly", "feb14_monday1_4hourly", "feb14_daily", "feb14_weekly"]
#layers_list = [oneday_5mins , oneday_15mins, oneday_30mins, oneday_hourly, oneday_4hourly, feb14_daily, feb14_weekly]

#uniplex_network_filenames = ["feb14_4hourly"]                           
#layers_list = [ feb14_4hourly ] 

for filename, layers in zip(uniplex_network_filenames, layers_list) :
    graphlist = network_reformat_uniplex( layers )
    # Pickle data is binary, so the file is opened in binary mode ('wb').
    with open('pickle_uniplex/bluetooth/'+filename+'.pickle', 'wb') as outfile:
        pickle.dump(graphlist, outfile )

In [15]:
# Same export as for the whole month, now for the single-day 5-minute bins
# (the "feb14_5min" file is written once more as well).
# NOTE: this rebinds uniplex_network_filenames and layers_list.
uniplex_network_filenames = ["mon1_5mins","feb14_5min"]
layers_list = [mon1_5mins,feb14_5mins]

for filename, layers in zip(uniplex_network_filenames, layers_list) :
    graphlist = network_reformat_uniplex( layers )
    # Pickle data is binary, so the file is opened in binary mode ('wb').
    with open('pickle_uniplex/bluetooth/'+filename+'.pickle', 'wb') as outfile:
        pickle.dump(graphlist, outfile )

In [16]:
### Store binned data (layered data) as multiplex network in pajek .net format for infomap to use
import timeit
tic=timeit.default_timer()


muwhat=2.0552 # for SHORT feb14 5min bins #5.1675333 # for feb14 5min bins # 4.4557725 # for feb14 30min bins
tau = 300#1800#14400#86400  (passed on as the halflife, in seconds)
e=0.0001 # small offset so that np.arange includes the intended end point
muwhatmults=np.arange(0.02,0.1001,0.02).tolist() + np.around(np.arange(0.15,1+e,0.05),3).tolist() + np.arange(1.5,4.5+e,0.5).tolist()

# One .net file plus one id-lookup pickle per multiplier
for muwhatmult in muwhatmults:
    netfile,intid_to_hashid = network_reformat_multiplex(mon1_5mins, halflife=tau, w5min=1, expmult=muwhatmult*muwhat)
    multiplexname = "mon1_5min_intra_norm_inter_2decay_{}muwhat".format(muwhatmult)
    with open("netfiles/bluetooth/"+multiplexname+".net", 'w') as outfile:
        outfile.write(netfile)
    # Pickle data is binary, so the file is opened in binary mode ('wb').
    with open('int2hash/bluetooth/'+multiplexname+'.pickle', 'wb') as outfile:
        pickle.dump(intid_to_hashid, outfile)

# Elapsed wall-clock seconds; print() with one argument works on Python 2 and 3.
print(timeit.default_timer() - tic)

25.002956152


In [19]:
# Export the whole month of 5-minute bins as one multiplex network, for a single multiplier.
muwhat=2.0552 # for SHORT feb14 5min bins #5.1675333 # for feb14 5min bins # 4.4557725 # for feb14 30min bins
tau = 300#1800#14400#86400  (passed on as the halflife, in seconds)
muwhatmult=0.5
netfile,intid_to_hashid = network_reformat_multiplex(feb14_5mins, halflife=tau, w5min=1, expmult=muwhatmult*muwhat)
multiplexname = "feb14_5min_intra_norm_inter_2decay_{}muwhat".format(muwhatmult)
with open("netfiles/bluetooth/"+multiplexname+".net", 'w') as outfile:
    outfile.write(netfile)
# Pickle data is binary, so the file is opened in binary mode ('wb').
with open('int2hash/bluetooth/'+multiplexname+'.pickle', 'wb') as outfile:
    pickle.dump(intid_to_hashid, outfile)