In [1]:
import numpy as np
import pandas as pd
from itertools import product
import os
from tqdm import tqdm

In [2]:
# Locate files
source = os.path.abspath( '/media/Data_storage/Mobilcell/Data/' ) + '/' # '/mnt2/data/csv/' #'/media/Data_storage/Mobilcell/Data/'
source_pol = os.path.abspath( '/media/Data_storage/Mobilcell/DayPolygonData/' ) + '/' #'/media/Data_storage/Mobilcell/DayPolygonData/'
target = os.path.abspath( '/media/Data_storage/Mobilcell/DayGraphData/' ) + '/' # '/mnt/DayGraphData/'  #'/media/Data_storage/Mobilcell/DayGraphData/'

files_events = np.array( sorted([ i for i in os.listdir(source) if 'EVENTS' in i]) )
files_poligons = np.array( sorted([ i for i in os.listdir(source) if 'POLIGONS' in i]) )
files_events_cleaned = np.array( sorted([ i for i in os.listdir(source) if 'Events' in i]) )


In [3]:
# load data
print("loading input data")
#poligons_df = pd.read_csv( source+poligons_path, delimiter=';' )
events_df = pd.read_csv( source+files_events[0], delimiter=';' )

loading input data


In [None]:
towers_df = pd.read_csv( source_pol+'fixed_merged_tower_locations.csv' )

# calculate tower encodings
tower_id = np.unique( towers_df.original_id.values )
tower_to_int = dict( zip( towers_df.original_id.tolist(), towers_df.tower_id.tolist() ) )

tower_coords_all = dict( zip( towers_df.values[:,1].tolist(), 
                              towers_df.values[:,2:4].tolist() ) )

# add encodings to the dataframes
print("encode ids of events")
#poligons_df["tower_idx"] = [ tower_to_int[i] for i in poligons_df.network_identifier.values ]
events_df["tower_idx"] = [ tower_to_int[i] for i in events_df.network_identifier.values ]

# calculate event (equipment id) encodings
event_id = np.unique( events_df.equipment_identifier.values )
event_to_int = dict(zip( event_id, np.arange(event_id.shape[0]) ))

# add event encodings to the dataframes
events_df["event_idx"] = [ event_to_int[i] for i in events_df.equipment_identifier.values ]

# partitioning the event dataframe to track individual events
eq_diff_idx = np.where( np.diff(events_df.event_idx.values) )[0]+1
# insert first element (zero) ## otherwise left out!
eq_diff_idx = np.insert(eq_diff_idx, 0, 0, axis=0)
# insert last element (size of array) ## otherwise left out!
eq_diff_idx = np.append( eq_diff_idx, events_df.event_idx.values.shape[0] )

# calculate trajectories of events (to which tower it connects to and when)
eq_trajectories_towers = []
eq_trajectories_time = []
print("calculate trajectories of events..")
for i in tqdm( range( eq_diff_idx.shape[0]-1 ) ):
    start_ = eq_diff_idx[i]
    end_ = eq_diff_idx[i+1]
    eq_trajectories_towers.append( events_df.tower_idx.values[ start_:end_ ] )
    eq_trajectories_time.append( events_df.event_datetime.values[ start_:end_ ] )

# these arrays holds the information about the events
eq_trajectories_towers = np.array( eq_trajectories_towers ) # array of variable length arrays!
eq_trajectories_time = np.array( eq_trajectories_time ) # array of variable length arrays!

# check for unique values and drop events that stand still for the whole day
eq_trajectories_towers_uq = np.array([ np.shape(np.unique(i))[0] for i in eq_trajectories_towers ])
eq_trajectories_time = eq_trajectories_time[ eq_trajectories_towers_uq > 1 ]
eq_trajectories_towers = eq_trajectories_towers[ eq_trajectories_towers_uq > 1 ]

In [5]:
# load datetime variable to code time as minutes
dates_clock = np.loadtxt( "../cellphone/Data/event_datetime.csv").astype(int) # time on clock
dates_time = np.arange( 1440 ) # time in sec
time_to_sec = dict( zip(dates_clock, dates_time) )

In [6]:
# building the day graph, defining it with its edge list.. (it is paralellizable!)
graph_edge_list_raw = []
eps = 1e-6 # add small time to avoid division by zero
print("building the graph, defining it with its edge list..")
for curr in tqdm( range( eq_trajectories_towers.shape[0] )[:10] ):
# get indices
    eq_path = np.vstack( (eq_trajectories_towers[curr][:-1], eq_trajectories_towers[curr][1:]) ).T

    # filter if source and destination is the same
    filt = (eq_path[:,0] != eq_path[:,1])
    eq_path = eq_path[filt]

    # get time in minutes format
    eq_time_ = np.array( list( map(time_to_sec.get, eq_trajectories_time[curr]) ) )
    eq_time_min = eq_time_[1:] - eq_time_[:-1]
    eq_time_min = eq_time_min[filt] / 60 # this becomes hour for calc speed!
    eq_time_ = eq_time_[:-1][filt]

    src_coords = np.array([ tower_coords_all[ m ] for m in eq_path[:,0] ])
    dst_coords = np.array([ tower_coords_all[ n ] for n in eq_path[:,1] ])

    eq_dist_km = np.sqrt( np.sum( ( dst_coords - src_coords )**2, 1 ) ) / 1000

    eq_speed_kmh = (eq_dist_km/eq_time_min+eps).astype(int)
    filt = np.logical_and( eq_speed_kmh > 3, eq_speed_kmh < 180 )
    graph_edge_list_raw.append( np.concatenate( (eq_path[filt], 
                                                 np.ones(filt.sum()).reshape(-1,1) ), 
                                               axis=1).astype(int) )

100%|██████████| 10/10 [00:00<00:00, 4496.95it/s]

building the graph, defining it with its edge list..





In [8]:
print("finalizing the output..")
graph_edge_list_raw = np.concatenate( graph_edge_list_raw )
uq_, counts_ = np.unique(graph_edge_list_raw, axis=0, return_counts=True)
graph_edge_list = np.concatenate( (uq_[:,:2], counts_.reshape(-1, 1)), axis=1 )

finalizing the output..


In [18]:
pd.DataFrame( np.concatenate( ( graph_edge_list, 
                                np.array([ tower_coords_all[s] for s in graph_edge_list[:,0] ]), 
                                np.array([ tower_coords_all[t] for t in graph_edge_list[:,1] ]) ), 
                              axis=1  ), 
              columns=[ ["src", "dst", "weight", 
                         "src_eovx", "src_eovy", 
                         "dst_eovx", "dst_eovy"] ] )

Unnamed: 0,src,dst,weight,src_eovx,src_eovy,dst_eovx,dst_eovy
0,104,337,1,241512,654573,242244,654566
1,188,643,1,218169,643253,221169,640261
2,245,1757,1,240577,655279,240174,655306
3,322,2421,1,228361,646926,217738,638149
4,337,37,1,242244,654566,240098,651662
5,337,104,1,242244,654566,241512,654573
6,360,1054,1,159686,542153,158288,549904
7,572,260,1,234744,641670,244397,641665
8,572,8815,1,234744,641670,234571,641479
9,643,188,1,221169,640261,218169,643253


In [12]:
graph_edge_list.shape, tower_coords_all
      #[ graph_edge_list[:,0] ].shape, tower_coords_all[ graph_edge_list[:,1].shape ] )

((56, 3),
 {9058: [250859, 557285],
  9059: [243343, 558377],
  9060: [247254, 549716],
  9061: [248646, 551068],
  5295: [273888, 796498],
  9062: [270283, 645289],
  298: [235061, 657193],
  851: [234450, 656584],
  1511: [235065, 656754],
  2348: [232774, 656220],
  2349: [232559, 655958],
  2350: [232900, 655898],
  1512: [227058, 655054],
  61: [226467, 654380],
  3632: [238190, 646107],
  2351: [238222, 645524],
  1513: [238667, 645877],
  9063: [296142, 775686],
  2352: [293914, 775433],
  299: [244580, 656217],
  852: [244004, 655558],
  1514: [244577, 655638],
  4: [236347, 650877],
  1515: [245313, 643023],
  853: [244477, 643003],
  1516: [246569, 655653],
  58: [246477, 654805],
  1517: [229216, 655242],
  357: [228695, 654222],
  45: [231459, 653785],
  854: [245101, 655529],
  300: [244431, 654551],
  1518: [245338, 654743],
  6: [238224, 651498],
  0: [239320, 650532],
  301: [232253, 655665],
  1519: [232060, 655102],
  1520: [232545, 655132],
  1: [242702, 650451],
  1

In [None]:



def calculate_day_graph( source, target, poligons_path, events_path ):
    
    # load data
    print("loading input data")
    #poligons_df = pd.read_csv( source+poligons_path, delimiter=';' )
    events_df = pd.read_csv( source+events_path, delimiter=';' )

    # drop poligons outside of the country
    #poligons_df = poligons_df[ np.logical_and( poligons_df.eovx.values < 366660, 
    #                                           poligons_df.eovx.values > 48210 ) ]
    #poligons_df = poligons_df[ np.logical_and( poligons_df.eovy.values < 934219, 
    #                                           poligons_df.eovy.values > 426341 ) ]

    # load global, fix tower data
    towers_df = pd.read_csv( source_pol+'fixed_merged_tower_locations.csv' )

    # calculate tower encodings
    tower_id = np.unique( towers_df.original_id.values )
    tower_to_int = dict( zip( towers_df.original_id.tolist(), towers_df.tower_id.tolist() ) )

    tower_coords_all = dict( zip( towers_df.values[:,1].tolist(), 
                                  towers_df.values[:,2:4].tolist() ) )

    # add encodings to the dataframes
    print("encode ids of events")
    #poligons_df["tower_idx"] = [ tower_to_int[i] for i in poligons_df.network_identifier.values ]
    events_df["tower_idx"] = [ tower_to_int[i] for i in events_df.network_identifier.values ]
    
    # calculate event (equipment id) encodings
    event_id = np.unique( events_df.equipment_identifier.values )
    event_to_int = dict(zip( event_id, np.arange(event_id.shape[0]) ))
    
    # add event encodings to the dataframes
    events_df["event_idx"] = [ event_to_int[i] for i in events_df.equipment_identifier.values ]

    # partitioning the event dataframe to track individual events
    eq_diff_idx = np.where( np.diff(events_df.event_idx.values) )[0]+1
    # insert first element (zero) ## otherwise left out!
    eq_diff_idx = np.insert(eq_diff_idx, 0, 0, axis=0)
    # insert last element (size of array) ## otherwise left out!
    eq_diff_idx = np.append( eq_diff_idx, events_df.event_idx.values.shape[0] )

    # calculate trajectories of events (to which tower it connects to and when)
    eq_trajectories_towers = []
    eq_trajectories_time = []
    print("calculate trajectories of events..")
    for i in tqdm( range( eq_diff_idx.shape[0]-1 ) ):
        start_ = eq_diff_idx[i]
        end_ = eq_diff_idx[i+1]
        eq_trajectories_towers.append( events_df.tower_idx.values[ start_:end_ ] )
        eq_trajectories_time.append( events_df.event_datetime.values[ start_:end_ ] )
    
    # these arrays holds the information about the events
    eq_trajectories_towers = np.array( eq_trajectories_towers ) # array of variable length arrays!
    eq_trajectories_time = np.array( eq_trajectories_time ) # array of variable length arrays!
    
    # check for unique values and drop events that stand still for the whole day
    eq_trajectories_towers_uq = np.array([ np.shape(np.unique(i))[0] for i in eq_trajectories_towers ])
    eq_trajectories_time = eq_trajectories_time[ eq_trajectories_towers_uq > 1 ]
    eq_trajectories_towers = eq_trajectories_towers[ eq_trajectories_towers_uq > 1 ]
    
    # load datetime variable to code time as minutes
    dates_clock = np.loadtxt( "../Data/event_datetime.csv").astype(int) # time on clock
    dates_time = np.arange( 1440 ) # time in sec
    time_to_sec = dict( zip(dates_clock, dates_time) )

    # building the day graph, defining it with its edge list.. (it is paralellizable!)
    graph_edge_list_raw = []
    eps = 1e-6 # add small time to avoid division by zero
    print("building the graph, defining it with its edge list..")
    for curr in tqdm( range( eq_trajectories_towers.shape[0] ) ):
# get indices
        eq_path = np.vstack( (eq_trajectories_towers[curr][:-1], eq_trajectories_towers[curr][1:]) ).T
        
        # filter if source and destination is the same
        filt = (eq_path[:,0] != eq_path[:,1])
        eq_path = eq_path[filt]
        
        # get time in minutes format
        eq_time_ = np.array( list( map(time_to_sec.get, eq_trajectories_time[curr]) ) )
        eq_time_min = eq_time_[1:] - eq_time_[:-1]
        eq_time_min = eq_time_min[filt] / 60 # this becomes hour for calc speed!
        eq_time_ = eq_time_[:-1][filt]
        
        src_coords = np.array([ tower_coords_all[ m ] for m in eq_path[:,0] ])
        dst_coords = np.array([ tower_coords_all[ n ] for n in eq_path[:,1] ])
        
        eq_dist_km = np.sqrt( np.sum( ( dst_coords - src_coords )**2, 1 ) ) / 1000
        
        eq_speed_kmh = (eq_dist_km/eq_time_min+eps).astype(int)
        filt = np.logical_and( eq_speed_kmh > 3, eq_speed_kmh < 180 )
        graph_edge_list_raw.append( np.concatenate( (eq_path[filt], 
                                                     np.ones(filt.sum()).reshape(-1,1) ), 
                                                   axis=1).astype(int) )
    
    # finalizing the edge list by removing duplicate lines and collecting edge weights
    print("finalizing the output..")
    graph_edge_list_raw = np.concatenate( graph_edge_list_raw )
    uq_, counts_ = np.unique(graph_edge_list_raw, axis=0, return_counts=True)
    graph_edge_list = np.concatenate( (uq_[:,:2], counts_.reshape(-1, 1)), axis=1 )
    
    print( graph_edge_list.shape, tower_coords_all[ graph_edge_list[:,0] ].shape, tower_coords_all[ graph_edge_list[:,1].shape ] )
    print("saving results to disk..")
    # saving resulting dataframe to disc with coordinates of the source and destination towers
    savename = target + 'output_day-graph-by-edgelist_' + events_path.split('.csv')[0][-8:]+'.csv.gz'
    print(savename)
    pd.DataFrame( np.concatenate( ( graph_edge_list, 
                                    tower_coords_all[ graph_edge_list[:,0] ], 
                                    tower_coords_all[ graph_edge_list[:,1] ] ), 
                                  axis=1  ), 
                  columns=[ ["src", "dst", "weight", 
                             "src_eovx", "src_eovy", 
                             "dst_eovx", "dst_eovy"] ] ).to_csv( savename, 
                                                                 index=False, 
                                                                 compression='gzip')

# call this function like this:
# calculate_day_graph( source, target, files_poligons[100], files_events[100] )
for q in range( args.start_idx, args.end_idx ):
    calculate_day_graph( source, target, files_poligons[q], files_events[q] )