# New features

## Number of shortest path
Mind that only distance-2 node pairs are used at the moment.

In [241]:
# Slow method, but providing also shortest paths at greater distance:
# [len(list(nx.all_shortest_paths(multigraph_mature, *sample))) for sample in tqdm(np.load('temp/samples.npy'))]

def single_source_number_paths_length_2(graph: nx.Graph, source):
  result = Counter()
  for nb in graph[source]: 
    for nnb in graph[nb]: 
      result[nnb] += 1
  return result

multigraph_mature = nx.read_gpickle(f'{temp_path}/multigraph-mature.pkl')

paths__of_length_2_from_node_u = {
  node: single_source_number_paths_length_2(multigraph_mature, node) 
  for node in np.load(f'{temp_path}/samples.npy')[:,0]}

joblib.dump(
  {Experiment('Number of shortest paths'): 
   np.array([paths__of_length_2_from_node_u[u][v] 
             for u, v in tqdm(np.load(f'{temp_path}/samples.npy'))])}, 
  f'{temp_path}/features/number-shortest-paths-time-unaware.pkl')

  0%|          | 0/20000 [00:00<?, ?it/s]

['01 dblp_coauthor/temp/features/number-shortest-paths-time-unaware.pkl']

## Node attributes

In [95]:
edgelist = joblib.load(f'{temp_path}/edgelist_mature.pkl')
samples=np.load(f'{temp_path}/index_sampled.npy')
verbose=True

def get_node_attributes(edgelist, samples, aggregation_strategy, time_strategy, **kwargs):
  edgelist['datetime_transformed'] = time_strategy(edgelist['datetime'])
  graph = nx.from_pandas_edgelist(
    edgelist, source='u', target='v', edge_attr=True, 
    create_using=nx.MultiGraph)
  
  result = list()
  for u, v in tqdm(samples):
    activity_u = aggregation_strategy(
      [edge_attributes['datetime_transformed'] 
       for nb in graph[u]
       for edge_attributes in graph.get_edge_data(u, nb).values()])
    activity_v = aggregation_strategy(
      [edge_attributes['datetime_transformed'] 
       for nb in graph[v]
       for edge_attributes in graph.get_edge_data(v, nb).values()])
    result.append(activity_u +activity_v)
  return result #!

In [96]:
t = [
    lp.Experiment(
      feature='N (time_aware)', time_strategy=time_str, aggregation_strategy=agg_str,
      scores=get_node_attributes(
        edgelist, samples, aggregation_strategy=agg_func, time_strategy=time_func, 
        verbose=verbose, position=2, leave=False))
    for time_str, time_func in tqdm(lp.TIME_STRATEGIES.items(), position=0, 
                                    disable=not verbose)
    for agg_str, agg_func in tqdm(lp.AGGREGATION_STRATEGIES.items(), position=1, 
                                  leave=False, disable=not verbose)]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

In [90]:
joblib.dump(t, f'{temp_path}/features/N.pkl')

['01/temp/features/N.pkl']

## Calculation: Network statistics

In [38]:
edgelist_dict = {
  'complete_graph': joblib.load(f'{dataset_id}/edgelist.pkl'),
  'mature_graph': joblib.load(f'{dataset_id}/edgelist_mature.pkl'),
  'mature_probe_graph': pd.concat([
    joblib.load(f'{dataset_id}/edgelist_mature.pkl'),
    joblib.load(f'{dataset_id}/edgelist_probe.pkl')])
}

tlp.analysis.network_stats(edgelist_dict, path=f'{dataset_id}/stats', 
                           verbose=True, fraction=0.001)

FileExistsError: [Errno 17] File exists: '01/stats'

## Table: Network Statistics

In [None]:
pd.DataFrame(
  [joblib.load(file).stats for file in os.listdir(f'{dataset_id}/stats')])

## Figure: Path distributions

In [None]:
df = pd.DataFrame(
  [joblib.load(file).path_distribution 
   for file in os.listdir(f'{dataset_id}/stats')])
plt.analysis.plot_path_distributions(df)