In [1]:
# Setting up the source code

import subprocess
import shlex

# Configure the project with waf, then build it. check=True makes a failing
# step raise CalledProcessError instead of being silently ignored, so the
# build is never attempted after a failed configure and later cells do not
# run against a missing or stale binary.
subprocess.run(shlex.split('python ./waf-2.0.24 configure'), check=True)
subprocess.run(shlex.split('python ./waf-2.0.24'), check=True)

[32m[0mSetting top to                           :[0m [0m[32m[32m/Users/sed_zeppelin/Master Project[0m [0m
[32m[0mSetting out to                           :[0m [0m[32m[32m/Users/sed_zeppelin/Master Project/bin[0m [0m
[32m[0mChecking for 'clang++' (C++ compiler)    :[0m [0m[32m[32m/usr/bin/clang++[0m [0m
[32m[0mUnpacking gtest                          :[0m [0m[32m[32myes[0m [0m
[32m[0mChecking for library pthread             :[0m [0m[32m[32myes[0m [0m
[32m'configure' finished successfully (0.545s)[0m
[32mWaf: Entering directory `/Users/sed_zeppelin/Master Project/bin'[0m
[32mWaf: Leaving directory `/Users/sed_zeppelin/Master Project/bin'[0m
[32m'build' finished successfully (0.110s)[0m


CompletedProcess(args=['python', './waf-2.0.24'], returncode=0)

In [2]:
# Wrapping the box_cover shell command in a Python function (runAlgorithm)


# import subprocess
# import shlex
#  'random_seed': '114514'

def runAlgorithm(**kwargs):
    """Run the ``./bin/box_cover`` executable with the given options.

    Each option is passed on the command line as ``-<name>=<value>``. Keyword
    arguments override the option of the same name in the defaults below;
    keywords that are not in the defaults are appended as additional options.

    The command is handed to ``subprocess.run`` as an argument list, so a
    value containing spaces or quotes (for example the default generator
    spec ``"flower 1000 1 2"`` or a file path with blanks) reaches the
    program as one intact argument without any manual quoting.

    Args:
        **kwargs: option names mapped to their values.

    Returns:
        None. The exit status of the program is not inspected.
    """

    # Default arguments of the method:
    settings = {'type': 'gen', 'graph': "flower 1000 1 2", 'method': 'sketch', 'alpha': '1', 'least_coverage': '1',
                'sketch_k': '128', 'multipass': '10000', 'rad_min': '1', 'rad_max': '30', 'random_seed': '42'}
    settings.update(kwargs)

    # One argv entry per option; no shell-style quoting or splitting involved.
    command = ['./bin/box_cover']
    command.extend(f"-{key}={value}" for key, value in settings.items())

    subprocess.run(command)

In [3]:
# Reading the data:

import pandas as pd

# Load the tab-separated co-authorship table. get_aps() later reads the
# 'domain', 'fact_u', 'fact_v', 'time' and 'weight' columns of this frame.
df = pd.read_csv('./coaut.txt', sep='\t')

In [4]:
# Preparing APS dataset for the experiments:

import numpy as np
import itertools
import csv
from graph_tool.all import *


def get_aps(data, begin_time, end_time, journal, out_path='./apstest'):
    """Build the co-authorship graph of one journal for a time window.

    Steps, in order: keep the rows of ``journal``; keep only rows with
    ``fact_u > fact_v``; drop repeated (fact_u, fact_v) pairs; keep the rows
    whose month lies in ``[begin_time, end_time]``; drop rows whose author
    count is at least two standard deviations above the mean; relabel the
    vertices to 0..n-1; build an undirected graph and write the edges of its
    largest connected component to ``out_path``.

    The time window is selected by comparing the first seven characters of
    the 'time' column with the bounds, so it does not require the rows to be
    sorted by time or any row to fall exactly in the first or last month.
    NOTE(review): this relies on 'time' being an ISO-like 'YYYY-MM...'
    string so that lexicographic order equals chronological order - confirm.

    Args:
        data: DataFrame with the columns 'domain', 'fact_u', 'fact_v',
            'time' and 'weight'.
        begin_time: first month of the window, e.g. '1993-01' (inclusive).
        end_time: last month of the window, e.g. '1993-12' (inclusive).
        journal: value of the 'domain' column to keep.
        out_path: file the tab-separated edge list of the largest connected
            component is written to.

    Returns:
        (edgelist, largest_comp, G): the relabelled edge list as a list of
        two-element lists, the GraphView restricted to the largest connected
        component, and the full graph.
    """

    # Shrinking the data according to the journal:
    df = data.loc[data['domain'] == journal]

    # Removing self-loops and keeping a single orientation of every pair.
    # NOTE(review): presumably each edge is listed in both directions, so
    # nothing is lost by discarding the fact_u < fact_v rows - confirm.
    df = df[df['fact_u'] > df['fact_v']]

    # Removing parallel edges (the first occurrence of each pair is kept;
    # this happens before the time filter, as in the original pipeline):
    df = df.drop_duplicates(subset=['fact_u', 'fact_v'])

    # Shrinking the data according to the time period:
    months = df['time'].str[:7]
    df = df[(months >= begin_time) & (months <= end_time)]

    # Removing outliers. The number of authors of a row is taken to be
    # round(1 / weight); it is kept in a separate Series rather than written
    # into the filtered frame, which avoids pandas' SettingWithCopyWarning.
    no_authors = df['weight'].apply(lambda weight: round(1 / weight))
    threshold = no_authors.mean() + 2 * no_authors.std()
    df = df[no_authors < threshold]

    # Extracting the edges:
    edgelist = df[['fact_u', 'fact_v']].values.tolist()

    # Re-labelling edgelist so that vertex ids are 0..n-1:
    old_labels = set()
    for edge in edgelist:
        old_labels.update(edge)
    vertex_mapping = {old: new for new, old in enumerate(old_labels)}  # old_label -> new_label
    edgelist = [[vertex_mapping[u], vertex_mapping[v]] for u, v in edgelist]

    # Getting the largest connected component and writing it to file:
    G = Graph(directed=False)
    G.add_edge_list(edgelist)

    largest_comp = GraphView(G, vfilt=label_largest_component(G))

    edges = np.array(largest_comp.get_edges())
    np.savetxt(out_path, edges, fmt='%d', delimiter='\t')

    return edgelist, largest_comp, G

# aps = get_aps(df, '1993-01', '1993-12', 'PRE')

In [5]:
import glob
import os
import json


# The function for reading size and radius from the output file:

def get_size_radius(log_dir='./jlog'):
    """Read 'size' and 'radius' from the most recent JSON log in ``log_dir``.

    "Most recent" is the file with the largest ``os.path.getctime`` value
    among all entries of the directory.

    Args:
        log_dir: directory holding the JSON log files.

    Returns:
        (size, radius): the values stored under the 'size' and 'radius' keys.

    Raises:
        ValueError: if ``log_dir`` contains no files.
    """

    list_of_files = glob.glob(os.path.join(log_dir, '*'))
    if not list_of_files:
        # max() on an empty list would raise a ValueError with an unhelpful message.
        raise ValueError(f"no log files found in {log_dir!r}")
    latest_file = max(list_of_files, key=os.path.getctime)

    # The context manager closes the handle even if parsing fails.
    with open(latest_file, "r") as log_file:
        json_file = json.load(log_file)

    radius = json_file['radius']
    size = json_file['size']

    print (f"radius: \n{radius}\n")
    print (f"size: \n{size}")

    return size, radius

In [6]:
from scipy.stats import linregress

# The function for fitting the lines to the data:

def fit(get_size_radius):
    """Fit a power law and an exponential to box counts versus box radius.

    ``get_size_radius`` is an indexable whose element 0 holds the sizes and
    element 1 the radii (the shape returned by the function of the same name).
    Both fits are linear regressions on log10(size): the power law against
    log10(radius), the exponential against the raw radius.

    Returns a 9-tuple: power-law slope, exponential slope, the two
    ``linregress`` results (power law, exponential), log10(radius),
    log10(size), the fitted power-law line, the fitted exponential line, and
    the sizes exactly as they were passed in.
    """

    radii_in = get_size_radius[1]
    sizes_in = get_size_radius[0]

    radii = np.array(radii_in)
    log_radii = np.log10(radii_in)
    log_sizes = np.log10(sizes_in)

    power_law = linregress(log_radii, log_sizes)
    exponential = linregress(radii, log_sizes)

    # Regression lines evaluated at the observed abscissae.
    power_law_line = power_law.slope * log_radii + power_law.intercept
    exponential_line = exponential.slope * radii + exponential.intercept

    return (
        power_law.slope,
        exponential.slope,
        power_law,
        exponential,
        log_radii,
        log_sizes,
        power_law_line,
        exponential_line,
        sizes_in,
    )

In [7]:
from collections import Counter
import graph_tool.all as gt


# The function for getting the average shortest path length, average clustering coefficient, and network diameter:

def statistics1(G):
    """Return (average shortest path length, average clustering coefficient,
    network diameter) of the graph ``G``.

    The diameter is taken as the last bin edge of ``distance_histogram``
    minus one; the path-length average divides the sum of all pairwise
    distances by n*(n-1); the clustering average divides the summed local
    clustering values by the number of vertices.
    """

    histogram_bins = distance_histogram(G)[1]
    network_diameter = int(histogram_bins[-1] - 1)

    all_pairs = gt.shortest_distance(G)
    total_distance = sum([sum(row) for row in all_pairs])
    ordered_pairs = G.num_vertices() ** 2 - G.num_vertices()
    average_shortest_path_length = total_distance / ordered_pairs

    clustering_total = sum(local_clustering(G))
    average_clustering_coefficient = clustering_total / G.num_vertices()

    return average_shortest_path_length, average_clustering_coefficient, network_diameter



# The function for getting susceptibility, correlation length, percolation probability, n_nodes, n_edges and avg_degree:

def statistics2(G):
    """Return component-based and degree-based statistics of the graph ``G``.

    Returns:
        (s, xi, p, avg_degree, n_nodes, n_edges) where
        - s is the susceptibility: the sum of size**2 * count / n_nodes over
          the distinct component sizes, leaving out the largest size; NaN
          when that sum is zero,
        - xi is the "correlation length", taken as the size of the second
          largest component; NaN when the graph has a single component,
        - p is the percolation probability: the largest component size
          divided by the number of vertices,
        - avg_degree is the mean total degree,
        - n_nodes, n_edges are the vertex and edge counts.
    """

    n_nodes = G.num_vertices()
    n_edges = G.num_edges()

    comp, hist = label_components(G)
    comp_size = list(hist)
    comp_size.sort(reverse=True)
    # Rows of (component size, number of components of that size), ascending by size.
    comp_size_freq = np.array(list(dict(Counter(comp_size)).items()))
    comp_size_freq = comp_size_freq[np.argsort(comp_size_freq[:, 0])]
    # percolation probability
    p = comp_size[0]/n_nodes
    # susceptibility (the last row, i.e. the largest size, is excluded)
    s = 0
    for i in range(len(comp_size_freq)-1):
        s += comp_size_freq[i, 0]**2*comp_size_freq[i, 1]/n_nodes
    if s == 0:
        s = np.nan
    # correlation length; a connected graph has no second component, and
    # indexing comp_size[1] unconditionally would raise IndexError there.
    xi = comp_size[1] if len(comp_size) > 1 else np.nan

    degrees = G.degree_property_map("total")
    avg_degree = sum(degrees) / n_nodes

    return s, xi, p, avg_degree, n_nodes, n_edges

In [8]:
# The final function that gets the time period and journal, and returns fractality and biggest component:

def f(begin_time, end_time, journal):
    """Run the box-covering program on one journal and time window.

    Builds the graph with get_aps (which also writes the largest connected
    component to './apstest'), runs box_cover on that file and reads the
    resulting sizes and radii from the newest log file.

    Returns:
        (size, radius, largest connected component, full graph).
    """

    # get_aps is called once and its result unpacked; calling it separately
    # for each returned element would repeat the whole filtering and graph
    # construction with identical arguments.
    _, largets_connected_component, graph = get_aps(df, begin_time, end_time, journal)
    runAlgorithm (type = 'tsv', graph = './apstest')
    size, radius = get_size_radius()

    return size, radius, largets_connected_component, graph

In [9]:
def get_windows_labels(starting_from: str, n: int): # starting_from should be in the form of "year-month"
    """Return ``n`` consecutive quarter labels, beginning at ``starting_from``.

    Labels have the form 'YYYY-MM' with MM one of 03, 06, 09, 12; the year
    advances each time the month wraps from 12 back to 03. The month part of
    ``starting_from`` must be one of those four values.
    """

    quarter_months = ['03', '06', '09', '12']

    parts = starting_from.split('-')
    first_year = int(parts[0])
    start_position = quarter_months.index(parts[1])

    labels = []
    for step in range(n):
        # Number of completed wrap-arounds gives the year offset.
        years_passed, month_position = divmod(start_position + step, 4)
        labels.append(str(first_year + years_passed) + '-' + str(quarter_months[month_position]))

    return labels

In [10]:
def get_aggregate_from(date_end):
    """Return the month that follows the quarter-end month ``date_end``.

    ``date_end`` has the form 'YYYY-MM' with MM one of 03, 06, 09, 12; the
    result is 'YYYY-04', 'YYYY-07', 'YYYY-10', or January of the next year.

    The year is advanced with integer arithmetic so that a year ending in 9
    rolls over correctly ('1999-12' -> '2000-01').
    """

    quarter_end_months = ['03', '06', '09', '12']
    following_months = ['04', '07', '10', '01']

    parts = date_end.split('-')
    year = parts[0]
    position = quarter_end_months.index(parts[1])

    if position == 3:
        # December is followed by January of the next year.
        year = str(int(year) + 1)

    return year + '-' + following_months[position]

In [11]:
def get_first_window(date_end):
    """Return the quarter label that follows the quarter ``date_end``.

    ``date_end`` has the form 'YYYY-MM' with MM one of 03, 06, 09, 12; the
    result is the next quarter end, moving to March of the next year after
    December.

    The year is advanced with integer arithmetic so that a year ending in 9
    rolls over correctly ('1999-12' -> '2000-03').
    """

    quarter_months = ['03', '06', '09', '12']

    parts = date_end.split('-')
    year = parts[0]
    position = quarter_months.index(parts[1])

    if position == 3:
        # The quarter after December belongs to the next year.
        year = str(int(year) + 1)

    return year + '-' + quarter_months[(position + 1) % 4]

In [12]:
import datetime
import math

def run_snapshot(first_window, aggregate_from, number_of_windows, journal, snapshot_number):
    """Compute network statistics for a series of growing time windows.

    For each of ``number_of_windows`` quarter labels starting at
    ``first_window``, the graph aggregated from ``aggregate_from`` up to that
    label is built, its statistics and box-covering fit are computed, and the
    results are collected. The table of all windows is written to
    './<journal>/snapshot<snapshot_number>.csv'.

    Args:
        first_window: first quarter label, 'YYYY-MM'.
        aggregate_from: first month of every aggregation window, 'YYYY-MM'.
        number_of_windows: how many quarter labels to process (at least 1).
        journal: journal identifier passed to get_aps; also names the output
            directory.
        snapshot_number: number used in the output file name.

    Returns:
        (t_end, date_end): index and label of the window with the largest
        susceptibility. NaN susceptibilities are ignored; if every value is
        NaN the first window is returned.

    Raises:
        ValueError: if ``number_of_windows`` yields no window labels.
    """

    begin_time1 = datetime.datetime.now()

    strs = get_windows_labels(first_window, number_of_windows)
    if not strs:
        raise ValueError("number_of_windows must be at least 1")

    avg_shortest_path_len_m3 = []
    avg_clus_coef_m3 = []
    net_diameter_m3 = []

    susceptibility_3m = []
    corrlength_3m = []
    percolation_3m = []

    avg_degree_3m = []
    n_nodes_3m = []
    n_edges_3m = []

    fractal_dimension_3m = []
    error_ratio_3m = []

    for i in range(len(strs)):

        print (f'iteration {i}')

        begin_time2 = datetime.datetime.now()

        # Build the graph once per window. get_aps also writes the largest
        # connected component to './apstest', which box_cover reads below.
        aps = get_aps(df, aggregate_from, strs[i], journal)
        stats1 = statistics1(aps[1])
        avg_shortest_path_len_m3.append(stats1[0])
        avg_clus_coef_m3.append(stats1[1])
        net_diameter_m3.append(stats1[2])

        stats2 = statistics2(aps[2])
        susceptibility_3m.append(stats2[0])
        corrlength_3m.append(stats2[1])
        percolation_3m.append(stats2[2])
        avg_degree_3m.append(stats2[3])
        n_nodes_3m.append(stats2[4])
        n_edges_3m.append(stats2[5])

        # Box covering on the file written by the get_aps call above; going
        # through f() here would rebuild the same graph two more times.
        runAlgorithm(type='tsv', graph='./apstest')
        F = get_size_radius()  # (size, radius)

        if len(F[0]) < 4:
            # Too few points for a meaningful regression.
            fractal_dimension_3m.append(math.nan)
            error_ratio_3m.append(math.nan)
        else:
            fitted = fit(F)
            fractal_dimension_3m.append(-1 * fitted[0])
            # Mean squared errors of both fits in log space, weighted by the box counts.
            wmse_exp_3m = sum(fitted[8] * (fitted[7] - fitted[5]) ** 2) / sum(fitted[8]) / len(fitted[7])
            wmse_pl_3m = sum(fitted[8] * (fitted[6] - fitted[5]) ** 2) / sum(fitted[8]) / len(fitted[6])
            error_ratio_3m.append(wmse_exp_3m/wmse_pl_3m)

        end_time2 = datetime.datetime.now()
        runtime2 = end_time2 - begin_time2
        print (f'runtime: {runtime2}')

    dataframe = pd.DataFrame(data=[avg_shortest_path_len_m3, avg_clus_coef_m3, net_diameter_m3, susceptibility_3m, corrlength_3m, percolation_3m, avg_degree_3m, n_nodes_3m, n_edges_3m, fractal_dimension_3m, error_ratio_3m, strs]).transpose()
    dataframe.columns = ['avg_shortest_path_len_m3', 'avg_clus_coef_m3', 'net_diameter_m3', 'susceptibility_3m', 'corrlength_3m', 'percolation_3m', 'avg_degree_3m', 'n_nodes_3m', 'n_edges_3m', 'fractal_dimension_3m', 'error_ratio_3m', 'time_window']

    # Write next to the other per-journal outputs instead of a fixed './PRE'.
    output_dir = './' + journal
    os.makedirs(output_dir, exist_ok=True)
    dataframe.to_csv(output_dir + '/snapshot' + str(snapshot_number) + '.csv')

    # Peak of the susceptibility. The list may contain NaN, for which
    # max()/list.index() give an order-dependent answer, so NaNs are skipped.
    susceptibility = np.array(susceptibility_3m, dtype=float)
    if np.isnan(susceptibility).all():
        t_end = 0
    else:
        t_end = int(np.nanargmax(susceptibility))
    date_end = strs[t_end]

    end_time1 = datetime.datetime.now()
    runtime1 = end_time1 - begin_time1
    print (f'runtime: {runtime1}')

    return t_end, date_end

In [None]:
# Driver: run consecutive snapshots. Each snapshot starts right after the
# susceptibility peak of the previous one and covers up to 30 quarterly windows.
aggregate_from = '1993-01'
first_window = '1993-03'
number_of_windows = 30
snapshot_number = 1
journal = 'PRE'

t_ends = []
end_dates = []

# 68 quarterly labels: '1993-03' through '2009-12'.
all_window_labels = get_windows_labels(first_window, 68)

while_loop_begin_time = datetime.datetime.now()

while int(first_window.split('-')[0]) < 2007:

    peak = run_snapshot(first_window, aggregate_from, number_of_windows, journal, snapshot_number)
    t_ends.append(peak[0])
    end_dates.append(peak[1])

    print (f'snapshot {snapshot_number} done!')
    print (f't_end: {peak[0]}')
    print (f'date_end: {peak[1]}\n')

    aggregate_from = get_aggregate_from(peak[1])
    first_window = get_first_window(peak[1])
    snapshot_number = snapshot_number + 1

    # If the peak was the very last known window, the next window lies past
    # the end of the label list; stop instead of failing in list.index().
    if first_window not in all_window_labels:
        break

    # Never ask for more windows than remain in the label list.
    remaining_windows = len(all_window_labels) - all_window_labels.index(first_window)
    if remaining_windows < number_of_windows:
        number_of_windows = remaining_windows

while_loop_end_time = datetime.datetime.now()
while_loop_runtime = while_loop_end_time - while_loop_begin_time
print (f'runtime: {while_loop_runtime}')

In [None]:
def write_list_to_file(filename, data_list):
    """Write every item of ``data_list`` on its own line to
    './<journal>/<filename>', where ``journal`` is the notebook-level variable.
    Items are converted with ``str``; an existing file is overwritten.
    """
    target_path = "./" + journal + "/" + filename
    with open(target_path, 'w') as handle:
        handle.writelines(str(entry) + '\n' for entry in data_list)

# def read_list_from_file(filename):
#     data_list = []
#     with open("./PRE/" + filename, 'r') as file:
#         for line in file:
#             data_list.append(int(line.strip()))
#     return data_list

In [None]:
# Persist the peak index and peak date of every snapshot, one value per line.
for output_name, values in (('t_ends', t_ends), ('end_dates', end_dates)):
    write_list_to_file(output_name, values)