## Step 1

Lebel edges and save into 3 files:
- fully_shared_labels: two vertices of a given edge have totaly same labels 
- partial_shared_labels: two vertices share partial labels, and I use overlap ratio to indicate similarity
- others: otherwise

Sample edges:

first randomly edges from fully_shared_labels , then if it is still not enough , select from partial_shared_labels, then others.

Notes:
- when select edges from fully_shared_labels and others, the method is random selection.
- but when select edges from partial_shared_labels, the method is sequential selection, because edges have already been storted to descending orders by overlap values(similarities).






In [2]:
# data_path: working path
# sampled_path : the generated files will be saved here
# file_name : [blogCatalog3.txt,flickr.txt,youtube.txt]
# group_file_name: for example [flickr-groups.txt] 
from collections import defaultdict
def label_edges(data_path,sampled_path,file_name,group_file_name,keep_num=-1,max_label_classes=-1,full_num=-1,partial_num=-1,other_num=-1,random_seed=7766):
    
    
    vertices = set()
    edges = set()
    groups = dict()
    label_counter = defaultdict(int)
    # first read labels from data_path+file_name
    print('reading groups from %s' % group_file_name)
    with open(data_path+'/'+group_file_name,'r+') as fin:
        lines = fin.readlines()
        items = lines[0].strip('\n').split(' ')
        num_of_vertex = int(items[0])
        num_of_label = int(items[1])
        print('vertex num:%d    label: num%d' % (num_of_vertex, num_of_label))
        for line in lines[1:]:
            items = line.strip('\n').split(':')
            # node is vertex
            node = int(items[0].replace(' ',''))
            assert len(items) == 2
            if (len(items[1]) == 0):
                groups[node] = set([-1]) 
            else:
                id_strs = items[1].strip(' ')
                groups[node] = set([int(x) for x in id_strs.split(' ')])
                for idx in groups[node]:
                  if idx != -1:
                      label_counter[idx] += 1
    
    # sort label frequency
    sorted_labels = sorted(label_counter.items(), key=lambda x:x[1],reverse=True)
          
    valid_label_ids = set()
    if max_label_classes == -1:
        max_label_classes = len(sorted_labels)
    
    for i in range(max_label_classes):
        valid_label_ids.add(sorted_labels[i][0])
    
    # remove unvalid labels
    removed_count = 0
    cleaned_groups = dict()
    for node in groups.keys():
        labels = set(groups[node])
        c_labels = labels & valid_label_ids
        if len(c_labels) > 0:
            cleaned_groups[node] = c_labels
        else:
            removed_count += 1
    print('valid labels: %s' % valid_label_ids )
    print('removed %d vertex, remain %d vertex' % (removed_count, len(cleaned_groups)))
    groups = cleaned_groups
    num_of_label = len(valid_label_ids)
    
    
    
    # 3 types of edges
    fully_shared_labels = set() # 
    partial_shared_labels = set() #
    others = set()
    used_vertices = set()
    
    # then read network from data_path+file_name
    print('reading network from %s' % file_name)
    with open(data_path+'/'+file_name,'r+') as fin:
        lines = fin.readlines()
        print('original #vertices and #edges: '+ lines[0])
        items = lines[0].strip('\n').split(' ')
        
        for line in lines[1:]:
            items = line.strip('\n').split(' ')
            node1 = int(items[0])
            node2 = int(items[1])
            if node1 not in cleaned_groups or node2 not in cleaned_groups:
                continue
            used_vertices.add(node1)
            used_vertices.add(node2)
            node1_label = groups[node1]
            node2_label = groups[node2]
            # overlap_ratio could used to sort
            overlap_ratio = len(node1_label & node2_label) / len(node1_label | node2_label)
            node1_str = ' '.join(list([ str(x) for x in node1_label]))
            node2_str = ' '.join(list([ str(x) for x in node2_label]))
            vertices.add(node1)
            vertices.add(node2)

            if (node2,node1) not in edges:
                edges.add((node1,node2))
            if (node1_label == node2_label) and (node1_label != set([-1])):
                fully_shared_labels.add((node1,node2,1.0,node1_str,node2_str,overlap_ratio))
            elif len(node1_label & node2_label) > 0 and (node1_label != set([-1])) and (node2_label != set([-1])):
                partial_shared_labels.add((node1,node2,1.0,node1_str,node2_str,overlap_ratio))
            else:
                others.add((node1,node2,1.0,node1_str,node2_str,overlap_ratio))
                
        num_of_vertex = len(vertices) == num_of_vertex
        num_of_edges=  len(edges)
    edges = list(edges)

    
    partial_shared_labels = list(sorted(list(partial_shared_labels), key = lambda x:x[5], reverse =True))
    
    # periordically save files
    file_map = {
        'fully' : fully_shared_labels,
        'partial': partial_shared_labels,
        'others': others,
    }
    
    fully_shared_labels = list(fully_shared_labels) # 
    partial_shared_labels = list(partial_shared_labels) #
    others = list(others)
    
    total_edges = 0
    
    for file in file_map:
        out_path = '%s/%s.labeled.%s' % (sampled_path,file_name,file)
        with open(out_path, 'w+', encoding='utf-8') as fout:
            container = file_map[file]
            fout.write('%d\n' % (len(container)))
            print('#%s:%d' % (file,len(container)))
            total_edges += len(container)
            for edge in container:
                fout.write('%s\t%s\t%.2f\t%s\t%s\t%f\n' % edge)
    print('#%s:%d' % ('total edges', total_edges))
    print('#vertices:%d' % len(used_vertices))
    
    # sample
    import random
    random.seed(random_seed)
    
    if keep_num == -1:
        keep_num = total_edges
    assert keep_num <= total_edges, 'the keep num should less than or equal to total edges'
    if full_num != -1 and partial_num != -1 and other_num != -1:
        print('using pre-setted nums')
        nums_from_full = full_num
        nums_from_partial = partial_num
        nums_from_others = other_num
    else:
        nums_from_full = min(len(fully_shared_labels), keep_num)
        nums_from_partial = min(len(partial_shared_labels), keep_num - nums_from_full)
        nums_from_others = keep_num - nums_from_full - nums_from_partial
    
    res_set = []
    if nums_from_full > 0 :
        res_set += random.sample(fully_shared_labels, nums_from_full)
    if nums_from_partial > 0:
        res_set +=partial_shared_labels[0:nums_from_partial]
    if nums_from_others > 0:
        res_set += random.sample(others, nums_from_others)
    
    print('The consitutuion of sampled dataset: \n Full:%d Partial:%d Others: %d Total:%d ' % (nums_from_full,nums_from_partial, nums_from_others, keep_num))
   
    
    # write to file
    out_path = '%s/%s.labeled.%s' % (sampled_path,file_name,'sampled')
    with open(out_path, 'w+', encoding='utf-8') as fout:
        container = res_set
        fout.write('%d\n' % (len(container)))
        print('%s:%d' % (file,len(container)))
        for edge in container:
            fout.write('%s\t%s\t%.2f\t%s\t%s\t%f\n' % edge)
    print('####Done####')

#label_edges('raw_data','generated_data','blogCatalog.txt','blogCatalog-groups.txt',37500,max_label_classes=3)  
label_edges('raw_data','generated_data','youtube.txt','youtube-groups.txt',full_num=2000,partial_num=3200,other_num=4800,max_label_classes=3) 
#label_edges('raw_data','generated_data','flickr.txt','flickr-groups.txt',full_num=11000,partial_num=11000,other_num=33000,max_label_classes=3) 

reading groups from youtube-groups.txt
vertex num:22693    label: num47
valid labels: {19, 20, 6}
removed 13831 vertex, remain 8862 vertex
reading network from youtube.txt
original #vertices and #edges: 22693 96361

#fully:9489
#partial:9697
#others:4907
#total edges:24093
#vertices:7147
using pre-setted nums
The consitutuion of sampled dataset: 
 Full:2000 Partial:3200 Others: 4800 Total:24093 
others:10000
####Done####


# Step2 Assign new vertex id and groud id

In [4]:
def reindex(sampled_path,file_name):
    fully_shared_labels = set() # 
    partial_shared_labels = set() #
    others = set()
    
    new_vertex_index = dict()
    new_label_index = dict()
    
    new_label_index[-1] = 0
    
    in_path = '%s/%s.labeled.%s' % (sampled_path,file_name,'sampled')
    out_path = '%s/%s.labeled.%s' % (sampled_path,file_name,'reindex')
    with open(in_path, 'r', encoding='utf-8') as fin:
        with open(out_path, 'w+', encoding='utf-8') as fout:
            lines = fin.readlines()
            num = int(lines[0][0:-1])
            for line in lines[1:]:
                items = line.strip('\n').split('\t')
                
                vertex1 = str(items[0])
                vertex2 = str(items[1])
                weight = float(items[2])
                labels1 = [int(x) for x in items[3].split(' ')]
                labels2 = [int(x) for x in items[4].split(' ')]
                overlap_ratio = float(items[5])

                if vertex1 not in new_vertex_index:
                    new_vertex_index[vertex1] = len(new_vertex_index)
                if vertex2 not in new_vertex_index:
                    new_vertex_index[vertex2] = len(new_vertex_index)

                new_vertex1 = new_vertex_index[vertex1]
                new_vertex2 = new_vertex_index[vertex2]

                for label in labels1+labels2:
                    if label not in new_label_index:
                        new_label_index[label] = len(new_label_index)

                new_labels1_str = ' '.join([str(new_label_index[x]) for x in labels1])
                new_labels2_str = ' '.join([str(new_label_index[x]) for x in labels2])

                new_edge = (new_vertex1,new_vertex2,weight,new_labels1_str,new_labels2_str,overlap_ratio)
                fout.write('%s\t%s\t%.2f\t%s\t%s\t%f\n' % new_edge)

    
    out_path = '%s/%s.labeled.%s' % (sampled_path,file_name,'vertex_dict')

    # old_vertex_id new_vertex_id
    with open(out_path, 'w+', encoding='utf-8') as fout:
        for key in new_vertex_index.keys():
            fout.write('%s\t%d\n' % (key,new_vertex_index[key]))

    # old_group_id new_group_id
    out_path = '%s/%s.labeled.%s' % (sampled_path,file_name,'label_dict')
    with open(out_path, 'w+', encoding='utf-8') as fout:
        for key in new_label_index.keys():
            fout.write('%d\t%d\n' % (key,new_label_index[key]))

#reindex('generated_data','eco_flickr.txt')
reindex('generated_data','eco_youtube.txt')
#reindex('generated_data','eco_blogCatalog.txt')
print("done")

done


# Step 3 Generate Input for GAN-AAE
foramt:
vertex1 vertex2

### Input
Your input graph data should be a **txt** file and be under **GraphData folder** 



In [5]:
import os
def generate_input(data_path,file_name):
    GRAPH_PATH=data_path+'/output'
    if os.path.exists(GRAPH_PATH) is False:
        os.makedirs(GRAPH_PATH)
    in_path = '%s/%s.labeled.%s' % (data_path,file_name,'vertex_dict')
    with open(in_path,'r+',encoding='utf-8') as fin:
        vertex_num = len(fin.readlines())
        
    in_path = '%s/%s.labeled.%s' % (data_path,file_name,'reindex')
    with open(in_path,'r+',encoding='utf-8') as fin:
        edge_num = len(fin.readlines())
    
    in_path = '%s/%s.labeled.%s' % (data_path,file_name,'label_dict')
    with open(in_path,'r+',encoding='utf-8') as fin:
        label_num = len(fin.readlines())
    
    
    in_path = '%s/%s.labeled.%s' % (data_path,file_name,'reindex')
    out_path = '%s/%s.data' % (GRAPH_PATH,file_name)
    
    new_group_infor = dict()
    with open(in_path,'r+',encoding='utf-8') as fin:
        with open(out_path,'w+',encoding='utf-8') as fout:
            fout.write('%d %d\n' % (vertex_num, edge_num))
            lines = fin.readlines()
            for line in lines:
                items = line.strip('\n').split('\t')
                vertex1 = int(items[0])
                vertex2 = int(items[1])
                weight = float(items[2])
                new_group_infor[vertex1] = items[3]
                new_group_infor[vertex2] = items[4]
                labels1 = [int(x) for x in items[3].split(' ')]
                labels2 = [int(x) for x in items[4].split(' ')]
                overlap_ratio = float(items[5])
                fout.write('%d %d\n' % (vertex1,vertex2))
    out_path = '%s/%s.data_group' % (GRAPH_PATH,file_name)
    with open(out_path,'w+',encoding='utf-8') as fout:
        fout.write('%d %d\n' % (vertex_num, label_num))
        for key in sorted(new_group_infor.keys()):
            fout.write('%d : %s\n' % (key, new_group_infor[key]))


#generate_input('generated_data','eco_blogCatalog.txt')
#print("blog done\n")
#generate_input('generated_data','eco_flickr.txt')
#print("flickr done\n")
generate_input('generated_data','eco_youtube.txt')
print("youtube done\n")

youtube done

