# Goal

Running Francois-style experiments on simple weighted combinations of features (so no GBDT, i.e. gradient-boosted decision trees, at all this time).

Based on his notebook "ABCDoo.ipynb", and continuing from V2 of the notebooks with the same name.

In [1]:
import pickle
import pandas as pd
import igraph as ig
from collections import Counter
from matplotlib import pyplot as plt
import numpy as np
from CAS import *
import copy
from pathlib import Path
import glob
import utils
import importlib
importlib.reload(utils)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import random
import csv
import subprocess
import sys
sys.path.append('../')
from CAS import *
from itertools import combinations
import os

# Helper functions

In [2]:
# Build a simple row classifier.

def simple_classifier(x,thresh,weights = [1.0,0.1,0.1],high_deg = -1.0):
    """Threshold a weighted combination of three entries of one feature row.

    Parameters
    ----------
    x : indexable of numbers
        A single feature row; entries 0-5 and 12 are read.
        NOTE(review): in this notebook the rows come from
        ``utils.build_basic_features`` -- presumably 0-2 and 3-5 are the
        feature triples of the first two scores and 12 is the degree;
        confirm against ``utils``.
    thresh : float
        The row is accepted when the combined score is strictly greater
        than this value.
    weights : sequence of three numbers
        The first two weights are added, the third is subtracted.
    high_deg : float
        When ``x[12] > high_deg`` the triple ``x[3:6]`` is used, otherwise
        the triple ``x[0:3]``.

    Returns
    -------
    ``1.0`` if the weighted score exceeds ``thresh``, else ``0.0``.
    """
    # Pick which consecutive triple of the row feeds the score.
    start = 3 if x[12] > high_deg else 0
    score = weights[0]*x[start] + weights[1]*x[start + 1] - weights[2]*x[start + 2]
    # Multiplying by 1.0 turns the boolean into a float label.
    return 1.0*(score > thresh)



In [3]:
#oNMI = '/Users/francois/Book/GraphMiningNotebooks/oNMI/onmi'          ## overlapping NMI executable
oNMI = '/work/home/fcthebe/Tools/oNMI/onmi'          ## overlapping NMI executable

## input format: each argument is a list of rows (e.g. one community per row);
## every row is written to a scratch file as one space-separated line.
def compute_oNMI(First, Second):
    """Run the external oNMI executable on two collections of rows.

    Each argument is dumped to its own temporary file (one row per line,
    items separated by a single space), the executable at ``oNMI`` is invoked
    with the two file paths, and the second whitespace-separated token of its
    standard output is returned as a float.

    Parameters
    ----------
    First, Second : iterable of iterables
        Rows to compare; each inner iterable becomes one line of the file.

    Returns
    -------
    float
        The value parsed from the executable's output.

    Raises
    ------
    subprocess.CalledProcessError
        If the executable exits with a non-zero status.
    (IndexError, ValueError)
        If the output does not have a numeric second token.
    """
    # Local import so this cell works without editing the notebook's import cell.
    import tempfile

    scratch_paths = []
    try:
        for rows in (First, Second):
            # tempfile gives collision-free names and, unlike random.random(),
            # does not consume the global `random` stream.
            # newline="" is what the csv module expects of files it writes to.
            with tempfile.NamedTemporaryFile(
                mode="w", prefix="__", newline="", delete=False
            ) as f:
                # Record the path first so it is cleaned up even if writing fails.
                scratch_paths.append(f.name)
                csv.writer(f, delimiter=" ").writerows(rows)

        raw = subprocess.check_output([oNMI, scratch_paths[0], scratch_paths[1]])
        return float(raw.decode("utf-8").split()[1])
    finally:
        # Always remove the scratch files, including when the executable
        # fails or its output cannot be parsed.
        for p in scratch_paths:
            try:
                os.remove(p)
            except FileNotFoundError:
                pass

# Standardize for feeding into algorithm

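`run_expt` below runs a single experiment (one graph, one weight vector, one threshold).

The originally planned looping "do_experiment" function is implemented as the `run_expts` wrapper, which loops over the parameter grid and the repetitions.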

In [4]:
# Wrapper
def run_expts(xis = [0.5], etas=[1.0,1.5], reps=[x for x in range(1,9)], path = '/data/ABCDoo/', weights = [[1.0,0.0,0.0],[1.0,0.1,0.1],[1.0,0.2,0.1]], threshes=[0.05,0.1,0.15,0.2,0.25], mean = True):
    """Run ``run_expt`` over a grid of parameters and collect the scores.

    For every combination of ``xi``, ``eta``, weight vector and threshold,
    ``run_expt`` is called once per entry of ``reps``.

    Parameters
    ----------
    xis, etas, reps : sequences
        Values forwarded to ``run_expt`` as ``xi``, ``eta`` and ``rep``.
    path : str
        Forwarded unchanged to ``run_expt``.
    weights : sequence of weight vectors
        Each vector is forwarded as ``weight``; its position in this
        sequence (not its values) appears in the result key.
    threshes : sequence of floats
        Thresholds forwarded as ``thresh``.
    mean : bool
        If true, store ``np.mean`` of the per-rep scores; otherwise store
        the list of per-rep scores.

    Returns
    -------
    dict
        Keys look like ``"xi_<xi>_eta_<eta>_weightInd_<i>_thresh_<thresh>"``.
    """
    results = {}
    for xi in xis:
        for eta in etas:
            for weight_ind, weight in enumerate(weights):
                for thresh in threshes:
                    # One score per repetition for this parameter combination.
                    scores = [
                        run_expt(xi=xi, eta=eta, rep=rep, path=path, weight=weight, thresh=thresh)
                        for rep in reps
                    ]
                    key = f"xi_{xi!s}_eta_{eta!s}_weightInd_{weight_ind!s}_thresh_{thresh!s}"
                    results[key] = np.mean(scores) if mean else scores
    return results



In [5]:
def run_expt(xi,eta,rep,path,weight,thresh):
    """Run one experiment on one graph and return its oNMI score.

    Steps: read the graph through ``utils.readGraph``, build the ground-truth
    communities, take a Leiden (modularity) partition as the initial guess,
    compute the CAS scores, build the feature matrix, classify every feature
    row with ``simple_classifier`` and compare the result with the ground
    truth through ``compute_oNMI``.

    Parameters
    ----------
    xi, eta, rep, path :
        Forwarded to ``utils.readGraph`` to select the graph.
    weight : sequence of three numbers
        Passed to ``simple_classifier`` as ``weights``.
    thresh : float
        Passed to ``simple_classifier`` as ``thresh``.

    Returns
    -------
    float
        The value returned by ``compute_oNMI``.
    """
    graph = utils.readGraph(xi=xi, eta=eta, rep=rep, path = path)

    # Ground truth: drop the first entry (the "outlier community") and turn
    # the members into strings.
    all_comms = utils.mems2comms(graph.vs['comms'])
    ground_truth = [[str(member) for member in comm] for comm in all_comms[1:]]

    # Initial guess: Leiden partition, converted with partition2sparse.
    leiden_membership = graph.community_leiden(objective_function='modularity').membership
    M = partition2sparse(leiden_membership)
    A = graph.get_adjacency_sparse()

    # Beta and C scores (plus IEF and Pv) and the degrees w.r.t. the initial partition.
    IEF, Beta, C, Pv, DegPart = CAS(A, M, alpha=1)

    # Feature layout, as described by the original author:
    # (A) for each score: (i) raw score, (ii) biggest other score, (iii) rank in that order;
    # (B) then the global features (here, just degree).
    features = utils.build_basic_features(A, [Beta,IEF,C,Pv], [DegPart])

    # Classify every feature row, then post-process into community lists.
    row_labels = np.apply_along_axis(simple_classifier, 1, features, thresh=thresh, weights=weight)
    MHat = utils.contract_labels(row_labels, M.shape[0])
    predicted = utils.mems2comms(utils.m2mems(MHat))
    return compute_oNMI(ground_truth, predicted)

    

In [6]:
%%time
res = run_expts()

CPU times: user 28min 8s, sys: 7.37 s, total: 28min 16s
Wall time: 28min 21s


In [7]:
res

{'xi_0.5_eta_1.0_weightInd_0_thresh_0.05': 0.0525145625,
 'xi_0.5_eta_1.0_weightInd_0_thresh_0.1': 0.5153425,
 'xi_0.5_eta_1.0_weightInd_0_thresh_0.15': 0.671961375,
 'xi_0.5_eta_1.0_weightInd_0_thresh_0.2': 0.9151613749999999,
 'xi_0.5_eta_1.0_weightInd_0_thresh_0.25': 0.916790125,
 'xi_0.5_eta_1.0_weightInd_1_thresh_0.05': 0.36774450000000003,
 'xi_0.5_eta_1.0_weightInd_1_thresh_0.1': 0.671719875,
 'xi_0.5_eta_1.0_weightInd_1_thresh_0.15': 0.85221625,
 'xi_0.5_eta_1.0_weightInd_1_thresh_0.2': 0.9190851250000001,
 'xi_0.5_eta_1.0_weightInd_1_thresh_0.25': 0.942109375,
 'xi_0.5_eta_1.0_weightInd_2_thresh_0.05': 0.13507787500000001,
 'xi_0.5_eta_1.0_weightInd_2_thresh_0.1': 0.44278874999999995,
 'xi_0.5_eta_1.0_weightInd_2_thresh_0.15': 0.669571125,
 'xi_0.5_eta_1.0_weightInd_2_thresh_0.2': 0.81602825,
 'xi_0.5_eta_1.0_weightInd_2_thresh_0.25': 0.927467,
 'xi_0.5_eta_1.5_weightInd_0_thresh_0.05': 0.05140375,
 'xi_0.5_eta_1.5_weightInd_0_thresh_0.1': 0.25920325,
 'xi_0.5_eta_1.5_weightIn