In [42]:
class Graph(object):
    """Minimal graph container: a list of nodes and a list of edges.

    How edges are indexed is left to subclasses, which must override
    ``add_edge`` and ``get_neighbors``.
    """

    # Deriving from ``object`` makes this a new-style class on Python 2, which
    # is required for subclasses that call ``super(...).__init__()``.

    def __init__(self):
        self.node_list = []  # nodes, in insertion order
        self.edge_list = []  # edges; this base class never appends to it

    def add_node(self, n):
        """Append node ``n`` to ``node_list``."""
        self.node_list.append(n)

    def add_edge(self, e):
        """Register edge ``e``.

        Raises:
            NotImplementedError: always; subclasses provide the behavior.
        """
        raise NotImplementedError

    def get_neighbors(self, n):
        """Return the neighbors of node ``n``.

        Raises:
            NotImplementedError: always; subclasses provide the behavior.
        """
        raise NotImplementedError

class Node(object):
    """A graph node: an identifier plus an optional feature vector and label."""

    def __init__(self, node_id, feature_vector = None, label = None):
        # Plain record: store the three constructor arguments unchanged.
        (self.node_id,
         self.feature_vector,
         self.label) = (node_id, feature_vector, label)


class Edge(object):
    """A directed edge between two nodes, with optional features and label."""

    def __init__(self, from_node, to_node, feature_vector = None, label = None):
        # Endpoints of the edge.
        self.from_node, self.to_node = from_node, to_node
        # Optional payload carried by the edge.
        self.feature_vector, self.label = feature_vector, label

In [43]:
from collections import defaultdict

In [45]:
class DirectedGraph(Graph):
    """Directed graph that indexes each edge by both of its endpoints.

    ``successors[n]`` holds the nodes reached by edges leaving ``n`` and
    ``predecessors[n]`` holds the nodes with an edge pointing at ``n``.
    Nodes are used as dictionary keys, so they must be hashable.
    """

    def __init__(self):
        super(DirectedGraph, self).__init__()
        self.successors = defaultdict(set)    # from_node -> {to_node, ...}
        self.predecessors = defaultdict(set)  # to_node -> {from_node, ...}
        self.str_class = []  # not read anywhere in this class; kept as-is

    def add_edge(self, e):
        """Record edge ``e`` and index its ``from_node`` / ``to_node``."""
        self.edge_list.append(e)
        self.successors[e.from_node].add(e.to_node)
        self.predecessors[e.to_node].add(e.from_node)

    def get_out_neighbors(self, n):
        """Return a new set of the nodes that ``n`` has an edge to."""
        # ``.get`` keeps a lookup for an unknown node from inserting an empty
        # entry into the defaultdict; the copy protects the internal index.
        return set(self.successors.get(n, ()))

    def get_in_neighbors(self, n):
        """Return a new set of the nodes that have an edge to ``n``."""
        return set(self.predecessors.get(n, ()))

    def get_neighbors(self, n):
        """Return a new set with the in- and out-neighbors of ``n`` combined."""
        return self.get_out_neighbors(n).union(self.predecessors.get(n, ()))

In [46]:
def load_links_data(content_file, cites_file):
    """Load a node file and an edge file into a ``DirectedGraph``.

    Both files are read as tab-separated text, one record per line; blank
    lines are ignored.

    Args:
        content_file: path to the node file. Each line is
            ``id <TAB> feature ... <TAB> label``; every field between the
            first and the last is converted with ``float``.
        cites_file: path to the edge file. Only the first two fields of each
            line are used, and they are looked up as node ids.

    Returns:
        A ``(graph, domain_labels)`` tuple, where ``domain_labels`` lists the
        distinct node labels in order of first appearance.

    Raises:
        ValueError: if a feature field cannot be converted to ``float``.

    Also prints the collected domain labels to stdout.
    """
    links_graph = DirectedGraph()
    domain_labels = []
    id_obj_map = {}

    with open(content_file, 'r') as node_file:
        for line in node_file:
            if not line.strip():
                continue  # a blank line would otherwise become a node with id ''
            fields = line.rstrip('\r\n').split('\t')
            node_id, label = fields[0], fields[-1]
            # Build a real list so the features can be read more than once
            # (``map`` returns a one-shot iterator on Python 3).
            features = [float(value) for value in fields[1:-1]]
            node = Node(node_id, features, label)
            links_graph.add_node(node)
            if label not in domain_labels:
                domain_labels.append(label)
            id_obj_map[node_id] = node

    with open(cites_file, 'r') as edge_file:
        for line in edge_file:
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                continue  # blank or malformed line: no pair of ids to read
            first_id, second_id = fields[0], fields[1]
            # Direct dict membership; ``.keys()`` builds a list on Python 2.
            if first_id in id_obj_map and second_id in id_obj_map:
                # The edge runs from the second column to the first column.
                from_node = id_obj_map[second_id]
                to_node = id_obj_map[first_id]
                links_graph.add_edge(Edge(from_node, to_node))

    print("domain labels")
    print(domain_labels)

    return links_graph, domain_labels


In [47]:
class Aggregator(object):
    """Base class for objects that summarize the labels around a node.

    Subclasses override ``aggregate``. Deriving from ``object`` keeps this a
    new-style class on Python 2, like the other base classes in this file.
    """

    def __init__(self, domain_labels):
        self.domain_labels = domain_labels  # The list of labels in the domain

    def aggregate(self, graph, node, conditional_node_to_label_map):
        """Summarize the labels of ``node``'s neighbors in ``graph``.

        Args:
            graph: graph that contains ``node``.
            node: node whose neighborhood is summarized.
            conditional_node_to_label_map: mapping from node to its label.

        Raises:
            NotImplementedError: always; subclasses provide the behavior.
        """
        raise NotImplementedError

In [48]:
class CountAggregator(Aggregator):
    """Count, for each domain label, the neighbors that carry it."""

    def aggregate(self, graph, node, conditional_node_to_label_map):
        """Return per-label neighbor counts for ``node``.

        Args:
            graph: object whose ``get_neighbors(node)`` yields the neighbors.
            node: node whose neighbors are counted.
            conditional_node_to_label_map: mapping from node to label;
                neighbors missing from it are ignored.

        Returns:
            A list of floats aligned with ``self.domain_labels``.

        Raises:
            ValueError: if a neighbor's label is not in ``self.domain_labels``.
        """
        # One zero-initialized slot per domain label.
        counts = [0.0] * len(self.domain_labels)
        for neighbor in graph.get_neighbors(node):
            # Only neighbors whose label is known contribute. Testing the dict
            # directly avoids the list that ``.keys()`` builds on Python 2.
            if neighbor in conditional_node_to_label_map:
                label = conditional_node_to_label_map[neighbor]
                counts[self.domain_labels.index(label)] += 1.0
        return counts



In [49]:
class ProportionalAggregator(Aggregator):
    """Share of labeled neighbors carrying each domain label."""

    def aggregate(self, graph, node, conditional_node_to_label_map):
        """Return the ``CountAggregator`` counts divided by their sum.

        When the counts sum to zero they are returned unchanged.
        """
        counts = CountAggregator(self.domain_labels).aggregate(
            graph, node, conditional_node_to_label_map)
        total = sum(counts)
        if not total > 0:
            return counts
        return [count / total for count in counts]

In [50]:
class ExistAggregator(Aggregator):
    """Indicator of which domain labels occur among a node's neighbors."""

    def aggregate(self, graph, node, conditional_node_to_label_map):
        """Return the ``CountAggregator`` counts with every count >= 1 set to 1."""
        counter = CountAggregator(self.domain_labels)
        counts = counter.aggregate(graph, node, conditional_node_to_label_map)
        # Counts below one are passed through untouched.
        return [1 if count >= 1 else count for count in counts]

In [51]:
class Classifier(object):
    """Common interface for the node classifiers in this file.

    Each instance owns a ``DecisionTreeClassifier`` in ``self.clf``;
    subclasses supply ``fit`` and ``predict``.
    """

    def __init__(self):
        self.clf = DecisionTreeClassifier()

    def fit(self, graph, train_indices):
        """Train on the nodes of ``graph`` selected by ``train_indices``.

        Not implemented in the base class.
        """
        raise NotImplementedError()

    def predict(self, graph, test_indices, conditional_node_to_label_map = None):
        """Predict labels for the nodes of ``graph`` selected by ``test_indices``.

        Not implemented in the base class.
        """
        raise NotImplementedError()

In [52]:
class LocalClassifier(Classifier):
    """Classifier that looks only at each node's own feature vector."""

    def fit(self, graph, train_indices):
        """Fit ``self.clf`` on the features and labels of the training nodes."""
        samples = [graph.node_list[idx] for idx in train_indices]
        self.clf.fit(
            [sample.feature_vector for sample in samples],
            [sample.label for sample in samples],
        )

    def predict(self, graph, test_indices, conditional_node_to_label_map = None):
        """Return ``self.clf`` predictions for the test nodes.

        ``conditional_node_to_label_map`` is accepted for interface
        compatibility and is not used.
        """
        rows = [graph.node_list[idx].feature_vector for idx in test_indices]
        return self.clf.predict(rows)


In [53]:
class RelationalClassifier(Classifier):
    """Classifier whose features include an aggregate of neighbor labels.

    Args:
        aggregator: object whose ``aggregate(graph, node, label_map)`` returns
            a sequence of numbers describing the node's neighborhood.
        use_node_attributes: when true, the node's own feature vector is put
            in front of the aggregate values.
    """

    def __init__(self, aggregator, use_node_attributes=True):
        super(RelationalClassifier, self).__init__()
        self.aggregator = aggregator
        self.use_node_attributes = use_node_attributes

    def fit(self, graph, train_indices):
        """Fit ``self.clf`` using the true labels of the training nodes.

        Aggregates are computed from the training nodes' labels only.
        """
        train_indices = list(train_indices)  # iterated twice below
        conditional_map = {}
        for i in train_indices:
            conditional_map[graph.node_list[i]] = graph.node_list[i].label
        features = []
        labels = []
        for i in train_indices:
            self.feature_combination_check(graph, features, i, conditional_map)
            labels.append(graph.node_list[i].label)
        self.clf.fit(features, labels)

    def predict(self, graph, test_indices, conditional_node_to_label_map=None):
        """Return ``self.clf`` predictions for the test nodes.

        ``conditional_node_to_label_map`` supplies the labels used for the
        aggregates; ``None`` is treated as "no labels known".
        """
        features = []
        for i in test_indices:
            self.feature_combination_check(
                graph, features, i, conditional_node_to_label_map)
        return self.clf.predict(features)

    def feature_combination_check(self, graph, features, i, conditional_map):
        """Append the feature row for node ``i`` of ``graph`` to ``features``.

        The row is a flat numpy array: the node's feature vector (only when
        ``use_node_attributes`` is true) followed by the aggregate values.
        """
        if conditional_map is None:
            # The default of ``predict``; the aggregator needs a real mapping.
            conditional_map = {}
        node = graph.node_list[i]
        aggregates = self.aggregator.aggregate(graph, node, conditional_map)
        own_features = [node.feature_vector] if self.use_node_attributes else []
        # ``np.append`` flattens both arguments into a single 1-D row.
        features.append(np.append(own_features, aggregates))

In [54]:
class ICA(Classifier):
    """Iterative classification using a local and a relational classifier.

    Args:
        local_classifier: classifier used for the first round of predictions.
        relational_classifier: classifier used to re-predict each test node
            from the current label map.
        max_iteration: number of passes made over the test nodes.
    """

    def __init__(self, local_classifier, relational_classifier, max_iteration=10):
        # No ``self.clf`` is needed here; the two wrapped classifiers do the work.
        self.local_classifier = local_classifier
        self.relational_classifier = relational_classifier
        self.max_iteration = max_iteration

    def fit(self, graph, train_indices):
        """Fit both wrapped classifiers on the training nodes."""
        self.local_classifier.fit(graph, train_indices)
        self.relational_classifier.fit(graph, train_indices)

    def predict(self, graph, test_indices, conditional_node_to_label_map=None):
        """Predict labels for the test nodes.

        Args:
            graph: graph whose ``node_list`` is indexed by ``test_indices``.
            test_indices: indices of the nodes to label.
            conditional_node_to_label_map: mapping from node to known label.
                It is updated in place with the predictions; when ``None`` a
                fresh empty mapping is used.

        Returns:
            A list with the final label of each test node, in the order of
            ``test_indices``.
        """
        if conditional_node_to_label_map is None:
            conditional_node_to_label_map = {}
        label_map = conditional_node_to_label_map

        # Bootstrap: give every test node a label from the local classifier.
        initial = self.local_classifier.predict(graph, test_indices)
        self.cond_mp_upd(graph, label_map, initial, test_indices)

        # Re-predict one node at a time so that each prediction sees the most
        # recent labels of the nodes processed before it.
        for _ in range(self.max_iteration):
            for x in test_indices:
                single = [x]
                updated = list(
                    self.relational_classifier.predict(graph, single, label_map))
                self.cond_mp_upd(graph, label_map, updated, single)

        return [label_map[graph.node_list[ti]] for ti in test_indices]

    def cond_mp_upd(self, graph, conditional_map, pred, indices):
        """Store ``pred[k]`` as the label of node ``indices[k]`` in ``conditional_map``."""
        for index, label in zip(indices, pred):
            conditional_map[graph.node_list[index]] = label


In [82]:
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Cross-validated evaluation of LocalClassifier on the Cornell WebKB data.
graph, domain_labels = load_links_data('WebKB/content/cornell.content', 'WebKB/cites/cornell.cites')

# The loop below iterates over `kf`, which was never defined in this cell.
# NOTE(review): ten folds matches the ten accuracies in the recorded output;
# shuffling and the seed are assumptions — confirm against the original run.
kf = KFold(len(graph.node_list), n_folds=10, shuffle=True, random_state=0)

accuracies = []  # one accuracy per fold

cm = None  # running sum of the per-fold confusion matrices

for train, test in kf:
    clf = LocalClassifier()
    clf.fit(graph, train)
    y_pred = clf.predict(graph, test)
    y_true = [graph.node_list[t].label for t in test]
    accuracies.append(accuracy_score(y_true, y_pred))
    # Passing `labels` fixes the row/column order so the matrices can be summed.
    fold_cm = confusion_matrix(y_true, y_pred, labels = domain_labels)
    if cm is None:
        cm = fold_cm
    else:
        cm += fold_cm


print(accuracies)
print("Mean accuracy: %0.4f +- %0.4f" % (np.mean(accuracies), np.std(accuracies)))
print(cm)

domain labels
['student', 'project', 'course', 'staff', 'faculty']
[0.5, 0.59999999999999998, 0.80000000000000004, 0.59999999999999998, 0.55000000000000004, 0.78947368421052633, 0.63157894736842102, 0.78947368421052633, 0.68421052631578949, 0.68421052631578949]
Mean accuracy: 0.6629 +- 0.1000
[[64  3  6  7  3]
 [11  3  1  2  2]
 [ 4  0 35  0  3]
 [ 6  0  1 11  1]
 [ 7  1  3  5 16]]


In [84]:
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def create_map(graph,train_indices):
    """Return a dict mapping each selected node of ``graph`` to its label.

    ``train_indices`` are positions in ``graph.node_list``.
    """
    selected = (graph.node_list[idx] for idx in train_indices)
    return {node: node.label for node in selected}


# Repeated random-split evaluation of ICA at several training budgets.
graph, domain_labels = load_links_data('WebKB/content/cornell.content', 'WebKB/cites/cornell.cites')

budget=[0.7,0.8,0.9]  # fraction of the nodes used for training

n=range(len(graph.node_list))

ica_accuracies = defaultdict(list)  # budget -> accuracy of every repetition
iteration = 100
for it in range(iteration):

    for b in budget:
        print('iteration %d budget %s' % (it, b))

        # Seed the split with the repetition number. The previous seed `t`
        # only existed as a variable leaked from an earlier list
        # comprehension, so the split depended on stale notebook state.
        train, test = train_test_split(n, train_size=b, random_state=it)
        # True labels
        y_true=[graph.node_list[t].label for t in test]
        local_clf=LocalClassifier()
        # Get aggregator
        agg= CountAggregator(domain_labels=domain_labels)
        relational_clf=RelationalClassifier( agg )
        ica=ICA(local_clf,relational_clf)
        ica.fit(graph,train)
        # Known labels handed to ICA at prediction time: the training nodes only.
        conditional_node_to_label_map=create_map(graph,train)
        ica_predict=ica.predict(graph,test,conditional_node_to_label_map)
        ica_accuracy=accuracy_score(y_true,ica_predict)
        ica_accuracies[b].append(ica_accuracy)
for b in budget:
    print(str(b)+'\t\t'+str(np.mean(ica_accuracies[b])))


domain labels
['student', 'project', 'course', 'staff', 'faculty']
iteration 0 budget 0.7
iteration 0 budget 0.8
iteration 0 budget 0.9
iteration 1 budget 0.7
iteration 1 budget 0.8
iteration 1 budget 0.9
iteration 2 budget 0.7
iteration 2 budget 0.8
iteration 2 budget 0.9
iteration 3 budget 0.7
iteration 3 budget 0.8
iteration 3 budget 0.9
iteration 4 budget 0.7
iteration 4 budget 0.8
iteration 4 budget 0.9
iteration 5 budget 0.7
iteration 5 budget 0.8
iteration 5 budget 0.9
iteration 6 budget 0.7
iteration 6 budget 0.8
iteration 6 budget 0.9
iteration 7 budget 0.7
iteration 7 budget 0.8
iteration 7 budget 0.9
iteration 8 budget 0.7
iteration 8 budget 0.8
iteration 8 budget 0.9
iteration 9 budget 0.7
iteration 9 budget 0.8
iteration 9 budget 0.9
iteration 10 budget 0.7
iteration 10 budget 0.8
iteration 10 budget 0.9
iteration 11 budget 0.7
iteration 11 budget 0.8
iteration 11 budget 0.9
iteration 12 budget 0.7
iteration 12 budget 0.8
iteration 12 budget 0.9
iteration 13 budget 0.7
ite