In [1]:
import csv
import pickle

import networkx as nx
import numpy as np
import pandas as pd
import pandas_profiling as pp
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm import tqdm

Load the training data and build the graph

In [2]:
# Parse the labelled pairs from training.txt. Each row is split on whitespace:
# the first two fields are registered as graph nodes and the third field is the
# label. Only rows labelled '1' add a directed edge first -> second.
G = nx.DiGraph()
training_set = []
with open("training.txt", "r") as f:
    for line in f:
        line = line.split()
        training_set.append(line)
        G.add_nodes_from(line[:2])
        if line[2] != '1':
            continue
        G.add_edge(line[0], line[1])

# Keep the raw rows as an array so columns can be sliced later.
training_set = np.array(training_set)

In [11]:
# Read the pairs to score from testing.txt: one whitespace-split row per line,
# stacked into an array.
with open("testing.txt", "r") as f:
    testing_set = np.array([row.split() for row in f])

# Feature engineering

In [4]:
# Node-level scores are computed once on the whole graph and stored in
# `precompute`, so that per-pair feature extraction only has to do lookups.
precompute = {}

print("preparing pagerank")
precompute["pr"] = nx.pagerank(G, alpha=0.85)
# Mean score, kept as a fallback value for nodes without a PageRank entry.
precompute["mean_pr"] = float(sum(precompute["pr"].values())) / len(precompute["pr"])
print("->.....OK")

print("preparing weakly_connected_components")
# One set of nodes per weakly connected component of the graph.
precompute["wcc"] = list(nx.weakly_connected_components(G))
print("->.....OK")

print("preparing katz")
# Use the documented top-level function rather than the `nx.katz` submodule path.
precompute["katz"] = nx.katz_centrality(G, alpha=0.005, beta=1)
precompute["mean_katz"] = float(sum(precompute["katz"].values())) / len(precompute["katz"])
print("->.....OK")

print("preparing hits")
precompute["hits"] = nx.hits(G, max_iter=100, tol=1e-08, nstart=None, normalized=True)
print("->.....OK")

# Weight for the source and destination of each link:
#   Weight_in[n]  = 1 / sqrt(1 + number of predecessors of n)
#   Weight_out[n] = 1 / sqrt(1 + number of successors of n)
print("preparing weight")
precompute["Weight_in"] = {}
precompute["Weight_out"] = {}
for i in G.nodes():
    # On a DiGraph the in/out degree equals the predecessor/successor count,
    # so there is no need to build the neighbour sets just to measure them.
    precompute["Weight_in"][i] = 1.0 / np.sqrt(1 + G.in_degree(i))
    precompute["Weight_out"][i] = 1.0 / np.sqrt(1 + G.out_degree(i))

# Means, used to impute the weight of nodes that have no entry.
precompute["mean_weight_in"] = np.mean(list(precompute["Weight_in"].values()))
precompute["mean_weight_out"] = np.mean(list(precompute["Weight_out"].values()))
print("->.....OK")



preparing pagerank
->.....OK
preparing weakly_connected_components
->.....OK
preparing katz
->.....OK
preparing hits
->.....OK
preparing weight


In [5]:
class GraphExtractedFeatures:
    """Topological link-prediction features for a candidate pair ``(source, target)``.

    Usage: select the features with :meth:`set_features`, then call
    :meth:`getFeatures` once per pair. ``getFeatures`` caches the pair's
    neighbourhood data on the instance (``kx_out``, ``inter_in``, ``Ax`` ...)
    and the ``_<name>`` feature methods read that cache, so a feature method is
    only meaningful after ``getFeatures`` prepared the same pair.

    In every feature method ``nodes`` is an indexable pair: ``nodes[0]`` is the
    source and ``nodes[1]`` the target.
    """

    def __init__(self, G, precom):
        """Keep the graph and the precomputed node-level scores.

        Parameters
        ----------
        G : directed networkx graph (``successors``/``predecessors`` are used).
        precom : dict
            Must provide the keys "pr", "mean_pr", "wcc", "katz", "mean_katz",
            "hits", "Weight_in", "Weight_out", "mean_weight_in" and
            "mean_weight_out".
        """
        self.G = G
        self.undir_G = nx.to_undirected(G)
        self.compute_degree_avg()

        self.pr = precom["pr"]
        self.mean_pr = precom["mean_pr"]

        self.wcc = precom["wcc"]

        self.katz = precom["katz"]
        self.mean_katz = precom["mean_katz"]

        # Indexed as hits[0] (hub scores) and hits[1] (authority scores) below;
        # presumably the (hubs, authorities) pair returned by nx.hits.
        self.hits = precom["hits"]

        # weight for source and destination of each link
        self.Weight_in = precom["Weight_in"]
        self.Weight_out = precom["Weight_out"]

        # for imputing with mean
        self.mean_weight_in = precom["mean_weight_in"]
        self.mean_weight_out = precom["mean_weight_out"]

    def set_features(self, F):
        """Select the features to compute.

        ``F`` is a dict with the keys "directed" and "hierachical", each a list
        of feature names; a name ``x`` is resolved to the method ``_x``.
        """
        self.features = F

    def compute_degree_avg(self):
        """Store the mean degree of the undirected graph in ``self.degree_avg``.

        An empty graph yields 0 instead of raising ZeroDivisionError.
        """
        degrees = [deg for _, deg in nx.degree(self.undir_G, self.undir_G.nodes())]
        self.degree_avg = sum(degrees) / len(degrees) if degrees else 0

    def getCN_nodes(self, nodes):
        """Return the list of common neighbours of the pair in the undirected graph."""
        return list(set(self.undir_G.neighbors(nodes[0])) & set(self.undir_G.neighbors(nodes[1])))

    # == Features for undirected Graph ==
    def _cn(self, nodes):
        """Number of common neighbours."""
        return len(self.CN_nodes)

    def _aa(self, nodes):
        """Adamic-Adar style score: sum of 1/ln(degree) over the common neighbours.

        Neighbours of degree <= 1 are skipped: ln(1) == 0 would turn the whole
        score into ``inf``.
        """
        degrees = np.array([self.undir_G.degree(v) for v in self.CN_nodes], dtype=float)
        degrees = degrees[degrees > 1]
        if degrees.size == 0:
            return 0
        return np.sum(1.0 / np.log(degrees))

    def _ra(self, nodes):
        """Resource-allocation style score: sum of 1/degree over the common neighbours."""
        degrees = np.array([self.undir_G.degree(v) for v in self.CN_nodes], dtype=float)
        degrees = degrees[degrees > 0]
        if degrees.size == 0:
            return 0
        return np.sum(1.0 / degrees)

    # == Features for directed Graph ==
    def _s_out(self, nodes):
        """Number of successors of the source."""
        return self.kx_out

    def _t_out(self, nodes):
        """Number of successors of the target."""
        return self.ky_out

    def _s_in(self, nodes):
        """Number of predecessors of the source."""
        return self.kx_in

    def _t_in(self, nodes):
        """Number of predecessors of the target."""
        return self.ky_in

    def _st_inter_out(self, nodes):
        """Number of successors shared by source and target."""
        return len(self.inter_out)

    def _st_inter_in(self, nodes):
        """Number of predecessors shared by source and target."""
        return len(self.inter_in)

    def _jc_out(self, nodes):
        """Jaccard coefficient of the two successor sets (0 when both are empty)."""
        if len(self.union_out) == 0:
            return 0
        return len(self.inter_out) / len(self.union_out)

    def _jc_in(self, nodes):
        """Jaccard coefficient of the two predecessor sets (0 when both are empty)."""
        if len(self.union_in) == 0:
            return 0
        return len(self.inter_in) / len(self.union_in)

    def _cos_out(self, nodes):
        """Shared successors divided by the product of the two out-degrees.

        NOTE(review): the denominator is the plain degree product, not its
        square root as in the usual cosine/Salton index - confirm this is
        intended before changing it.
        """
        if self.kx_out == 0 or self.ky_out == 0:
            return 0
        return len(self.inter_out) / (self.kx_out * self.ky_out)

    def _cos_in(self, nodes):
        """Shared predecessors divided by the product of the two in-degrees.

        NOTE(review): same remark as for ``_cos_out`` regarding the missing
        square root in the denominator.
        """
        if self.kx_in == 0 or self.ky_in == 0:
            return 0
        return len(self.inter_in) / (self.kx_in * self.ky_in)

    def _aa_d(self, nodes):
        """Directed Adamic-Adar variant: sum of 1/log10(in-degree) over shared successors.

        Shared successors with in-degree <= 1 are skipped because
        log10(1) == 0 would make the score ``inf``.
        """
        in_degrees = np.array(
            [len(list(self.G.predecessors(v))) for v in self.inter_out], dtype=float
        )
        in_degrees = in_degrees[in_degrees > 1]
        if in_degrees.size == 0:
            return 0
        return np.sum(1.0 / np.log10(in_degrees))

    def _follows_back(self, nodes):
        """1 if the reverse edge target -> source exists, else 0."""
        return 1 if self.G.has_edge(nodes[1], nodes[0]) else 0

    def _sweight_out(self, nodes):
        """Out-weight of the source (mean out-weight when the node has no entry)."""
        return self.Weight_out.get(nodes[0], self.mean_weight_out)

    def _tweight_in(self, nodes):
        """In-weight of the target (mean in-weight when the node has no entry)."""
        return self.Weight_in.get(nodes[1], self.mean_weight_in)

    def _weight_f1(self, nodes):
        """Sum of the source out-weight and the target in-weight."""
        return self._sweight_out(nodes) + self._tweight_in(nodes)

    def _weight_f2(self, nodes):
        """Product of the source out-weight and the target in-weight."""
        return self._sweight_out(nodes) * self._tweight_in(nodes)

    def _pr_s(self, nodes):
        """PageRank of the source; falls back to the mean instead of ``None``."""
        return self.pr.get(nodes[0], self.mean_pr)

    def _pr_t(self, nodes):
        """PageRank of the target; falls back to the mean instead of ``None``."""
        return self.pr.get(nodes[1], self.mean_pr)

    def _katz_s(self, nodes):
        """Katz centrality of the source; falls back to the mean instead of ``None``."""
        return self.katz.get(nodes[0], self.mean_katz)

    def _katz_t(self, nodes):
        """Katz centrality of the target; falls back to the mean instead of ``None``."""
        return self.katz.get(nodes[1], self.mean_katz)

    def _hubs_s(self, nodes):
        """``hits[0]`` score of the source (``None`` when the node has no entry)."""
        return self.hits[0].get(nodes[0])

    def _hubs_t(self, nodes):
        """``hits[0]`` score of the target (``None`` when the node has no entry)."""
        return self.hits[0].get(nodes[1])

    def _auto_s(self, nodes):
        """``hits[1]`` score of the source (``None`` when the node has no entry)."""
        return self.hits[1].get(nodes[0])

    def _auto_t(self, nodes):
        """``hits[1]`` score of the target (``None`` when the node has no entry)."""
        return self.hits[1].get(nodes[1])

    def _pa(self, nodes):
        """Product of the in-degrees of source and target."""
        return self.kx_in * self.ky_in

    def _ded(self, nodes):
        """|Ax & Dy| / |Ax|: share of the source's successors that precede the target."""
        if len(self.Ax) == 0:
            return 0
        return len(set(self.Ax) & set(self.Dy)) / len(self.Ax)

    def _ind(self, nodes):
        """|Dx & Dy| / |Dx|: share of the source's predecessors that also precede the target."""
        if len(self.Dx) == 0:
            return 0
        return len(set(self.Dx) & set(self.Dy)) / len(self.Dx)

    def _ded_log(self, nodes):
        """``_ded`` with its numerator scaled by ln|Ax|.

        Uses the same ``Ax & Dy`` intersection as ``_ded`` (the previous code
        intersected with ``Ay``, which made the two variants disagree).
        """
        if len(self.Ax) == 0:
            return 0
        return len(set(self.Ax) & set(self.Dy)) * np.log(len(self.Ax)) / len(self.Ax)

    def _ind_log(self, nodes):
        """``_ind`` with its numerator scaled by ln|Dx|."""
        if len(self.Dx) == 0:
            return 0
        return len(set(self.Dx) & set(self.Dy)) * np.log(len(self.Dx)) / len(self.Dx)

    def _inf(self, nodes):
        """``_ded + _ind``, mirroring ``_inf_log`` (previously ``_ded`` was added twice)."""
        return self._ded(nodes) + self._ind(nodes)

    def _inf_log(self, nodes):
        """``_ded_log + _ind_log``."""
        return self._ded_log(nodes) + self._ind_log(nodes)

    def hierachical(self, name, nodes):
        """Feature ``name`` if the source has more predecessors than the target, else 0."""
        return getattr(self, '_' + name)(nodes) if len(self.Dx) > len(self.Dy) else 0

    def getFeatures(self, nodes):
        """Compute the selected features for one pair.

        Refreshes the per-pair cache on the instance, then evaluates every name
        in ``self.features["directed"]`` followed by every name in
        ``self.features["hierachical"]``.

        Returns
        -------
        numpy.ndarray with one value per selected feature, in that order.
        """
        source, target = nodes[0], nodes[1]
        self.CN_nodes = self.getCN_nodes(nodes)

        # Build each neighbour set once; every cached quantity derives from them.
        succ_s = set(self.G.successors(source))
        succ_t = set(self.G.successors(target))
        pred_s = set(self.G.predecessors(source))
        pred_t = set(self.G.predecessors(target))

        self.kx_out = len(succ_s)
        self.ky_out = len(succ_t)
        self.inter_out = list(succ_s & succ_t)
        self.union_out = list(succ_s | succ_t)

        self.kx_in = len(pred_s)
        self.ky_in = len(pred_t)
        self.inter_in = list(pred_s & pred_t)
        self.union_in = list(pred_s | pred_t)

        self.CN_nb = self._cn(nodes)
        # D* holds predecessors and A* successors of the respective node.
        self.kx = self.undir_G.degree(source)
        self.Dx = sorted(pred_s)
        self.Ax = sorted(succ_s)

        self.ky = self.undir_G.degree(target)
        self.Dy = sorted(pred_t)
        self.Ay = sorted(succ_t)

        ft_directed = [getattr(self, '_' + name)(nodes) for name in self.features["directed"]]
        ft_hierachical = [self.hierachical(name, nodes) for name in self.features["hierachical"]]
        return np.array(ft_directed + ft_hierachical)

In [6]:
# Feature extractor bound to the training graph and its precomputed node scores.
GF = GraphExtractedFeatures(G, precompute)

In [7]:
# Feature names to extract, in output-column order. getFeatures resolves each
# name to the method `_<name>`; "hierachical" names are routed through
# GF.hierachical instead of being called directly.
# NOTE(review): "cos_out", "cos_in" and "pr_t" are listed twice, so those
# columns are duplicated in the feature matrix. They are kept as-is because the
# column count (and any later column slicing) depends on this exact list.
directed_features = [
    "weight_f1", "pr_t", "auto_t", "weight_f2",
    "s_out", "aa_d", "hubs_s", "jc_in", "jc_out", "cos_out", "cos_in", "hubs_t",
    "cos_out", "cos_in", "st_inter_in",
    "pr_s", "pr_t", "t_in", "auto_s", "pa", "ded", "ind",
    "inf", "inf_log", "katz_s", "katz_t", "st_inter_out", "sweight_out", "tweight_in", "cn",
    "ra",
]
hierachical_features = ["ra", "aa"]

features = {"directed": directed_features, "hierachical": hierachical_features}
GF.set_features(features)

In [8]:
#embedding for 1 label edge
X = []
for i in tqdm(range(training_set.shape[0])):
    X.append(GF.getFeatures(training_set[i,:2]))
X = np.vstack(X)  

100%|██████████| 453797/453797 [09:23<00:00, 805.45it/s] 


In [12]:
# Feature matrix of the testing pairs: one row per testing row.
test = np.vstack([
    GF.getFeatures(testing_set[row])
    for row in tqdm(range(len(testing_set)))
])

100%|██████████| 113450/113450 [02:22<00:00, 797.84it/s] 


In [16]:
# Sanity check: (number of testing pairs, number of feature columns).
test.shape

(113450, 33)

In [10]:
# Labels: third column of the training rows, converted from text to int.
y = np.array(list(map(int, training_set[:, 2])))
y

array([1, 0, 1, ..., 0, 1, 0])

In [20]:
# Row positions of each class. np.where on the 1-D label vector returns a
# 1-tuple, unpacked here into the index array itself.
(idx_0,) = np.where(y == 0)
(idx_1,) = np.where(y == 1)

In [22]:
# Number of training rows labelled 0.
idx_0.shape

(170174,)

In [25]:
# KFold is imported in the notebook's top import cell.
# 100-fold splitter; get_n_splits reports the configured number of folds.
# NOTE(review): the count is requested for idx_0 here, while the fold loop
# further down splits X - confirm which array is meant to be split.
kf = KFold(n_splits=100)
kf.get_n_splits(idx_0)

100

In [28]:
# Print the train indices and then the held-out indices of every fold.
for train_index, test_index in kf.split(X):
    print(train_index, test_index, sep="\n")

[  4538   4539   4540 ... 453794 453795 453796]
[   0    1    2 ... 4535 4536 4537]
[     0      1      2 ... 453794 453795 453796]
[4538 4539 4540 ... 9073 9074 9075]
[     0      1      2 ... 453794 453795 453796]
[ 9076  9077  9078 ... 13611 13612 13613]
[     0      1      2 ... 453794 453795 453796]
[13614 13615 13616 ... 18149 18150 18151]
[     0      1      2 ... 453794 453795 453796]
[18152 18153 18154 ... 22687 22688 22689]
[     0      1      2 ... 453794 453795 453796]
[22690 22691 22692 ... 27225 27226 27227]
[     0      1      2 ... 453794 453795 453796]
[27228 27229 27230 ... 31763 31764 31765]
[     0      1      2 ... 453794 453795 453796]
[31766 31767 31768 ... 36301 36302 36303]
[     0      1      2 ... 453794 453795 453796]
[36304 36305 36306 ... 40839 40840 40841]
[     0      1      2 ... 453794 453795 453796]
[40842 40843 40844 ... 45377 45378 45379]
[     0      1      2 ... 453794 453795 453796]
[45380 45381 45382 ... 49915 49916 49917]
[     0      1      2 

In [26]:
# Persist the training feature matrix and the label vector in the working
# directory.
np.save("X.npy", X)
np.save('y.npy', y)


In [17]:
# Persist the current testing feature matrix.
np.save('X_test_old.npy', test)

split data

In [24]:
# Recompute the testing feature matrix (same computation as the earlier
# testing cell, without the progress bar).
test = np.vstack([
    GF.getFeatures(testing_set[row])
    for row in range(len(testing_set))
])

In [25]:
# Persist the recomputed testing feature matrix.
np.save("test.npy", test)

In [9]:
# pred = model.predict(test[:,:-3]).tolist()
# Text form of every prediction, in order. `pred` must already exist in the
# kernel: the line that produced it is commented out above.
predstr = [str(value) for value in pred]

In [45]:
# Pair every prediction with its row position: (id, predicted label as text).
predstr = [str(value) for value in pred]
predictions = zip(range(len(predstr)), predstr)
# Write the output in the format required by Kaggle
# - The handle is deliberately NOT called `pred`: binding it to that name
#   replaced the prediction array with a closed file, so this cell (and any
#   other cell reading `pred`) failed when run again.
# - newline="" is what the csv module asks for, so that no extra blank rows
#   appear on platforms that translate line endings.
with open("predictions.csv", "w", newline="") as csv_file:
    csv_out = csv.writer(csv_file)
    csv_out.writerow(['id','predicted'])
    csv_out.writerows(predictions)

In [18]:
# pickle is imported in the notebook's top import cell.
# NOTE(security): unpickling executes code embedded in the file - only load
# model files from a trusted source.
# NOTE(review): absolute, machine-specific path; consider a configurable
# directory so the notebook runs elsewhere.
MODEL_PATH = "/home/kanva/Workspace/3A/INF554-ML1/Challenge/2-ModelEvaluation/result/handcraft/Random_Forest.sav"
# The context manager closes the file handle, which the bare open() call leaked.
with open(MODEL_PATH, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [19]:
# Replace `test` with a feature matrix stored on disk (this overrides the matrix
# computed earlier in the notebook).
# NOTE(review): absolute, machine-specific path.
test = np.load('/home/kanva/Workspace/3A/INF554-ML1/Challenge/2-ModelEvaluation/data_old/X_test.npy')

In [22]:
# Shape of the matrix loaded from disk; its column count differs from the
# matrix computed in this notebook (compare the recorded outputs).
test.shape

(113450, 26)

In [20]:
# Predict with the loaded model on every column except the last two.
# NOTE(review): the output recorded for this cell is a NameError about `scaler`,
# a name this statement does not use - the output looks stale; re-run to confirm.
pred = loaded_model.predict(test[:,:-2])

NameError: name 'scaler' is not defined

In [44]:
# Display the predicted labels.
pred

array([1, 0, 0, ..., 0, 0, 0])