In [1]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn import metrics

### Features

- **x1** - source → c → sink
- **x2** - source → c ← sink
- **x3** - source ← c → sink
- **x4** - source ← c ← sink
- **x5** - common nodes (#c)
- **x6** - Number following (Source).
- **x7** - Number of followers (Sink)
- **x8** - Adamic Adar
- **x9** - Jaccard's coeffcient

### Load Training Data

In [5]:
pos_trainData = pd.read_csv('Total_Features/pos_total_ra.xls')
print(pos_trainData)

       Unnamed: 0  Unnamed: 0.1  x1  x2  x3  x4  y        x8        HD  \
0               0             0  59   0  57   0  1  5.823572  0.000590   
1               1             1   2   2   2   2  1  0.199040  0.002567   
2               2             2  19   0  11   0  1  2.325911  0.000999   
3               3             3   1   0   1   0  1  0.092985  0.000001   
4               4             4   0   0   0   0  1  0.000000  0.000000   
...           ...           ...  ..  ..  ..  .. ..       ...       ...   
19996       19996         19996  23   0  23   0  1  2.105039  0.001442   
19997       19997         19997  69   0  69   0  1  7.528091  0.001964   
19998       19998         19998   0   0   0   0  1  0.000000  0.000000   
19999       19999         19999  35   0  34   0  1  3.891917  0.000254   
20000       20000         20000   0   0   0   0  1  0.000000  0.000000   

             ra        x9           LHN        SI        SC        HP  
0      0.003815  0.000590  8.310343e-06

In [7]:
neg_trainData = pd.read_csv('Total_Features/neg_total_ra.xls')
print(neg_trainData)

       Unnamed: 0  Unnamed: 0.1  x1  x2  x3  x4  y        x8        x9  \
0               0             0   1   0   1   0  0  0.088650  0.000366   
1               1             1   0   0   0   0  0  0.000000  0.000000   
2               2             2   0   0   0   0  0  0.000000  0.000000   
3               3             3   0   0   0   0  0  0.000000  0.000000   
4               4             4   0   0   0   0  0  0.000000  0.000000   
...           ...           ...  ..  ..  ..  .. ..       ...       ...   
19996       19996         19996   0   0   0   0  0  0.000000  0.000000   
19997       19997         19997   0   0   0   0  0  0.000000  0.000000   
19998       19998         19998  14   0   9   0  0  1.566421  0.020380   
19999       19999         19999   0   0   0   0  0  0.000000  0.000000   
20000       20000         20000   0   0   0   0  0  0.000000  0.000000   

             ra        HP        SC        SI       LHN        HD  
0      0.000013  0.090909  0.005779  0.0003

In [8]:
trainData = pd.concat([pos_trainData, neg_trainData], ignore_index=True)
print(trainData)

       Unnamed: 0  Unnamed: 0.1  x1  x2  x3  x4  y        x8        HD  \
0               0             0  59   0  57   0  1  5.823572  0.000590   
1               1             1   2   2   2   2  1  0.199040  0.002567   
2               2             2  19   0  11   0  1  2.325911  0.000999   
3               3             3   1   0   1   0  1  0.092985  0.000001   
4               4             4   0   0   0   0  1  0.000000  0.000000   
...           ...           ...  ..  ..  ..  .. ..       ...       ...   
39997       19996         19996   0   0   0   0  0  0.000000  0.000000   
39998       19997         19997   0   0   0   0  0  0.000000  0.000000   
39999       19998         19998  14   0   9   0  0  1.566421  0.025168   
40000       19999         19999   0   0   0   0  0  0.000000  0.000000   
40001       20000         20000   0   0   0   0  0  0.000000  0.000000   

             ra        x9           LHN        SI        SC        HP  
0      0.003815  0.000590  8.310343e-06

In [10]:
trainData = shuffle(trainData).reset_index(drop=True)
print(trainData)

       Unnamed: 0  Unnamed: 0.1  x1  x2   x3  x4  y         x8        HD  \
0           17207         17207   0   0    0   0  1   0.000000  0.000000   
1           13319         13319  30   0   30   0  1   2.792255  0.000335   
2           11882         11882   9   0    9   0  1   0.791686  0.000131   
3            3117          3117  58   0   56   0  1   5.803111  0.012748   
4           19015         19015   0   0    0   0  0   0.000000  0.000000   
...           ...           ...  ..  ..  ...  .. ..        ...       ...   
39997       11438         11438   1   0    0   0  0   0.087614  0.002242   
39998       10809         10809   8   0    8   0  1   0.892251  0.000321   
39999       11501         11501  12   0   11   0  1   1.385755  0.000155   
40000       11157         11157  67  47  207  43  1  30.375041  0.165791   
40001       12514         12514   0   0    0   0  0   0.000000  0.000000   

             ra        x9       LHN        SI        SC        HP  
0      0.000000  0.

### Calculate Measure

In [11]:
def calculate_AUC (X_train, X_test, y_train, y_test, clf) :
    try :
        predict_prob = clf.predict_proba(X_test)[:,1:]
        y_pred_train = clf.predict(X_train)[:,1:]
        y_pred_test = clf.predict(X_test)[:,1:]
    except:
        predict_prob = clf.predict(X_test)
        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)
    
    auc = metrics.roc_auc_score(y_test, predict_prob)
    try: 
        acc = metrics.accuracy_score(y_test, predict_prob)
    except:
        acc = None
    
    w_sklearn = np.r_[clf.intercept_, clf.coef_.squeeze()]
    print("Weights: {}".format(w_sklearn))
    print("Accuracy: {}".format(acc))
    print("ROC AUC: {}".format(auc))
    print('Train MSE:', mean_squared_error(y_pred_train, y_train))
    print('Test MSE:', mean_squared_error(y_pred_test, y_test))
    return

In [12]:
def mean_squared_error(y_true, y_pred):
    return np.mean((y_pred - y_true)**2) 

### Prepare Models

In [13]:
y = trainData['y']
print(y)

0        1
1        1
2        1
3        1
4        0
        ..
39997    0
39998    1
39999    1
40000    1
40001    0
Name: y, Length: 40002, dtype: int64


### Selecting Different Features

In [26]:
from sklearn.model_selection import train_test_split
X_9feat = trainData[['x1', 'x2', 'x3', 'x4', 'x8', 'x9', 'SI', 'SC', 'HP']]
X_6feat_final = trainData[['x1', 'x2', 'x3', 'x4', 'x8', 'ra']]
X_common_JC_ad = trainData[['x1', 'x2', 'x3', 'x4', 'x8', 'x9']]
X_3feat = trainData[['x8', 'x9', 'ra']]
X = X_6feat_final

### Initial Tests

In [16]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90051)
print("Training set has {} instances. Test set has {} instances.".format(X_train.shape[0], X_test.shape[0]))

print('\n------Logistic Regression------')
model = LogisticRegression()
model.fit(X_train, y_train)
calculate_AUC(X_train, X_test, y_train, y_test, model)

print('\n-------Linear Regression-------')
lr = LinearRegression()
lr.fit(X_train, y_train)
calculate_AUC(X_train, X_test, y_train, y_test, lr)

print('\n--------Ridge Regression-------')
rr = linear_model.Ridge(alpha=.5)
rr.fit(X_train, y_train)
calculate_AUC(X_train, X_test, y_train, y_test, rr)

Training set has 32001 instances. Test set has 8001 instances.

------Logistic Regression------
Weights: [-1.20721851  0.11580326 -1.26353888 -0.38439932  0.50649421  9.13202787
 -0.43306146]
Accuracy: 0.8360204974378203
ROC AUC: 0.8341702158241909
Train MSE: 0.16343239273772694
Test MSE: 0.16397950256217972

-------Linear Regression-------
Weights: [ 4.21853703e-01  4.13255967e-03 -7.63562050e-05  2.31638013e-03
 -1.49057107e-03 -2.66502070e-04  1.75262831e-03]
Accuracy: None
ROC AUC: 0.8659383055910559
Train MSE: 0.2109752434851763
Test MSE: 0.20771365511620787

--------Ridge Regression-------
Weights: [ 4.21853704e-01  4.13256138e-03 -7.63602876e-05  2.31637598e-03
 -1.49056647e-03 -2.66480157e-04  1.75256574e-03]
Accuracy: None
ROC AUC: 0.8659383055910559
Train MSE: 0.2109752434852292
Test MSE: 0.20771365495954186


### Performce Stats: Different Models

In [17]:
from sklearn.model_selection import cross_validate

def average (list_num) :
    return sum(list_num)/len(list_num)

def stats_classification (model, X) :
    scoring = ['accuracy', 'roc_auc', 'precision_macro', 'recall_macro', 'f1_macro', 'neg_mean_squared_error']
    scores = cross_validate(model, X, y, scoring=scoring, cv=10)
    accuracy = 100*average(scores['test_accuracy'])
    auc = 100*average(scores['test_roc_auc'])
    precision = 100*average(scores['test_precision_macro'])
    recall = 100*average(scores['test_recall_macro'])
    f1 = 100*average(scores['test_f1_macro'])
    mse = -1*average(scores['test_neg_mean_squared_error'])
    
    return [accuracy, auc, precision, recall, f1, mse]

def stats_regression (model, X) :
    scoring = ['roc_auc', 'neg_mean_squared_error']
    scores = cross_validate(model, X, y, scoring=scoring, cv=10)
    auc = 100*average(scores['test_roc_auc'])
    mse = -1*average(scores['test_neg_mean_squared_error'])
    
    return [0, auc, 0, 0, 0, mse]

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

model_1 = LogisticRegression()
model_5 = LogisticRegression(penalty = 'none')
model_2 = RandomForestClassifier(random_state=90051)
model_3 = svm.SVC(kernel = 'linear')
model_4 = LinearRegression()

model_list = [model_3, model_1, model_2, model_4]
model_name = ['SVM', 'Logistic Regression', 'Random Forest', 'Linear Regression']
performance = [['Model', 'Accuracy', 'ROC AUC', 'Precision Macro', 'Recall Macro', 'F1 Macro', 'MSE']]

for i, model in enumerate(model_list) :
    try :
        new_list = stats_classification(model, X)
    except :
        new_list = stats_regression(model, X)
    new_list.insert(0, model_name[i])
    performance.append(new_list)

In [19]:
performance_df = pd.DataFrame(performance)

print(performance_df)

                     0         1        2                3             4  \
0                Model  Accuracy  ROC AUC  Precision Macro  Recall Macro   
1                  SVM   83.8633  87.2654          86.5014       83.8633   
2  Logistic Regression   83.7583  87.2882          86.6532       83.7583   
3        Random Forest   85.2707   87.251          86.9254       85.2707   
4    Linear Regression         0  86.9851                0             0   

          5         6  
0  F1 Macro       MSE  
1   83.5663  0.161367  
2    83.431  0.162417  
3   85.1036  0.147293  
4         0    0.2106  


In [20]:
performance_df = performance_df
print(performance_df)
performance_df.to_csv('model-performance.csv', sep=",", index=False, header=True)

                     0         1        2                3             4  \
0                Model  Accuracy  ROC AUC  Precision Macro  Recall Macro   
1                  SVM   83.8633  87.2654          86.5014       83.8633   
2  Logistic Regression   83.7583  87.2882          86.6532       83.7583   
3        Random Forest   85.2707   87.251          86.9254       85.2707   
4    Linear Regression         0  86.9851                0             0   

          5         6  
0  F1 Macro       MSE  
1   83.5663  0.161367  
2    83.431  0.162417  
3   85.1036  0.147293  
4         0    0.2106  


### Performce Stats: Different Features on SVM

In [27]:
model_1 = svm.SVC(kernel = 'linear')
model_2 = svm.SVC(kernel = 'linear')
model_3 = svm.SVC(kernel = 'linear')

model_list = [model_1, model_2, model_3]
model_name = ['6', '9', '3']
performance2 = [['Features', 'Accuracy', 'ROC AUC', 'Precision Macro', 'Recall Macro', 'F1 Macro', 'MSE']]

new_list = stats_classification(model_1, X_6feat_final)
new_list.insert(0, model_name[0])
performance2.append(new_list)

new_list = stats_classification(model_2, X_9feat)
new_list.insert(0, model_name[1])
performance2.append(new_list)

new_list = stats_classification(model_3, X_3feat)
new_list.insert(0, model_name[2])
performance2.append(new_list)

done-1
done-2


In [29]:
performance2_df = pd.DataFrame(performance2)

print(performance2_df)

performance2_df.to_csv('Feature-performance.csv', sep=",", index=False, header=True)


          0         1        2                3             4         5  \
0  Features  Accuracy  ROC AUC  Precision Macro  Recall Macro  F1 Macro   
1         6   83.8633  87.2654          86.5014       83.8633   83.5663   
2         9   84.5333  88.2825          87.1846       84.5333   84.2525   
3         3   84.0083  87.3879          86.7598       84.0083   83.7034   

          6  
0       MSE  
1  0.161367  
2  0.154667  
3  0.159917  


### Random Forest Test

In [37]:
from sklearn.ensemble import RandomForestClassifier

rand_forest = RandomForestClassifier(random_state=90051)
rand_forest.fit(X_train, y_train)

predict_prob = rand_forest.predict_proba(X_test)[:,1:]
y_pred_train = rand_forest.predict(X_train)
y_pred_test = rand_forest.predict(X_test)

print(rand_forest.feature_importances_)
print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_pred_test)))
print("AUC: {}".format(metrics.roc_auc_score(y_test, predict_prob)))
print('Train MSE:', mean_squared_error(y_pred_train, y_train))
print('Test MSE:', mean_squared_error(y_pred_test, y_test))

[0.23791687 0.00151816 0.21301399 0.         0.40388157 0.1436694 ]
Accuracy: 0.8568928883889514
AUC: 0.8774879743506138
Train MSE: 0.1499640636230118
Test MSE: 0.1431071116110486


### Observing Weights of Different Models

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

def print_w (clf) :
    try :
        w_sklearn = np.r_[clf.intercept_, clf.coef_.squeeze()]
    except :
        w_sklearn = clf.feature_importances_
    print("Weights: {}".format(w_sklearn))

model_1 = LogisticRegression()
model_2 = RandomForestClassifier(random_state=90051)
model_3 = svm.SVC(kernel = 'linear')
model_4 = LinearRegression()

model_list = [model_3, model_1, model_2, model_4]
model_name = ['SVM', 'Logistic Regression', 'Random Forest', 'Linear Regression']

for i, model in enumerate(model_list) :
    model.fit(X_common_JC_ad, y)
    print("-----", model_name[i], "-----")
    print_w(model)

    

----- SVM -----
Weights: [-1.00018235  0.13249002 -1.45003852 -0.62535861  0.66844795 10.68519869
 -8.18953907]
----- Logistic Regression -----
Weights: [-1.22032636  0.12081511 -1.31380812 -0.38714932  0.50128681  9.43641152
 -3.20296236]
----- Random Forest -----
Weights: [0.20474815 0.00523881 0.17697839 0.0008182  0.39673105 0.2154854 ]
----- Linear Regression -----
Weights: [ 4.08012199e-01  5.24310463e-03 -2.76687286e-04  8.37344507e-04
 -2.08289480e-03  5.91402630e-04  5.36455404e+00]


### Observing Weights when features are normalised

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_trans = scaler.fit_transform(X_all)

model_1 = LogisticRegression()
model_5 = LogisticRegression(penalty = 'none')

model_list = [model_1, model_5]
model_name = ['Logistic Regression with L2 reg', 'Logistic Regression without L2 reg']

for i, model in enumerate(model_list) :
    model.fit(X_common_JC_ad, y)
    print("-----", model_name[i], "-----")
    print_w(model)

----- Logistic Regression with L2 reg -----
Weights: [-1.22032636  0.12081511 -1.31380812 -0.38714932  0.50128681  9.43641152
 -3.20296236]
----- Logistic Regression without L2 reg -----
Weights: [  -1.20925769   -0.12216648   -2.10523217   -0.7258134     1.18281419
   15.88603572 -119.02839379]
