In [2]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn import metrics

### Features

- **x1** - source → c → sink
- **x2** - source → c ← sink
- **x3** - source ← c → sink
- **x4** - source ← c ← sink
- **x5** - common nodes (#c)
- **x6** - Number following (Source).
- **x7** - Number of followers (Sink)
- **x8** - Adamic Adar
- **x9** - Jaccard's coeffcient

### Load Training Data

In [6]:
pos_trainData = pd.read_csv('Total_Features/pos_total_ra.xls', sep=',')
print(pos_trainData)

       Unnamed: 0  Unnamed: 0.1  x1  x2  x3  x4  y        x8        HD  \
0               0             0  59   0  57   0  1  5.823572  0.000590   
1               1             1   2   2   2   2  1  0.199040  0.002567   
2               2             2  19   0  11   0  1  2.325911  0.000999   
3               3             3   1   0   1   0  1  0.092985  0.000001   
4               4             4   0   0   0   0  1  0.000000  0.000000   
...           ...           ...  ..  ..  ..  .. ..       ...       ...   
19996       19996         19996  23   0  23   0  1  2.105039  0.001442   
19997       19997         19997  69   0  69   0  1  7.528091  0.001964   
19998       19998         19998   0   0   0   0  1  0.000000  0.000000   
19999       19999         19999  35   0  34   0  1  3.891917  0.000254   
20000       20000         20000   0   0   0   0  1  0.000000  0.000000   

             ra        x9           LHN        SI        SC        HP  
0      0.003815  0.000590  8.310343e-06

In [7]:
neg_trainData = pd.read_csv('Total_Features/neg_total_ra.xls', sep=',')
print(neg_trainData)

       Unnamed: 0  Unnamed: 0.1  x1  x2  x3  x4  y        x8        x9  \
0               0             0   1   0   1   0  0  0.088650  0.000366   
1               1             1   0   0   0   0  0  0.000000  0.000000   
2               2             2   0   0   0   0  0  0.000000  0.000000   
3               3             3   0   0   0   0  0  0.000000  0.000000   
4               4             4   0   0   0   0  0  0.000000  0.000000   
...           ...           ...  ..  ..  ..  .. ..       ...       ...   
19996       19996         19996   0   0   0   0  0  0.000000  0.000000   
19997       19997         19997   0   0   0   0  0  0.000000  0.000000   
19998       19998         19998  14   0   9   0  0  1.566421  0.020380   
19999       19999         19999   0   0   0   0  0  0.000000  0.000000   
20000       20000         20000   0   0   0   0  0  0.000000  0.000000   

             ra        HP        SC        SI       LHN        HD  
0      0.000013  0.090909  0.005779  0.0003

In [8]:
trainData = pd.concat([pos_trainData, neg_trainData], ignore_index=True)
print(trainData)

       Unnamed: 0  Unnamed: 0.1  x1  x2  x3  x4  y        x8        HD  \
0               0             0  59   0  57   0  1  5.823572  0.000590   
1               1             1   2   2   2   2  1  0.199040  0.002567   
2               2             2  19   0  11   0  1  2.325911  0.000999   
3               3             3   1   0   1   0  1  0.092985  0.000001   
4               4             4   0   0   0   0  1  0.000000  0.000000   
...           ...           ...  ..  ..  ..  .. ..       ...       ...   
39997       19996         19996   0   0   0   0  0  0.000000  0.000000   
39998       19997         19997   0   0   0   0  0  0.000000  0.000000   
39999       19998         19998  14   0   9   0  0  1.566421  0.025168   
40000       19999         19999   0   0   0   0  0  0.000000  0.000000   
40001       20000         20000   0   0   0   0  0  0.000000  0.000000   

             ra        x9           LHN        SI        SC        HP  
0      0.003815  0.000590  8.310343e-06

In [9]:
trainData = shuffle(trainData).reset_index(drop=True)
print(trainData)

       Unnamed: 0  Unnamed: 0.1  x1    x2  x3  x4  y          x8        HD  \
0           19112         19112  10     0  10   0  1    0.995184  0.000379   
1           19471         19471   0     0   0   0  0    0.000000  0.000000   
2            3069          3069   0     0   0   0  0    0.000000  0.000000   
3            9703          9703   0     0   0   0  0    0.000000  0.000000   
4           11433         11433  64  1825  57  54  1  374.379393  0.412503   
...           ...           ...  ..   ...  ..  .. ..         ...       ...   
39997       13419         13419   4     0   4   0  1    0.429713  0.000047   
39998       17227         17227  81     0  76   0  1    9.246682  0.007901   
39999       13104         13104   2     0   2   0  1    0.190345  0.000023   
40000        6029          6029   0     0   0   0  0    0.000000  0.000000   
40001        9177          9177   0     0   0   0  0    0.000000  0.000000   

              ra        x9       LHN        SI        SC       

### Calculate Measure

In [10]:
def calculate_AUC (X_train, X_test, y_train, y_test, clf) :
    try :
        predict_prob = clf.predict_proba(X_test)[:,1:]
        y_pred_train = clf.predict(X_train)[:,1:]
        y_pred_test = clf.predict(X_test)[:,1:]
    except:
        predict_prob = clf.predict(X_test)
        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)
    
    auc = metrics.roc_auc_score(y_test, predict_prob)
    try: 
        acc = metrics.accuracy_score(y_test, predict_prob)
    except:
        acc = None
    
    w_sklearn = np.r_[clf.intercept_, clf.coef_.squeeze()]
    print("Weights: {}".format(w_sklearn))
    print("Accuracy: {}".format(acc))
    print("ROC AUC: {}".format(auc))
    print('Train MSE:', mean_squared_error(y_pred_train, y_train))
    print('Test MSE:', mean_squared_error(y_pred_test, y_test))
    return

In [11]:
def mean_squared_error(y_true, y_pred):
    return np.mean((y_pred - y_true)**2) 

### Prepare Models

In [12]:
y = trainData['y']
print(y)

0        1
1        0
2        0
3        0
4        1
        ..
39997    1
39998    1
39999    1
40000    0
40001    0
Name: y, Length: 40002, dtype: int64


### Selecting Different Features

In [14]:
from sklearn.model_selection import train_test_split
X_9feat = trainData[['x1', 'x2', 'x3', 'x4', 'x8', 'x9', 'SI', 'SC', 'HP']]
X_6feat_final = trainData[['x1', 'x2', 'x3', 'x4', 'x8', 'ra']]
X_common_JC_ad = trainData[['x1', 'x2', 'x3', 'x4', 'x8', 'x9']]
X_3feat = trainData[['x8', 'x9', 'ra']]
X = X_6feat_final

### Initial Tests - Logistic Regression, Linear Regression, Ridge Regression

In [15]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90051)
print("Training set has {} instances. Test set has {} instances.".format(X_train.shape[0], X_test.shape[0]))

print('\n------Logistic Regression------')
model = LogisticRegression()
model.fit(X_train, y_train)
calculate_AUC(X_train, X_test, y_train, y_test, model)

print('\n-------Linear Regression-------')
lr = LinearRegression()
lr.fit(X_train, y_train)
calculate_AUC(X_train, X_test, y_train, y_test, lr)

print('\n--------Ridge Regression-------')
rr = linear_model.Ridge(alpha=.5)
rr.fit(X_train, y_train)
calculate_AUC(X_train, X_test, y_train, y_test, rr)

Training set has 32001 instances. Test set has 8001 instances.

------Logistic Regression------
Weights: [-1.21463659  0.16701439 -1.25900842 -0.37111649  0.41500732  8.98760229
 -0.44783611]
Accuracy: 0.842644669416323
ROC AUC: 0.8418250809203044
Train MSE: 0.16227617886941034
Test MSE: 0.15735533058367704

-------Linear Regression-------
Weights: [ 4.19384814e-01  3.74816549e-03 -8.25172831e-05  2.96638818e-03
 -1.79644486e-03 -1.74269086e-04  1.08642948e-03]
Accuracy: None
ROC AUC: 0.8686967456915407
Train MSE: 0.21008064412897506
Test MSE: 0.21139588269244167

--------Ridge Regression-------
Weights: [ 4.19384815e-01  3.74816694e-03 -8.25201923e-05  2.96638496e-03
 -1.79644151e-03 -1.74253531e-04  1.08638543e-03]
Accuracy: None
ROC AUC: 0.8686967456915407
Train MSE: 0.21008064412896224
Test MSE: 0.21139588246922839


### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

rand_forest = RandomForestClassifier(max_depth=2, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=5, n_estimators=50, n_jobs=4, 
                                     random_state=90051)
rand_forest.fit(X_train, y_train)
#(max_depth=2, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=5, n_estimators=50, n_jobs=4, random_state=90051)

predict_prob = rand_forest.predict_proba(X_test)[:,1:]
y_pred_train = rand_forest.predict(X_train)
y_pred_test = rand_forest.predict(X_test)

print(rand_forest.feature_importances_)
print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_pred_test)))
print("AUC: {}".format(metrics.roc_auc_score(y_test, predict_prob)))
print('Train MSE:', mean_squared_error(y_pred_train, y_train))
print('Test MSE:', mean_squared_error(y_pred_test, y_test))
#RandomForestClassifier(n_estimators=100, *, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None,  n_jobs=None)

[0.11244849 0.00138874 0.1291978  0.         0.39080046 0.36616451]
Accuracy: 0.8532683414573178
AUC: 0.8722109051826487
Train MSE: 0.14821411830880285
Test MSE: 0.14673165854268216


### SVM

In [17]:
from sklearn import svm

sup_vm = svm.SVC(probability = True)
sup_vm.fit(X_train, y_train)

predict_prob = sup_vm.predict_proba(X_test)[:,1:]
y_pred_train = sup_vm.predict(X_train)
y_pred_test = sup_vm.predict(X_test)

print(sup_vm.support_)
print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_pred_test)))
print("AUC: {}".format(metrics.roc_auc_score(y_test, predict_prob)))
print('Train MSE:', mean_squared_error(y_pred_train, y_train))
print('Test MSE:', mean_squared_error(y_pred_test, y_test))

[   20    23    32 ... 31973 31990 32000]
Accuracy: 0.8086489188851393
AUC: 0.8714112313633353
Train MSE: 0.2020874347676635
Test MSE: 0.19135108111486065


### Load Testing Data

In [18]:
testData = pd.read_csv('Total_Features/Test_Total_Feature.csv', sep=',')
testData = testData[['x1', 'x2', 'x3', 'x4', 'x8', 'x9']]
print(testData)
print(testData.shape)

      x1  x2  x3  x4        x8        x9
0      0   0   0   0  0.000000  0.000000
1      0   0   0   0  0.000000  0.000000
2      2   0   2   0  0.459422  0.011152
3      2   0   2   0  0.178376  0.003670
4      0   0   0   0  0.000000  0.000000
...   ..  ..  ..  ..       ...       ...
1995   0   0   0   0  0.000000  0.000000
1996   0   0   1   0  0.100383  0.006061
1997   0   0   0   0  0.000000  0.000000
1998   0   0   0   0  0.000000  0.000000
1999   0   0   0   0  0.000000  0.000000

[2000 rows x 6 columns]
(2000, 6)


### Prediction On Test Data

In [19]:
# model = LogisticRegression()
# model.fit(X, y)


sup_vm = svm.SVC(probability = True, kernel = 'linear')
sup_vm.fit(X, y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [20]:
X_new = testData
##### change below with appropriate model
y_new = sup_vm.predict_proba(X_new)[:,1:]

output = pd.DataFrame({'Id': testData.index+1, 'Predicted': y_new.flatten()})
output.head

<bound method NDFrame.head of         Id  Predicted
0        1   0.228602
1        2   0.228602
2        3   0.959786
3        4   0.453445
4        5   0.228602
...    ...        ...
1995  1996   0.228602
1996  1997   0.328080
1997  1998   0.228602
1998  1999   0.228602
1999  2000   0.228602

[2000 rows x 2 columns]>

In [18]:
output.to_csv('output.csv', sep=",", index=False, header=True)