In [2]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn import metrics

### Features

- **x1** - source → c → sink
- **x2** - source → c ← sink
- **x3** - source ← c → sink
- **x4** - source ← c ← sink
- **x5** - common nodes (#c)
- **x6** - Number following (Source).
- **x7** - Number of followers (Sink)
- **x8** - Adamic Adar
- **x9** - Jaccard's coeffcient

### Load Training Data

In [3]:
pos_trainData = pd.read_csv('Total_Features/Pos_Total_Feature.csv', sep=',')
print(pos_trainData)

       x1  x2  x3  x4  x5  y      x6   x7        x8        x9
0      59   0  57   0  60  1  101514   71  5.823572  0.000590
1       2   2   2   2   2  1     222   36  0.199040  0.001986
2      19   0  11   0  19  1   18915   40  2.325911  0.000998
3       1   0   1   0   1  1  764195    2  0.092985  0.000001
4       0   0   0   0   0  1    4435    1  0.000000  0.000000
...    ..  ..  ..  ..  .. ..     ...  ...       ...       ...
19996  23   0  23   0  23  1   15948   40  2.105039  0.001440
19997  69   0  69   0  69  1   35111  142  7.528091  0.001960
19998   0   0   0   0   0  1    1579    9  0.000000  0.000000
19999  35   0  34   0  35  1  137761   43  3.891917  0.000254
20000   0   0   0   0   0  1  422106    1  0.000000  0.000000

[20001 rows x 10 columns]


In [4]:
neg_trainData = pd.read_csv('Total_Features/Neg_Total_Feature.csv', sep=',')
print(neg_trainData)

       x1  x2  x3  x4  x5    x6   x7  y        x8        x9
0       1   0   1   0   1  2722   11  0  0.088650  0.000366
1       0   0   0   0   0   743    1  0  0.000000  0.000000
2       0   0   0   0   0    79    4  0  0.000000  0.000000
3       0   0   0   0   0     9    1  0  0.000000  0.000000
4       0   0   0   0   0   142    1  0  0.000000  0.000000
...    ..  ..  ..  ..  ..   ...  ... ..       ...       ...
19996   0   0   0   0   0    39    1  0  0.000000  0.000000
19997   0   0   0   0   0    48    1  0  0.000000  0.000000
19998  14   0   9   0  15   584  155  0  1.566421  0.020380
19999   0   0   0   0   0     1    3  0  0.000000  0.000000
20000   0   0   0   0   0   507    3  0  0.000000  0.000000

[20001 rows x 10 columns]


In [5]:
trainData = pd.concat([pos_trainData, neg_trainData], ignore_index=True)
print(trainData)

       x1  x2  x3  x4  x5  y      x6   x7        x8        x9
0      59   0  57   0  60  1  101514   71  5.823572  0.000590
1       2   2   2   2   2  1     222   36  0.199040  0.001986
2      19   0  11   0  19  1   18915   40  2.325911  0.000998
3       1   0   1   0   1  1  764195    2  0.092985  0.000001
4       0   0   0   0   0  1    4435    1  0.000000  0.000000
...    ..  ..  ..  ..  .. ..     ...  ...       ...       ...
39997   0   0   0   0   0  0      39    1  0.000000  0.000000
39998   0   0   0   0   0  0      48    1  0.000000  0.000000
39999  14   0   9   0  15  0     584  155  1.566421  0.020380
40000   0   0   0   0   0  0       1    3  0.000000  0.000000
40001   0   0   0   0   0  0     507    3  0.000000  0.000000

[40002 rows x 10 columns]


In [6]:
trainData = shuffle(trainData).reset_index(drop=True)
print(trainData)

       x1  x2  x3  x4  x5  y      x6  x7        x8        x9
0       0   0   0   0   0  1  280325   4  0.000000  0.000000
1       5   0   4   0   5  1   25437  10  0.447018  0.000196
2       1   0   1   0   1  0     466   7  0.077271  0.002020
3      18   0  18   0  18  1   14222  68  1.760742  0.001261
4       0   0   0   0   0  0      73   1  0.000000  0.000000
...    ..  ..  ..  ..  .. ..     ...  ..       ...       ...
39997   6   0   6   0   6  1    4819   9  0.579288  0.001236
39998  41   0  45   0  45  1  105865  52  4.256149  0.000424
39999   0   0   0   0   0  0      13   1  0.000000  0.000000
40000   0   0   0   0   0  0      32   5  0.000000  0.000000
40001   1   0   1   0   1  1     675  11  0.087795  0.001325

[40002 rows x 10 columns]


### Calculate Measure

In [7]:
def calculate_AUC (X_train, X_test, y_train, y_test, clf) :
    try :
        predict_prob = clf.predict_proba(X_test)[:,1:]
        y_pred_train = clf.predict(X_train)[:,1:]
        y_pred_test = clf.predict(X_test)[:,1:]
    except:
        predict_prob = clf.predict(X_test)
        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)
    
    auc = metrics.roc_auc_score(y_test, predict_prob)
    
    w_sklearn = np.r_[clf.intercept_, clf.coef_.squeeze()]
    print("Weights: {}".format(w_sklearn))
    print("AUC: {}".format(auc))
    print('Train MSE:', mean_squared_error(y_pred_train, y_train))
    print('Test MSE:', mean_squared_error(y_pred_test, y_test))
    return

In [8]:
def mean_squared_error(y_true, y_pred):
    return np.mean((y_pred - y_true)**2) 

### Prepare Models

In [9]:
y = trainData['y']
print(y)

0        1
1        1
2        0
3        1
4        0
        ..
39997    1
39998    1
39999    0
40000    0
40001    1
Name: y, Length: 40002, dtype: int64


In [36]:
from sklearn.model_selection import train_test_split
X_all = trainData[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9']]
X_common_JC = trainData[['x1', 'x2', 'x3', 'x4', 'x9']]
X_common_JC_ad = trainData[['x1', 'x2', 'x3', 'x4', 'x8', 'x9']]
X = X_common_JC_ad

In [37]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90051)
print("Training set has {} instances. Test set has {} instances.".format(X_train.shape[0], X_test.shape[0]))

print('\n------Logistic Regression------')
model = LogisticRegression()
model.fit(X_train, y_train)
calculate_AUC(X_train, X_test, y_train, y_test, model)

print('\n-------Linear Regression-------')
lr = LinearRegression()
lr.fit(X_train, y_train)
calculate_AUC(X_train, X_test, y_train, y_test, lr)

print('\n--------Ridge Regression-------')
rr = linear_model.Ridge(alpha=.5)
rr.fit(X_train, y_train)
calculate_AUC(X_train, X_test, y_train, y_test, rr)

Training set has 32001 instances. Test set has 8001 instances.

------Logistic Regression------
Weights: [-1.20806007  0.15162216 -1.22706059 -0.35506215  0.47977662  8.69113204
 -2.67614635]
AUC: 0.839353641750404
Train MSE: 0.16368238492547107
Test MSE: 0.15898012748406448

-------Linear Regression-------
Weights: [ 4.09527485e-01  5.16261494e-03 -3.68178358e-04  8.60933821e-04
 -2.02471314e-03  9.67694627e-04  5.35256790e+00]
AUC: 0.8611200163381232
Train MSE: 0.20647876239615098
Test MSE: 0.2054560150337808

--------Ridge Regression-------
Weights: [ 4.10655933e-01  5.06191802e-03 -3.55041637e-04  1.00669837e-03
 -1.96452606e-03  9.23210920e-04  4.81527417e+00]
AUC: 0.8617462568987387
Train MSE: 0.20651918640032868
Test MSE: 0.20550256059636154


### Load Testing Data

In [38]:
testData = pd.read_csv('Total_Features/Test_Total_Feature.csv', sep=',')
testData = testData[['x1', 'x2', 'x3', 'x4', 'x8', 'x9']]
print(testData)
print(testData.shape)

      x1  x2  x3  x4        x8        x9
0      0   0   0   0  0.000000  0.000000
1      0   0   0   0  0.000000  0.000000
2      2   0   2   0  0.459422  0.011152
3      2   0   2   0  0.178376  0.003670
4      0   0   0   0  0.000000  0.000000
...   ..  ..  ..  ..       ...       ...
1995   0   0   0   0  0.000000  0.000000
1996   0   0   1   0  0.100383  0.006061
1997   0   0   0   0  0.000000  0.000000
1998   0   0   0   0  0.000000  0.000000
1999   0   0   0   0  0.000000  0.000000

[2000 rows x 6 columns]
(2000, 6)


### New Prediction

In [39]:
model = LogisticRegression()
model.fit(X, y)

w_sklearn = np.r_[model.intercept_, model.coef_.squeeze()]
print("Weights: {}".format(w_sklearn))

Weights: [-1.22032636  0.12081511 -1.31380812 -0.38714932  0.50128681  9.43641152
 -3.20296236]


In [40]:
X_new = testData
y_new = model.predict_proba(X_new)[:,1:]

output = pd.DataFrame({'Id': testData.index+1, 'Predicted': y_new.flatten()})
output.head

<bound method NDFrame.head of         Id  Predicted
0        1   0.227879
1        2   0.227879
2        3   0.927343
3        4   0.479632
4        5   0.227879
...    ...        ...
1995  1996   0.227879
1996  1997   0.336343
1997  1998   0.227879
1998  1999   0.227879
1999  2000   0.227879

[2000 rows x 2 columns]>

In [41]:
output.to_csv('output_v6-5-logreg.csv', sep=",", index=False, header=True)