# Devin's Notebook

In [97]:
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from random import choice, sample, shuffle

### 1. Load Training Data

In [20]:
with open("train.txt") as file :
    all_lines = file.read().splitlines()

followers_numb = Counter()
followers_list = defaultdict(list)
following_numb = Counter()
following_list = defaultdict(list)

all_nodes = []
source_nodes = []

for line in tqdm(all_lines) :
    nodes = line.split("\t")
    source, sinks = nodes[0], nodes[1:]
    sinks = list(map(int, sinks))
    source = int(source)
    while source in sinks: sinks.remove(source)
        
    following_list[source] = sinks
    following_numb[source] = len(sinks)
    
    for sink in sinks :
        followers_list[sink].append(source)
        followers_numb[sink] = len(followers_list[sink])
    
    all_nodes.append(source)
    all_nodes.extend(sinks)
    
all_nodes = list(set(all_nodes))

100%|██████████| 20000/20000 [00:27<00:00, 728.90it/s] 


Stats about `train.txt`

In [22]:
i = 0
null = 0

for node in following_numb :
    if following_numb[node] == 0 :
        null += 1
    else :
        i += following_numb[node]

print("Total lines in train.txt: ", len(all_lines))
print("Total edges in train.txt: ", i)
print("Total distinct nodes", len(all_nodes))
print("Size of Source nodes: ", len(edges))
print("Size of null entries: ", null)

Total lines in train.txt:  20000
Total edges in train.txt:  24004344
Total distinct nodes 4867136
Size of Source nodes:  20000
Size of null entries:  430


### 3. Loading Training Data

#### 3.1 Positive Data

In [23]:
pos_subPair = pd.read_csv('pos_subPair.csv', sep=",")
print(pos_subPair.head())
print(pos_subPair.shape)
pos_train_source_list = list(pos_subPair['source'])
pos_train_sink_list = list(pos_subPair['sink'])

print(len(pos_train_source_list), len(pos_train_sink_list))

   Unnamed: 0   source     sink
0           0   765601  2006214
1           1  1750842  4144037
2           2  1973082   229375
3           3  3361377  1542427
4           4  4230144  1246469
(20001, 3)
20001 20001


#### 3.2 Negative Data

In [24]:
neg_subPair = pd.read_csv('neg_subPair.csv', sep=",")
neg_subPair = neg_subPair.rename(columns = {'0': 'source', '1': 'sink'}, inplace = False)
print(neg_subPair.head())
print(neg_subPair.shape)
neg_train_source_list = list(neg_subPair['source'])
neg_train_sink_list = list(neg_subPair['sink'])

print(len(neg_train_source_list), len(neg_train_sink_list))

   Unnamed: 0   source     sink
0           0   493062     9330
1           1  4618437  2311748
2           2  4391648  1548304
3           3  1628298  1893139
4           4  1807730  4719681
(20001, 3)
20001 20001


### 4. Feature Engineering

#### 4.1 Common Friends

- **x1** - source → c → sink
- **x2** - source → c ← sink
- **x3** - source ← c → sink
- **x4** - source ← c ← sink
- **x5** - Jaccard's coeffcient

In [79]:
def gen_common_friends (source, sink, following_list, followers_list) :
    
    source_following, source_followers, sink_following, sink_followers = [], [], [], []
    x1, x2, x3, x4, x5 = 0, 0, 0, 0, 0
    
    if source in following_list :
        source_following = following_list[source]
        
    if source in followers_list :
        source_followers = followers_list[source]
    
    if sink in following_list :
        sink_following = following_list[sink]
        
    if sink in followers_list :
        sink_followers = followers_list[sink]
    
    if source_following != [] and sink_followers != [] :
        x1 = len(set(source_following).intersection(sink_followers))
        
    if source_following != [] and sink_following != [] :
        x2 = len(set(source_following).intersection(sink_following))
        
    if source_followers != [] and sink_followers != [] :
        x3 = len(set(source_followers).intersection(sink_followers))
    
    if source_followers != [] and sink_following != [] :
        x4 = len(set(source_followers).intersection(sink_following))
        
    related_source = set(source_following).union(source_followers)
    related_sink = set(sink_following).union(sink_followers)
    common_nodes = list(set(related_source).intersection(related_sink))
    related_both = len(set(related_source).union(related_sink))
    if related_both != 0:
        x5 = len(common_nodes)/related_both
        
    return x1, x2, x3, x4, x5

In [173]:
test = gen_common_friends(765601, 2006214, following_list, followers_list)
print(test)
test = gen_common_friends(4391648, 1548304, following_list, followers_list)
print(test)

(59, 0, 57, 0, 0.0005899705014749262)
(0, 0, 0, 0, 0.0)


#### 4.1 Node stats

- **x6** - Number following (Source).
- **x7** - Number following (Sink).
- **x8** - Number of followers (Source).
- **x9** - Number of followers (Sink).

In [81]:
def gen_node_stats (source, sink, following_numb, followers_numb) :
    x6, x7, x8, x9 = 0, 0, 0, 0
    x6 = following_numb[source]
    x7 = following_numb[sink]
    x8 = followers_numb[source]
    x9 = followers_numb[sink]
    
    return x6, x7, x8, x9

#### 4.3 Generating Features

In [85]:
pos_train_features = []

for i in tqdm(range(len(pos_train_source_list))) :
    source = pos_train_source_list[i]
    sink = pos_train_sink_list[i]
    
    x1, x2, x3, x4, x5 = gen_common_friends(source, sink, following_list, followers_list)
    #feat2 = gen_node_stats(source, sink, following_numb, followers_numb)
    
    pos_train_features.append([x1, x2, x3, x4, x5, 1])
    
print(len(pos_train_features))

100%|██████████| 20001/20001 [15:02<00:00, 22.15it/s]

20001





In [86]:
neg_train_features = []

for i in tqdm(range(len(neg_train_source_list))) :
    source = neg_train_source_list[i]
    sink = neg_train_sink_list[i]
    
    x1, x2, x3, x4, x5 = gen_common_friends(source, sink, following_list, followers_list)
    #feat2 = gen_node_stats(source, sink, following_numb, followers_numb)
    
    neg_train_features.append([x1, x2, x3, x4, x5, 0])
    
print(len(neg_train_features))

100%|██████████| 20001/20001 [00:09<00:00, 2160.30it/s]

20001





In [93]:
print(len(pos_train_features), pos_train_features[0:10])
print(len(neg_train_features), neg_train_features[0:10])

20001 [[59, 0, 57, 0, 0.0005899705014749262, 1], [2, 2, 2, 2, 0.0019860973187686196, 1], [19, 0, 11, 0, 0.0009982661692849262, 1], [1, 0, 1, 0, 1.3112037112309842e-06, 1], [0, 0, 0, 0, 0.0, 1], [37, 0, 36, 0, 0.0009086561453849832, 1], [0, 0, 0, 0, 0.0, 1], [0, 0, 0, 0, 0.0, 1], [13, 0, 13, 0, 0.0016337815759708432, 1], [41, 0, 40, 0, 0.00043891107233468577, 1]]
20001 [[1, 0, 1, 0, 0.00036603221083455345, 0], [0, 0, 0, 0, 0.0, 0], [0, 0, 0, 0, 0.0, 0], [0, 0, 0, 0, 0.0, 0], [0, 0, 0, 0, 0.0, 0], [1, 0, 0, 0, 0.008, 0], [0, 0, 0, 0, 0.0, 0], [0, 0, 0, 0, 0.0, 0], [0, 0, 0, 0, 0.0, 0], [0, 0, 0, 0, 0.0, 0]]


In [171]:
train_features = pos_train_features.copy()
train_features.extend(neg_train_features)
len(train_features)

40002

In [174]:
train_features_df = pd.DataFrame(train_features, columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'y'])
print(train_features_df)
print(train_features_df.shape)

X = train_features_df[['x1', 'x2', 'x3', 'x4', 'x5']]
y = train_features_df['y']

train_features_df.to_csv('features-common-friends-with-JC.csv', sep=",", index=True, index_label = 'Id', header=True)

       x1  x2  x3  x4        x5  y
0      59   0  57   0  0.000590  1
1       2   2   2   2  0.001986  1
2      19   0  11   0  0.000998  1
3       1   0   1   0  0.000001  1
4       0   0   0   0  0.000000  1
...    ..  ..  ..  ..       ... ..
39997   0   0   0   0  0.000000  0
39998   0   0   0   0  0.000000  0
39999  14   0   9   0  0.020380  0
40000   0   0   0   0  0.000000  0
40001   0   0   0   0  0.000000  0

[40002 rows x 6 columns]
(40002, 6)


### 5. Training Model

In [175]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90051)
print("Training set has {} instances. Test set has {} instances.".format(X_train.shape[0], X_test.shape[0]))

Training set has 32001 instances. Test set has 8001 instances.


In [176]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(penalty='none')
model.fit(X, y)
w_sklearn = np.r_[model.intercept_, model.coef_.squeeze()]
print("Weights: {}".format(w_sklearn))

Weights: [ -1.19244575   0.51252288   0.14435657   0.29479233  -0.03321564
 -24.83857673]


### 6. Loading Test Data

In [177]:
testData = pd.read_csv('test-public.txt', sep="\t")
testData_ids = list(testData['Id'])
test_source_list = list(testData['Source'])
test_sink_list = list(testData['Sink'])
print(testData.head())
print(testData.shape)

   Id   Source     Sink
0   1  3563811  3600160
1   2  2052043  1401960
2   3  4517994  1690636
3   4  1660006  4349447
4   5   581111  1882617
(2000, 3)


In [180]:
test_features = []

for i in tqdm(range(len(test_source_list))) :
    source = test_source_list[i]
    sink = test_sink_list[i]
    
    x1, x2, x3, x4, x5 = gen_common_friends(source, sink, following_list, followers_list)
    #feat2 = gen_node_stats(source, sink, following_numb, followers_numb)
    
    test_features.append([x1, x2, x3, x4, x5])
    
print(len(test_features))

test_features = pd.DataFrame(test_features, columns = ['x1', 'x2', 'x3', 'x4', 'x5'])
print(test_features)
print(test_features.shape)

100%|██████████| 2000/2000 [00:02<00:00, 842.38it/s] 

2000
      x1  x2  x3  x4        x5
0      0   0   0   0  0.000000
1      0   0   0   0  0.000000
2      2   0   2   0  0.011152
3      2   0   2   0  0.003670
4      0   0   0   0  0.000000
...   ..  ..  ..  ..       ...
1995   0   0   0   0  0.000000
1996   0   0   1   0  0.006061
1997   0   0   0   0  0.000000
1998   0   0   0   0  0.000000
1999   0   0   0   0  0.000000

[2000 rows x 5 columns]
(2000, 5)





### 7. New Prediction

In [181]:
X_new = test_features
y_new = model.predict_proba(X_new)
prob_link = [prob[1] for prob in y_new]

In [182]:
output = pd.DataFrame({'Id': testData_ids, 'Predicted': prob_link})
output.head

<bound method NDFrame.head of         Id  Predicted
0        1   0.232822
1        2   0.232822
2        3   0.536230
3        4   0.582011
4        5   0.232822
...    ...        ...
1995  1996   0.232822
1996  1997   0.259573
1997  1998   0.232822
1998  1999   0.232822
1999  2000   0.232822

[2000 rows x 2 columns]>

In [165]:
output.to_csv('output_v3-common-friends-noJC.csv', sep=",", index=False, header=True)