In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Path of the temporal edge list loaded by the "Read data" cell, which parses
# every line as space-separated integers and labels the columns u, v, t.
filename = "email-Eu-core-temporal-Dept3.txt"

In [3]:
import pyspark
import itertools
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, RowMatrix
import csv
import numpy as np
import pandas as pd

# Read data

In [4]:

conf = SparkConf().setMaster("local").setAppName("My App")
# getOrCreate returns the already-running context when this cell is executed
# again; calling the SparkContext constructor a second time in the same
# kernel fails because only one context may be active.
sc = SparkContext.getOrCreate(conf=conf)

# create a spark session
spark = SparkSession.builder.appName('link-prediction').getOrCreate()

# read data as a list of integer rows; newline="" is what the csv module
# expects for files it parses, and blank lines (parsed as empty rows) are
# skipped so they cannot produce zero-length records
with open(filename, newline="") as csvfile:
    data = [
        list(map(int, row))
        for row in csv.reader(csvfile, delimiter=' ')
        if row
    ]

# make a spark dataframe with one row per (u, v, t) record
columns = ["u", "v", "t"]
df = spark.createDataFrame(data=data, schema=columns)

df.toPandas()

Unnamed: 0,u,v,t
0,11,39,0
1,48,54,2635
2,69,60,4249
3,69,60,13362
4,33,84,16943
...,...,...,...
12211,26,46,45301515
12212,46,26,45301553
12213,16,73,45304269
12214,30,60,45305184


# Select based on timeframe

In [5]:
# Chronological split on the t column:
#   train: t < 20_000_000
#   test : 20_000_000 <= t < 22_000_000
train_df = df.where(df["t"] < 20_000_000)
test_df = df.where((df["t"] >= 20_000_000) & (df["t"] < 22_000_000))

train_df.toPandas(), test_df.toPandas()

(       u   v         t
 0     11  39         0
 1     48  54      2635
 2     69  60      4249
 3     69  60     13362
 4     33  84     16943
 ...   ..  ..       ...
 5152  23  88  19987782
 5153  87  56  19988698
 5154  43  67  19989289
 5155  16  15  19989592
 5156  64  85  19993413
 
 [5157 rows x 3 columns],
       u   v         t
 0    54  49  20045846
 1    54  63  20045902
 2    67  43  20049149
 3    87  56  20051568
 4     2  24  20051831
 ..   ..  ..       ...
 653  60  16  21899410
 654  60  67  21899410
 655  60  54  21899410
 656  60  52  21899410
 657  60  15  21899410
 
 [658 rows x 3 columns])

In [6]:

# Collapse the training records into one row per distinct (u, v) pair;
# "count" is the number of training records for that pair.
edges = train_df.select("u", "v").groupBy("u", "v").count()

# Largest multiplicity of any (u, v) pair in the training window.
maxCount = edges.agg(F.max("count").alias("maxCount")).first()["maxCount"]

edges.sort("u", "v").toPandas()

Unnamed: 0,u,v,count
0,0,1,8
1,0,39,22
2,0,64,6
3,0,70,9
4,0,71,50
...,...,...,...
970,88,77,1
971,88,78,1
972,88,79,1
973,88,84,2


In [7]:
# Node set: ids that occur both in the u column of the training records and
# in the v column of the aggregated edges (set intersection, duplicates removed).
# NOTE(review): ids that appear in only one of the two columns are excluded,
# and so are their edges in everything built from `nodes` -- confirm this is
# intended rather than a union.
nodes = (
    train_df.select(F.col("u").alias("node"))
    .intersect(edges.select(F.col("v").alias("node")))
    .orderBy("node")
)
nodes.toPandas()

Unnamed: 0,node
0,0
1,1
2,2
3,3
4,4
...,...
63,84
64,85
65,86
66,87


# Shortest Length

In [8]:
# Id of the node from which path lengths are to be measured.
node = 0
# Initial distance table: 0 for `node` itself and the sentinel 10000000
# (standing in for "not reached yet") for every other node.
shortest_length = nodes\
  .withColumn("shortest_length", F.when(nodes["node"] != node, F.lit(10000000)).otherwise(0))\
  .orderBy("node")

# NOTE(review): this is not the last expression of the cell, so a fresh run
# presumably does not display this table -- confirm against a re-executed notebook.
shortest_length.toPandas()

# TODO: the relaxation step is not implemented. The loop body assigns the
# table to itself, so after the loop the distances are exactly the initial
# ones; the only work done is the count() that sizes the range. The trailing
# comment sketches the intended step (join, group by, aggregate with min).
for i in range(shortest_length.count()):
  shortest_length = shortest_length # join(-).groupBy(-).aggregate(min)
  


Unnamed: 0,node,shortest_length
0,0,0
1,1,10000000
2,2,10000000
3,3,10000000
4,4,10000000
...,...,...
63,84,10000000
64,85,10000000
65,86,10000000
66,87,10000000


# Katz

In [9]:
# Every ordered (source, target) pair of nodes. crossJoin states the cartesian
# product explicitly; a join() without a condition is an implicit cartesian
# product, which Spark 2.x rejects unless spark.sql.crossJoin.enabled is set.
# No ordering is applied here because the join below would discard it anyway.
_katz_adj = nodes.withColumnRenamed("node", "_u")\
  .crossJoin(nodes.withColumnRenamed("node", "_v"))

# Attenuation factor, chosen as 1 / (2 * largest edge multiplicity) so that
# every scaled edge weight is at most 0.5.
beta = 1 / (2 * maxCount)

# Edge weights scaled by beta: value = count * beta.
katz_edges = edges\
  .withColumn("value", edges["count"] * beta)\
  .drop("count")

# Left-join the scaled edges onto the full grid; pairs without an edge have a
# null value, which is replaced by 0 (restricted to the "value" column so the
# node-id columns can never be touched).
katz_adj = _katz_adj\
  .join(
    katz_edges,
    (_katz_adj._u == katz_edges.u)
      & (_katz_adj._v == katz_edges.v),
    'left'
  )\
  .select("_u", "_v", "value")\
  .fillna(0, subset=["value"])\
  .withColumnRenamed("_u", "u")\
  .withColumnRenamed("_v", "v")

katz_adj.orderBy("u", "v").toPandas()

Unnamed: 0,u,v,value
0,0,0,0.000000
1,0,1,0.027586
2,0,2,0.000000
3,0,3,0.000000
4,0,4,0.000000
...,...,...,...
4619,88,84,0.006897
4620,88,85,0.000000
4621,88,86,0.003448
4622,88,87,0.000000


In [10]:
# One row per source node u holding its adjacency values ordered by target v.
#
# * collect_list gives no ordering guarantee after the groupBy shuffle, so a
#   preceding orderBy("v") is not sufficient. Instead (v, value) structs are
#   collected, sorted with sort_array (structs compare field by field, i.e. by
#   v first) and the "value" field is then projected out of the sorted array.
# * monotonically_increasing_id is only guaranteed to be increasing, not
#   consecutive, once the data spans several partitions. The matrix code needs
#   row indices 0..n-1, so row_number() over u (1-based) minus 1 is used.
#   The un-partitioned window pulls all rows into one partition, which is
#   acceptable here because there is only one row per node.
_katz_A = katz_adj\
  .groupby("u")\
  .agg(F.sort_array(F.collect_list(F.struct("v", "value"))).alias("cells"))\
  .select(
    "u",
    F.col("cells.value").alias("row"),
    (F.row_number().over(pyspark.sql.Window.orderBy("u")) - 1).cast("long").alias("id"),
  )\
  .orderBy("u")
_katz_A.toPandas()

Unnamed: 0,u,row,id
0,0,"[0.0, 0.027586206896551724, 0.0, 0.0, 0.0, 0.0...",0
1,1,"[0.0, 0.0, 0.0, 0.0, 0.010344827586206896, 0.0...",1
2,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
3,3,"[0.0, 0.0, 0.0, 0.0, 0.0034482758620689655, 0....",3
4,4,"[0.0, 0.02413793103448276, 0.0, 0.010344827586...",4
...,...,...,...
63,84,"[0.0, 0.0, 0.0, 0.0, 0.010344827586206896, 0.0...",63
64,85,"[0.0, 0.020689655172413793, 0.0, 0.0, 0.010344...",64
65,86,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",65
66,87,"[0.0034482758620689655, 0.017241379310344827, ...",66


In [11]:
def _square_block_matrix(indexed_rows, size):
  """Wrap an RDD of IndexedRow objects as a BlockMatrix using size x size blocks."""
  return IndexedRowMatrix(indexed_rows).toBlockMatrix(size, size)


def _katz_indexed_rows():
  """RDD with one IndexedRow(id, row) per record of _katz_A."""
  return _katz_A.select("id", "row").rdd.map(lambda rec: IndexedRow(rec[0], rec[1]))


# Number of rows of _katz_A; used as the side length of every matrix below.
matrix_n_size = _katz_A.count()

# katz_A is the scaled adjacency matrix; katz_matrix starts out as a second,
# independently built copy of it.
katz_A = _square_block_matrix(_katz_indexed_rows(), matrix_n_size)
katz_matrix = _square_block_matrix(_katz_indexed_rows(), matrix_n_size)

# All-zero matrix of the same size.
zero_rows = [IndexedRow(idx, [0] * matrix_n_size) for idx in range(matrix_n_size)]
katz_scores = _square_block_matrix(sc.parallelize(zero_rows), matrix_n_size)

katz_A.toLocalMatrix().toArray()

array([[0.        , 0.02758621, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00344828],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00344828],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00344828, 0.01724138, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00344828, ..., 0.00344828, 0.        ,
        0.        ]])

In [12]:

# Truncated Katz index: katz_scores = sum over l = 1..KATZ_MAX_PATH_LENGTH of
# katz_A ** l, where katz_A already carries the beta factor.
#
# The sum is rebuilt from katz_A on every execution instead of updating
# matrices left over from an earlier cell, so re-running this cell always
# gives the same result. Terms are added in the same order as a running
# "scores += power; power = power * A" loop, minus the final unused product.
KATZ_MAX_PATH_LENGTH = 100

katz_power = katz_A   # current term, katz_A ** l
katz_scores = katz_A  # partial sum, starting with the l = 1 term
for _ in range(KATZ_MAX_PATH_LENGTH - 1):
  katz_power = katz_power.multiply(katz_A)
  katz_scores = katz_scores.add(katz_power)
print(katz_scores.toLocalMatrix().toArray())


[[7.55804774e-02 3.20094377e-02 8.95584413e-05 ... 3.97581505e-04
  1.53237801e-03 4.02834115e-03]
 [1.47909683e-04 5.22799876e-03 9.30293449e-05 ... 1.38605864e-04
  1.65050379e-03 4.92616496e-03]
 [4.87606217e-05 1.72954490e-04 3.92607146e-04 ... 1.32452548e-04
  2.30843481e-04 3.94269285e-03]
 ...
 [2.20485315e-06 5.53457270e-06 2.02473168e-06 ... 3.69681066e-05
  2.67865281e-06 1.23782037e-04]
 [4.30308313e-03 2.80614397e-02 2.32491454e-04 ... 4.35892947e-04
  5.34433119e-03 1.86947149e-03]
 [6.82489500e-04 2.42837789e-03 4.25789422e-03 ... 4.58416186e-03
  9.16239006e-04 1.58253797e-02]]


# Hitting Time

In [None]:
# Toy check of the truncated Katz sum on a 5-node directed multigraph.
# The edge list and damping factor use "toy_" names so that this cell does not
# overwrite the `edges` DataFrame and the `beta` value defined by earlier cells.
toy_edges = [[0, 1], [0, 1], [2, 3], [2, 4], [1, 2], [4, 1]]

# Adjacency matrix built from the edge list: entry [i][j] is the number of
# parallel edges i -> j (so [0][1] is 2 and every other listed edge is 1).
toy_n = 1 + max(max(edge) for edge in toy_edges)
A = np.zeros((toy_n, toy_n))
for src, dst in toy_edges:
    A[src, dst] += 1

toy_beta = 0.05
A = A * toy_beta          # one-step matrix, already scaled by beta
X = A.copy()              # current power of A, starting at A ** 1
scores = np.zeros(A.shape)

for i in range(10):
    scores += X           # add the term for paths of length i + 1
    X = X.dot(A)
    print(scores)

# Why repeated multiplication counts paths:
#
# Let DP^(t)[i][j] be the number of paths from i to j that use exactly t edges.
# A path of t + 1 edges is a path of t edges to some k followed by one edge
# k -> j, hence
#     DP^(t+1)[i][j] = sum_(k=0)^(n-1) DP^(t)[i][k] * DP^(1)[k][j]
# which is the matrix product DP^(t+1) = DP^(t).dot(DP^(1)).
#
# The score after t steps is
#     score[i][j] = sum_(l=1)^(t) beta^l * |paths_i_j^l|
# and is accumulated as
#     for i in range(n):
#         DP^(i+1) = DP^(i).dot(DP^(1))
#         score += beta^(i+1) * DP^(i+1)
# The loop above folds beta into A up front, so each power of A already
# carries the matching power of beta.

In [13]:
# transition function


# SimRank

Matrix multiplication: https://en.wikipedia.org/wiki/SimRank#:~:text=It%20is%20important%20to%20note,notion%20of%20similarity%20on%20relationships.