### Data Prep

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("lib/datasets/twitch_gamers/large_twitch_edges.csv")
df

Unnamed: 0,numeric_id_1,numeric_id_2
0,98343,141493
1,98343,58736
2,98343,140703
3,98343,151401
4,98343,157118
...,...,...
6797552,97507,29359
6797553,71175,12020
6797554,151702,128281
6797555,118034,38021


## Random walk with Restart.
When the amount of nodes gets to 150 we randomly select a new node and then repeat

In [3]:
import random
def sub_graph(df, query, unique):
    nodes = []
    nodes.append(query)
    # random walk w/ restart
    groups1 = df.groupby("numeric_id_1")
    groups2 = df.groupby("numeric_id_2")
    end = True
    while len(nodes) < 5000:
        while len(nodes) < 150:
            end = True
            curr = query
            while curr in nodes and end:
                selected = list(groups1.get_group(curr)["numeric_id_2"]) + list(groups2.get_group(curr)["numeric_id_1"])
                ind = random.choice(selected)
                if ind not in df["numeric_id_1"].unique():
                    end = False
                elif ind not in df["numeric_id_2"].unique():
                    end = False
                curr = ind
            if ind not in nodes:
                nodes.append(ind)
        query =  np.random.randint(0,123518)
        while query not in unique:
            query =  np.random.randint(0,123518)
        nodes.append(query)

    return nodes

In [4]:
sub = sub_graph(df, 1, df["numeric_id_1"].unique())

In [5]:
id1 = list(df.numeric_id_1)
id2 = list(df.numeric_id_2)
new1 = []
new2 = []
for i in range(len(id1)):
    if i %1000000 == 0:
        print(i)

    if id1[i] in sub:
        if id2[i] in sub:
            new1.append(id1[i])
            new2.append(id2[i])

0
1000000
2000000
3000000
4000000
5000000
6000000


In [6]:
P = np.zeros((len(new1),len(new1)))
unique = list(set(new1 + new2))
for i in range(len(new2)):
    P[unique.index(new1[i])][unique.index(new2[i])] = 1/new1.count(new1[i])
P

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
for i in range(len(P)):
    if np.array_equal(P[i],np.zeros(len(new1))):
        P[i] = np.full(len(new1),1/len(new1))

### PPR

In [11]:
c = .85
n = len(new2)
query = 1
q = np.zeros((n,1))
q[query - 1][0] = 1
r = np.ones((n,1)) / n
rback = np.zeros((n,1))
while np.sum(np.absolute(r - rback)) > 10**-6:
    rback = r
    r = c * np.dot(P.T,r) + (1-c) * q

### SALSA

In [12]:
hubs = (-r.reshape((n,))).argsort()[:200]
ha = df[df["numeric_id_1"].isin(hubs)]

In [13]:
bi = pd.DataFrame()
bi["Hubs"] = ha["numeric_id_1"]
bi["Auth"] = ha["numeric_id_2"]
bi.to_csv("lib/datasets/bi.csv",index=False)

In [14]:
bi = pd.read_csv("lib/datasets/bi.csv")
bi

Unnamed: 0,Hubs,Auth
0,3215,115407
1,3215,48461
2,3215,45752
3,3215,42925
4,3215,78399
...,...,...
8439,2636,78700
8440,1468,93841
8441,608,137902
8442,3631,155080


In [15]:
order = np.unique(bi)
Lr = np.zeros((len(order),len(order)))
bi_g = bi.groupby("Hubs")
for i in range(len(order)):
    if order[i] in bi["Hubs"].tolist():
        vals = bi_g.get_group(order[i])["Auth"]
        Lr[i][np.where(np.isin(order, vals))] = 1/len(vals)

In [16]:
Lc = np.zeros((len(order),len(order)))
bi_g = bi.groupby("Auth")
for i in range(len(order)):
    if order[i] in bi["Auth"].tolist():
        vals = bi_g.get_group(order[i])["Hubs"]
        Lc[i][np.where(np.isin(order, vals))] = 1/len(vals)
Lc = Lc.T

In [17]:
rc = Lr.dot(Lc.T)
rc_u = order[~np.all(rc == 0, axis=1)]
H = rc[~np.all(rc == 0, axis=1)]
H = H[:,~np.all(H == 0, axis=0)]

cr = Lc.T.dot(Lr)
cr_u = order[~np.all(cr == 0, axis=1)]
A = cr[~np.all(cr == 0, axis=1)]
A = A[:,~np.all(A == 0, axis=0)]
print(H.shape,A.shape)

(145, 145) (8011, 8011)


In [18]:
tmp = np.array([[0,1,1,0],
                [0,1,1,0]])
tmp[0,np.all(tmp == 0, axis=0)] = 1
tmp

array([[1, 1, 1, 1],
       [0, 1, 1, 0]])

### Check connected
check if something appears once on right and then if what it's connected to on the left appears once then disconnected

In [19]:
class DFS:

    # init function to declare class variables
    def __init__(self, V):
        self.V = V
        self.visited = np.zeros(V.shape[0])
        self.vis = np.zeros(V.shape)

    def test(self):
        curr = 1
        for i in range(len(self.V)):
            if self.visited[i] == 0:
                unique = np.unique(self.visited[~np.all(self.V[i] == 0, axis=0)])

                if unique[0] == 0:
                    self.visited[~np.all(self.V[i] == 0, axis=0)] = curr
                    curr += 1
                else:
                    self.visited[~np.all(self.V[i] == 0, axis=0)] = int(unique[1])

    def extract(self):
        self.test()
        cc = []
        print(self.visited)
        for i in range(int(self.visited.max())):
            cc.append(np.where(self.visited == i+1))
        return cc

In [20]:
g = DFS(A)
A_cc = g.extract()

[1. 1. 1. ... 1. 1. 1.]


(8011, 8011)


array([   0,    1,    2, ..., 8008, 8009, 8010], dtype=int64)

In [23]:
from scipy.linalg import eig
S, U = eig(A.T)
stationary = np.array(U[:, np.where(np.abs(S - 1.) < 1e-8)[0][0]].flat)
stationary = stationary / np.sum(stationary)
stationary

array([0.00025132+0.j, 0.00012566+0.j, 0.00012566+0.j, ...,
       0.00012566+0.j, 0.00012566+0.j, 0.00012566+0.j])

In [26]:
(stationary).max()

(0.0005026395234617779+0j)

In [38]:
idx = (-stationary).argsort()[:5]
tmp = cr_u[idx]
tmp

array([104231,  83176, 104384, 159889,   6729], dtype=int64)

In [37]:
cr_u

array([     2,      8,     44, ..., 168079, 168090, 168103], dtype=int64)

In [4]:
P = np.zeros((10000,10000))
count = 0
for index, values in reduced_df.iterrows():
    P[values["numeric_id_1"]-1][np.array(values["numeric_id_2"])-1] = 1/len(values["numeric_id_2"])
    print(P[values["numeric_id_1"]-1].sum())

Unnamed: 0,Hubs,Auth
0,9008,113885
1,9008,5240
2,9008,47049
3,9008,163802
4,9008,142846
...,...,...
7970,3778,88523
7971,2562,89426
7972,2562,68970
7973,6467,86519
