## Reading Data from Disk and Calculate Collab Distance

In [1]:
from pyspark.sql import Row
from pyspark.sql.types import *
import queue

### Read from Disk

In [2]:
author_df = spark.read.parquet("Data/authors-catstat.ML+OR+catstat.AP+OR+catstat.CO+OR+catstat.ME+OR+catstat.OT+OR+catstat.TH-total49904.parquet")
collab_df = spark.read.parquet("Data/collab-catstat.ML+OR+catstat.AP+OR+catstat.CO+OR+catstat.ME+OR+catstat.OT+OR+catstat.TH-total49904.parquet")


In [3]:
# id = 2 and id = 90
# author1 = 2
# author2 = 3338
# depth_max = 10

def collab_dist(author1, author2, depth_max = 3):
    
    # BFS
    fifo = queue.Queue()
    fifo.put(author1)

    # To track depth to stop search at max depth
    depth = queue.Queue()
    depth.put(0)

    # To find depth and the path backwards
    parents = {author1 : -1}
    i = 0
    while not fifo.empty():
        a = fifo.get() ; d = depth.get()
#         print("AuthorID:", a)
        if a == author2:
            break

        if d > depth_max:
            print("Max depth of %i is reached." % d)
            break

        # "src" in collab item is equal to author1, look for the authors in "dest"
        df_dest = collab_df.filter(collab_df.src == a).select(collab_df.columns[1])
        for i in [int(row.dest) for row in df_dest.collect()]:
#             print("Next author: %i" % i)
            if i not in parents: #if already visited, don't add the queue
                fifo.put(i); depth.put(d + 1)
                parents[i] = a

        # "dest" in collab item is equal to author1, look for the authors in "src"
        df_src = collab_df.filter(collab_df.dest == a).select(collab_df.columns[0])
        for i in [int(row.src) for row in df_src.collect()]:
#             print("Next author: %i" % i)
            if i not in parents:
                fifo.put(i); depth.put(d + 1)
                parents[i] = a
        i = i + 1
        if i == 1000: break
    # Calculate the depth.
    dist = 0
    ancestry = [a]
    while parents[a] > 0:
        dist = dist + 1
        a = parents[a]
        ancestry.append(a)
    return (dist, d, ancestry, parents)
    

# print("Parents: %s" % parents)
# print("Dist:", dist)

In [7]:
dist, d, ancestry, parents = collab_dist(8742,352,3)

Max depth of 4 is reached.


In [43]:
dist
ancestry

[352, 1260, 323]

In [None]:
Alex 6896
Ale 352
Joel 15622
Peter 1690
Daren Wang 6212
Kayvan 350
Addison 23894
Martin 323
Peter - Larry - Ale - Kayvan

In [5]:
r = author_df.where(author_df.name == "Geoffrey Hinton").select("id")
r.rdd.map(lambda x: x.id).first()

33526

In [39]:
author_df.filter(author_df.id == 1260).collect()

[Row(id=1260, name='Aaditya Ramdas')]

In [67]:
author_df.filter(author_df.name == "").collect()

[Row(id=6896, name='Alex Reinhart')]

In [36]:
collab_dest = collab_df.filter(collab_df.dest == 2).select(collab_df.columns[0]).collect()
collab_dest[1].src
[int(row.mvv) for row in mvv_list.collect()]

3338

In [6]:
collab_df.filter((collab_df.src == 33526) | (collab_df.dest == 33526)).collect()

[Row(src=33525, dest=33526, arxiv='1206.4635v1', title='Deep Mixtures of Factor Analysers'),
 Row(src=9619, dest=33526, arxiv='1206.4635v1', title='Deep Mixtures of Factor Analysers'),
 Row(src=44985, dest=33526, arxiv='1902.01889v1', title='Analyzing and Improving Representations with the Soft Nearest Neighbor\n  Loss'),
 Row(src=39191, dest=33526, arxiv='1902.01889v1', title='Analyzing and Improving Representations with the Soft Nearest Neighbor\n  Loss'),
 Row(src=33525, dest=33526, arxiv='1206.6445v1', title='Deep Lambertian Networks'),
 Row(src=9619, dest=33526, arxiv='1206.6445v1', title='Deep Lambertian Networks'),
 Row(src=33313, dest=33526, arxiv='1412.7449v3', title='Grammar as a Foreign Language'),
 Row(src=34682, dest=33526, arxiv='1412.7449v3', title='Grammar as a Foreign Language'),
 Row(src=48591, dest=33526, arxiv='1412.7449v3', title='Grammar as a Foreign Language'),
 Row(src=48592, dest=33526, arxiv='1412.7449v3', title='Grammar as a Foreign Language'),
 Row(src=33652

In [42]:
df_dest = collab_df.filter(collab_df.src == a).select(collab_df.columns[1])
[int(row.src) for row in df_dest.collect()]

[]

In [49]:
collab_df.filter(collab_df.src == 1557).collect()

[Row(src=1557, dest=1555, arxiv='1406.7536v1', title='Estimating the distribution of Galaxy Morphologies on a continuous space')]

In [8]:
import arxiv

In [12]:
distance, ancestry = arxiv.dist("Peter Freeman", "Alessandro Rinaldo", depth_max = 3)
print(distance) # 2
print(ancestry) # [352, 1557, 1690], where Peter is 1690, Larry 1557 and Ale 352.

2
[352, 1557, 1690]
