In [7]:
import pandas as pd
import numpy as np

# change this number if you only want to load part of the data (which is a lot faster)
nrows = 300000

Load data and calculate adjacency Matrix A, and a list of all projects/items

In [8]:
df = pd.read_csv('data/data.csv', names=['user', 'project'], nrows = nrows)
df['follows'] = 1
A = df.pivot_table(columns=['project'],index=['user'],values=['follows'],fill_value=0)
projects = df['project'].unique()


Calculate co-citation Matrix C and normalize it by dividing through root follower counts

In [9]:
# Co-citation matrix
C = A.T.dot(A)
follower_counts = np.array(df.groupby('project')['user'].count().reset_index()['user'])
sqrt_count = np.sqrt(follower_counts)
C_norm = C.div(sqrt_count, axis=1).div(sqrt_count, axis=0)
#C_norm = C.div(follower_counts, axis=0)

In [10]:
C_norm

Unnamed: 0_level_0,Unnamed: 1_level_0,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows,follows
Unnamed: 0_level_1,project,0x,0xbtc,0xcert,1337coin,42-coin,4new,808coin,abulaba,accelerator-network,acchain,...,zenswap-network-token,zero,zest,zetacoin,zilliqa,zippie,zoin,zoomba,zozocoin,zurcoin
Unnamed: 0_level_2,project,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
follows,0x,0.938071,0.044249,0.007310,0.031863,0.008516,0.000000,0.018396,0.0,0.026016,0.000000,...,0.000000,0.021242,0.005547,0.005547,0.074699,0.033795,0.0,0.0,0.000000,0.0
follows,0xbtc,0.044249,1.000000,0.000000,0.077152,0.000000,0.000000,0.089087,0.0,0.125988,0.000000,...,0.000000,0.051434,0.000000,0.000000,0.041513,0.054554,0.0,0.0,0.000000,0.0
follows,0xcert,0.007310,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
follows,1337coin,0.031863,0.077152,0.000000,1.000000,0.133631,0.000000,0.288675,0.0,0.408248,0.000000,...,0.000000,0.166667,0.000000,0.000000,0.019217,0.176777,0.0,0.0,0.000000,0.0
follows,42-coin,0.008516,0.000000,0.000000,0.133631,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.046524,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
follows,4new,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
follows,808coin,0.018396,0.089087,0.000000,0.288675,0.000000,0.000000,1.000000,0.0,0.235702,0.000000,...,0.000000,0.192450,0.000000,0.000000,0.022189,0.204124,0.0,0.0,0.000000,0.0
follows,abulaba,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
follows,accelerator-network,0.026016,0.125988,0.000000,0.408248,0.000000,0.000000,0.235702,0.0,1.000000,0.000000,...,0.000000,0.136083,0.000000,0.000000,0.015690,0.144338,0.0,0.0,0.000000,0.0
follows,acchain,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0


We look up nearest neighbours in C and calculate predictions by checking if the neighbours were liked by the user

In [11]:
def nearestNeighbours(project, n = 5, minSimilarity = 0.02):
    result = C_norm['follows'][project]['follows'].nlargest(n+1).nsmallest(n)
    result = result[result > minSimilarity]
    return result

def getPrediction(project, userFollows):
    neighbours = nearestNeighbours(project)
    if len(neighbours) < 1:
        return np.nan
    followedProjects = neighbours[neighbours.index.isin(userFollows)]
    return followedProjects.sum() / neighbours.sum()



Lets calculate some ratings

In [14]:
user_ratings = ['ethereum', '0x', 'raiden-network-token']

predictions = pd.DataFrame(projects)
predictions['rating'] = predictions.apply(lambda project: getPrediction(project[0], user_ratings), axis = 1)
print(predictions.nlargest(10, 'rating'))

                        0    rating
111                status  0.620788
106                 augur  0.456576
18           loom-network  0.431661
25          kyber-network  0.427968
104  golem-network-tokens  0.419249
28         enigma-project  0.419166
265                 melon  0.412715
37        request-network  0.409779
105             chainlink  0.409665
16                 bancor  0.404316


And some nearest neighbours

In [13]:
print(nearestNeighbours('eos'))

project
ethereum    0.075587
bitcoin     0.076655
siacoin     0.077896
nano        0.083554
cardano     0.085899
Name: iota, dtype: float64
