In [1]:
# Download and decompress data into your Jupyter environment

import urllib.request
import io
import gzip
import shutil


for file in ['web-NotreDame.txt']:
    print ('Downloading compressed image of', file)
    # The with-block closes the HTTP response as soon as the compressed
    # payload has been read into memory.
    with urllib.request.urlopen("https://snap.stanford.edu/data/" + file + ".gz") as source:
        compressedFile = io.BytesIO(source.read())

    # Copy the decompressed bytes to disk chunk by chunk rather than building
    # one large bytes object first; both handles are closed by the with-block,
    # so no explicit close() call is needed.
    with gzip.GzipFile(fileobj=compressedFile) as decompressedFile, open(file, 'wb') as outfile:
        shutil.copyfileobj(decompressedFile, outfile)
    print ('Saved', file)


Downloading compressed image of web-NotreDame.txt
Saved web-NotreDame.txt


In [2]:
import pandas as pd
import numpy as np
import pyspark
import os
import pyspark
import os
# Clear PYSPARK_SUBMIT_ARGS if it is set. pop() with a default does nothing
# when the variable is absent (fresh kernel, or this cell was already run),
# whereas `del os.environ[...]` would raise KeyError in that case.
os.environ.pop('PYSPARK_SUBMIT_ARGS', None)
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, col, lit
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [3]:
# Read the tab-separated edge list, skipping the first three lines of the file.
web_NotreDame_df = pd.read_csv(
    'web-NotreDame.txt',
    sep='\t',
    skiprows=[0, 1, 2],
)

In [4]:
# Keep only the edges whose source and target node IDs are both below 10000,
# and mark every remaining edge with binary = 1 (used later as the pivot value).
both_endpoints_in_range = (
    (web_NotreDame_df['# FromNodeId'] < 10000)
    & (web_NotreDame_df['ToNodeId'] < 10000)
)
# A single mask followed by .assign() yields a new frame, so the extra column
# is never written into a slice of the unfiltered data (which is what triggers
# pandas' SettingWithCopyWarning).
web_NotreDame_df = web_NotreDame_df[both_endpoints_in_range].assign(binary=1)

In [5]:
# Preview the first rows only instead of rendering the whole edge list.
web_NotreDame_df.head(10)

Unnamed: 0,# FromNodeId,ToNodeId,binary
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
5,0,5,1
6,0,6,1
7,0,7,1
8,0,8,1
9,0,9,1


In [6]:
# Adjacency matrix: rows are '# FromNodeId', columns are 'ToNodeId', and each
# cell holds the aggregated 'binary' value, with 0 where no edge was recorded.
d = pd.pivot_table(
    web_NotreDame_df,
    values='binary',
    index='# FromNodeId',
    columns='ToNodeId',
).fillna(0)

# Make the matrix square: use the union of the row and column labels on both
# axes, filling the newly introduced rows/columns with 0.
index = d.index.union(d.columns)
d = d.reindex(index=index, columns=index, fill_value=0)

In [7]:
# Turn the adjacency frame into a normalised matrix for the PageRank iteration.
# DataFrame.as_matrix() is not available in current pandas; .values returns the
# same underlying ndarray.
M = d.values
# One total per row of the adjacency matrix (rows are the '# FromNodeId' axis).
row_sums = np.sum(M, axis=1)
# n[i] = 1 / row_sums[i]; entries whose row sum is 0 stay at 0, so no division
# by zero is performed and no RuntimeWarning is emitted.
n = np.divide(1.0, row_sums, out=np.zeros(len(row_sums)), where=row_sums != 0)
# Broadcasting multiplies column i of M.T (i.e. row i of the adjacency matrix)
# by n[i].
M = M.T*n

  from ipykernel import kernelapp as app


In [8]:
# Display the normalised matrix M built in the previous cell.
M

array([[ 0.05882353,  0.04      ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.05882353,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.05882353,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [9]:
def pagerank(M,alpha,iteration):
    """Run a fixed number of PageRank-style update steps.

    Starting from a vector of ones, every step applies
    ``PR = alpha * dot(M, PR) + (1 - alpha)``.

    Parameters
    ----------
    M : array-like, shape (n, n)
        Matrix multiplied with the current rank vector on each step.
    alpha : float
        Weight of the ``dot(M, PR)`` term; every entry additionally receives
        the constant ``1 - alpha``.
    iteration : int
        Number of update steps. With 0 (or a negative value) the initial
        vector of ones is returned unchanged.

    Returns
    -------
    numpy.ndarray
        Rank vector of length ``len(M)``.
    """
    size = len(M)
    # The constant term is identical on every step, so build it once instead of
    # allocating a fresh vector of ones inside the loop.
    constant_term = (1 - alpha) * np.ones(size)
    PR = np.ones(size)
    for _ in range(iteration):
        PR = alpha * np.dot(M, PR) + constant_term
    return PR

In [15]:
# Rank vector after 30 update steps with alpha = 0.85.
PR = pagerank(M, alpha=0.85, iteration=30)

In [16]:
# Label every score with the node ID of its row in the adjacency frame `d`.
# Without an explicit index the rows are numbered 0..n-1 by position, which
# equals the node ID only when no ID is missing from `d`.
PR_df = pd.DataFrame(PR, index=d.index, columns=['PageRank'])

In [17]:
# Ten highest-scoring rows. DataFrame.sort(columns=...) is not available in
# current pandas; sort_values(by=...) is its replacement. The .copy() makes the
# result an independent frame, so adding columns to it later does not write
# into a slice of the sorted frame.
PR2_df = PR_df.sort_values(by='PageRank', ascending=False)[0:10].copy()

  if __name__ == '__main__':


In [18]:
# Expose the row labels as an explicit 'ID' column and show the table.
# The earlier reset_index() and rename() calls threw their return values away
# (and no 'index' column exists to rename), so they had no effect on PR2_df;
# the frame displayed here is the same one they produced.
PR2_df = PR2_df.assign(ID=PR2_df.index)
PR2_df

Unnamed: 0,PageRank,ID
0,223.441081,0
1973,187.701609,1973
1790,51.239128,1790
1828,50.341321,1828
1,27.776906,1
238,26.104598,238
140,23.449762,140
14,22.062259,14
16,21.241687,16
15,18.05379,15


## Step 3.3

In [14]:
# Inspect the 20x20 sub-block of M covering rows 10-29 and columns 10-29.
M[10:30,10:30]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  

In [26]:
# Repeat the ranking with 15 update steps instead of 30.
PR = pagerank(M, 0.85, 15)
# Label every score with the node ID of its row in the adjacency frame `d`
# rather than with its position, so the index stays a valid node ID even when
# some IDs are missing from `d`.
PR_df = pd.DataFrame(PR, index=d.index, columns=['PageRank'])
# DataFrame.sort(columns=...) is not available in current pandas;
# sort_values(by=...) is its replacement. The .copy() detaches the top-10 slice
# so adding columns to it later does not write into a slice of the sorted frame.
PR2_df = PR_df.sort_values(by='PageRank', ascending=False)[0:10].copy()

  app.launch_new_instance()


In [27]:
# Expose the row labels as an explicit 'ID' column and show the table.
# The earlier reset_index() and rename() calls threw their return values away
# (and no 'index' column exists to rename), so they had no effect on PR2_df.
PR2_df = PR2_df.assign(ID=PR2_df.index)
PR2_df

Unnamed: 0,PageRank,ID
0,224.702638,0
1973,189.250314,1973
1790,53.438593,1790
1828,50.954873,1828
1,27.975911,1
238,26.779136,238
140,23.520898,140
14,22.232264,14
16,21.591054,16
162,18.283386,162
