## 有偏随机游走

In [1]:
import networkx as nx
import random
random.seed(0)
import numpy as np
np.random.seed(0)

G=nx.erdos_renyi_graph(10, 0.3, seed=1, directed=False)

In [3]:
def next_node(previous, current, p, q):
    neighbors = list(G.neighbors(current))
    alphas =[]
    for neighbor in neighbors:
        if neighbor == previous:
            alpha = 1 / p
        elif G.has_edge(neighbor, previous):
            alpha = 1
        else:
            alpha = 1 / q
        alphas.append(alpha)

    probs = [alpha / sum(alphas) for alpha in alphas]
    next = np.random.choice(neighbors, size=1, p=probs)[0]
    return next

In [5]:
def random_walk(start, length, p, q):
    walk = [start]

    for i in range(length):
        current = walk[-1]
        previous = walk[-2] if len(walk) > 1 else None
        next = next_node(previous, current, p, q)
        walk.append(next)
    return [str(x) for x in walk]




In [6]:
random_walk(0, 8, p=1, q=1)

['0', '4', '7', '6', '4', '5', '4', '5', '6']

In [7]:
random_walk(0, 8, p=1, q=10)

['0', '9', '1', '9', '1', '9', '1', '0', '1']

In [8]:
random_walk(0, 8, p=10, q=1)

['0', '1', '9', '4', '7', '8', '7', '4', '6']

### 实现Node2Vec

In [10]:
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [12]:
G = nx.karate_club_graph()

labels = []
for node in G.nodes:
    label = G.nodes[node]['club']
    labels.append(1 if label == 'Officer' else 0)

In [13]:
walks = []
for node in G.nodes:
    for _ in range(80):
        walks.append(random_walk(node, 10, p=3, q=2))

In [15]:
node2vec = Word2Vec(walks,
                    hs=1,
                    sg=1,
                    vector_size=100,
                    window=10,
                    workers=2,
                    min_count=1,
                    seed=0)

In [16]:
node2vec.train(walks, total_examples=node2vec.corpus_count, epochs=30, report_delay=1)

(185807, 897600)

In [17]:
train_mask = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24]
train_mask_str = [str(x) for x in train_mask]
test_mask = [0, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33]
test_mask_str = [str(x) for x in test_mask]
labels = np.array(labels)

In [18]:
clf = RandomForestClassifier(random_state=0)
clf.fit(node2vec.wv[train_mask_str], labels[train_mask])
y_pred = clf.predict(node2vec.wv[test_mask_str])
acc = accuracy_score(labels[test_mask], y_pred)
print(acc)





0.9090909090909091


### 构建电影推荐系统

In [19]:
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile


In [20]:
url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
with urlopen(url) as zurl:
    with ZipFile(BytesIO(zurl.read())) as zfile:
        zfile.extractall('.')






In [26]:
import pandas as pd

ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [25]:
movies = pd.read_csv('ml-100k/u.item', sep='|', usecols=range(2), names=['movie_id', 'title'], encoding='latin-1')
print(movies)






      movie_id                                      title
0            1                           Toy Story (1995)
1            2                           GoldenEye (1995)
2            3                          Four Rooms (1995)
3            4                          Get Shorty (1995)
4            5                             Copycat (1995)
...        ...                                        ...
1677      1678                          Mat' i syn (1997)
1678      1679                           B. Monkey (1998)
1679      1680                       Sliding Doors (1998)
1680      1681                        You So Crazy (1994)
1681      1682  Scream of Stone (Schrei aus Stein) (1991)

[1682 rows x 2 columns]


In [27]:
ratings = ratings[ratings.rating >= 4]
ratings






Unnamed: 0,user_id,movie_id,rating,timestamp
5,298,474,4,884182806
7,253,465,5,891628467
11,286,1014,5,879781125
12,200,222,5,876042340
16,122,387,5,879270459
...,...,...,...,...
99988,421,498,4,892241344
99989,495,1091,4,888637503
99990,806,421,4,882388897
99991,676,538,4,892685437


In [33]:
from collections import defaultdict

pairs = defaultdict(int)

for group in ratings.groupby('user_id'):
    user_movies = list(group[1]["movie_id"])

for i in range(len(user_movies)):
    for j in range(i+1, len(user_movies)):
        pairs[(user_movies[i], user_movies[j])] += 1

G = nx.Graph()
for pair in pairs:
    moviel, movie2 = pair
    score = pairs[pair]
    if score >= 10:
        G.add_edge(moviel, movie2, weight=score)

In [34]:
!pip install node2vec



In [35]:
from node2vec import Node2Vec

  from .autonotebook import tqdm as notebook_tqdm


In [39]:
node2vec = Node2Vec(G, dimensions=64, walk_length=20, num_walks=200,p=2,q=1, workers=1)

Computing transition probabilities: 0it [00:00, ?it/s]
Generating walks (CPU: 1): 100%|██████████| 200/200 [00:00<00:00, 1064544.16it/s]


In [40]:
model = node2vec.fit(window=10, min_count=1, batch_words=4)

RuntimeError: you must first build vocabulary before training the model