In [1]:
import networkx as nx
import pandas as pd


# Load CSV files
edges_df = pd.read_csv('amzn_directed_graph.csv')
products_df = pd.read_csv('products.csv')

# Dropping NAs in the data
products_df = products_df.dropna()


# Create a directed graph
G = nx.DiGraph()

# Add nodes from the products dataframe
for idx, row in products_df.iterrows():
    G.add_node(row['Id'], 
               ASIN=row['ASIN'],
               title=row['title'],
               group=row['group'],
               salesrank=row['salesrank'],
               similar=row['similar'],
               categories=row['categories'],
               total_reviews=row['total_reviews'],
               avg_rating=row['avg_rating'])

# Add edges from the edges dataframe
for idx, row in edges_df.iterrows():
    G.add_edge(row['FromNodeId'], row['ToNodeId'])

In [7]:
reference_nodes = list(G.nodes())[:2]  # Using the first 5 nodes as reference
shortest_path_features = {}

for node in G.nodes():
    shortest_path_features[node] = []
    for ref_node in reference_nodes:
        try:
            shortest_path_features[node].append(nx.shortest_path_length(G, node, ref_node))
        except nx.NetworkXNoPath:
            shortest_path_features[node].append(float('inf'))


In [8]:
shortest_path_features

{1: [0, 1],
 2: [2, 0],
 4: [inf, inf],
 5: [inf, inf],
 6: [inf, inf],
 7: [inf, inf],
 8: [inf, inf],
 10: [inf, inf],
 11: [inf, inf],
 12: [inf, inf],
 15: [inf, inf],
 16: [inf, inf],
 17: [inf, inf],
 18: [inf, inf],
 19: [inf, inf],
 20: [inf, inf],
 21: [inf, inf],
 22: [inf, inf],
 25: [inf, inf],
 26: [inf, inf],
 27: [inf, inf],
 28: [inf, inf],
 29: [inf, inf],
 30: [inf, inf],
 31: [inf, inf],
 32: [inf, inf],
 33: [inf, inf],
 34: [inf, inf],
 35: [inf, inf],
 37: [inf, inf],
 38: [inf, inf],
 39: [inf, inf],
 42: [inf, inf],
 43: [inf, inf],
 44: [inf, inf],
 45: [inf, inf],
 46: [inf, inf],
 47: [inf, inf],
 48: [inf, inf],
 50: [inf, inf],
 51: [inf, inf],
 52: [inf, inf],
 54: [inf, inf],
 59: [inf, inf],
 60: [inf, inf],
 62: [inf, inf],
 63: [inf, inf],
 64: [inf, inf],
 65: [inf, inf],
 66: [inf, inf],
 68: [inf, inf],
 69: [inf, inf],
 71: [inf, inf],
 72: [inf, inf],
 73: [inf, inf],
 74: [inf, inf],
 75: [inf, inf],
 77: [inf, inf],
 78: [inf, inf],
 79: [inf, i

In [10]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5
  Using cached networkx-2.8.8-py3-none-any.whl (2.0 MB)
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.1
    Uninstalling networkx-3.1:
      Successfully uninstalled networkx-3.1
Successfully installed networkx-2.8.8 node2vec-0.4.6


In [None]:
from node2vec import Node2Vec

node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)
model = node2vec.fit(window=10, min_count=1)

node2vec_features = {}
for node in G.nodes():
    node2vec_features[node] = model.wv[node]


Computing transition probabilities:   0%|          | 0/458167 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 50/50 [48:26<00:00, 58.12s/it]   
Generating walks (CPU: 3): 100%|██████████| 50/50 [55:41<00:00, 66.83s/it] 
Generating walks (CPU: 4): 100%|██████████| 50/50 [1:04:38<00:00, 77.57s/it]it]
Generating walks (CPU: 2): 100%|██████████| 50/50 [1:16:00<00:00, 91.20s/it]   


In [None]:
features_df = pd.DataFrame.from_dict(shortest_path_features, orient='index')
features_df['node2vec'] = features_df.index.map(node2vec_features)
features_df = features_df.join(products_df.set_index('Id'), on=features_df.index)


In [None]:
features_df = pd.DataFrame.from_dict(shortest_path_features, orient='index')
features_df['node2vec'] = features_df.index.map(node2vec_features)
features_df = features_df.join(products_df.set_index('Id'), on=features_df.index)


In [None]:
# Convert the node2vec features dictionary to a DataFrame.
features_df = pd.DataFrame.from_dict(node2vec_features, orient='index')

# Join features_df with the products_df
features_df = features_df.join(products_df.set_index('Id'), on=features_df.index)

# only use 'node2vec' column from features_df and salesrank from the original products metadata
X = list(features_df['node2vec'])
X = pd.DataFrame(X)
X['salesrank'] = features_df['salesrank']
