In [1]:
import multiprocessing

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.graph_objs as go
import stellargraph as sg
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from stellargraph.data import BiasedRandomWalk
from stellargraph.data import EdgeSplitter

ModuleNotFoundError: No module named 'stellargraph'

## Konstansok

In [None]:
# Path of the edge-list file that is read into the graph further down.
# Candidate ego-network ids (presumably the ones experimented with): ['0', '107']
edge_path = 'facebook/0.edges' # 0.edges

## Gráf Beolvasása

In [None]:
def is_directed_graph(graph):
    """Report whether ``graph`` looks directed, judged by adjacency-matrix symmetry.

    The adjacency matrix is built with ``nx.to_numpy_array`` and compared with
    its transpose. A message describing the outcome is printed and the verdict
    is also returned so callers can branch on it.

    Parameters
    ----------
    graph : networkx graph
        Graph to inspect.

    Returns
    -------
    bool
        ``True`` when the adjacency matrix is not symmetric, ``False`` when it
        is symmetric (i.e. the graph behaves like an undirected one).
    """
    # The result is used directly as an array; the extra np.array copy was redundant.
    adjacency = nx.to_numpy_array(graph)

    # Check symmetry; in theory the matrix should be symmetric here.
    is_symmetric = np.allclose(adjacency, adjacency.T)

    if is_symmetric:
        print("The matrix is symmetric, not directed")
    else:
        print("The matrix is not symmetric.")

    return not is_symmetric

def print_graph_info(G):
    """Print the summary text returned by ``G.info()``."""
    print(G.info())

In [None]:
# Read the edge list into an undirected NetworkX graph with integer node ids.
G = nx.read_edgelist(edge_path, nodetype=int, create_using=nx.Graph())
# Give every edge a unit weight.
for src, dst in G.edges():
    G[src][dst]['weight'] = 1

# is_directed_graph(G)
# Convert to a StellarGraph, because it offers useful helper functions.
G = sg.StellarGraph.from_networkx(G)

In [None]:
# Print the summary of the full graph G.
print_graph_info(G)

In [None]:
# Splitter over the full graph G; it produces the test split in the next step.
edge_splitter_test = EdgeSplitter(G)

In [None]:
# The edge splitter takes edges out of the graph; these become examples_test.
# It also samples negative edges, which are concatenated onto examples_test.
# labels_test says whether each example edge exists or not.
# graph_test is the graph with the remaining edges.
# A fixed seed keeps the split reproducible across "Restart & Run All".
graph_test, examples_test, labels_test = edge_splitter_test.train_test_split(
    p=0.1, method="global", seed=42
)

In [None]:
# Print the summary of graph_test, the graph returned alongside the test examples.
print_graph_info(graph_test)

In [None]:
# Second split: sample from graph_test, with the full graph G passed as master
# graph (presumably so negative samples are checked against G — confirm in the
# EdgeSplitter documentation). Fixed seed for reproducibility.
edge_splitter_train = EdgeSplitter(g=graph_test, g_master=G)
graph_train, examples, labels = edge_splitter_train.train_test_split(
    p=0.1, method="global", seed=42
)

# 75% of the sampled examples train the classifier, 25% are held out for model
# selection; random_state pins the shuffle so reruns give the same partition.
(
    examples_train,
    examples_model_selection,
    labels_train,
    labels_model_selection,
) = train_test_split(
    examples, labels, train_size=0.75, test_size=0.25, random_state=42
)

In [None]:
# Print the summary of graph_train, the graph returned by the second split.
print_graph_info(graph_train)

In [None]:
# Overview table of the three example sets and the graphs they relate to.
split_columns = ("Split", "Number of Examples", "Hidden from", "Picked from", "Use")
split_rows = [
    (
        "Training Set",
        len(examples_train),
        "Train Graph",
        "Test Graph",
        "Train the Link Classifier",
    ),
    (
        "Model Selection",
        len(examples_model_selection),
        "Train Graph",
        "Test Graph",
        "Select the best Link Classifier model",
    ),
    (
        "Test set",
        len(examples_test),
        "Test Graph",
        "Full Graph",
        "Evaluate the best Link Classifier",
    ),
]
split_summary = pd.DataFrame(split_rows, columns=split_columns)
split_summary.set_index("Split")

In [None]:
# Hyper-parameters read by node2vec_embedding (module-level on purpose).
p = 1.0  # `p` argument of BiasedRandomWalk.run
q = 1.0  # `q` argument of BiasedRandomWalk.run
dimensions = 128  # Word2Vec `vector_size`
num_walks = 60  # `n` argument of BiasedRandomWalk.run
walk_length = 60  # `length` argument of BiasedRandomWalk.run
window_size = 16  # Word2Vec `window`
num_iter = 5  # Word2Vec `epochs`
workers = multiprocessing.cpu_count()  # Word2Vec `workers`: one per reported CPU

In [None]:
def node2vec_embedding(graph, name, seed=None):
    """Train node2vec-style embeddings for ``graph`` and return a lookup function.

    Biased random walks are started from every node of ``graph`` and used as
    sentences for a skip-gram (``sg=1``) Word2Vec model. The hyper-parameters
    ``num_walks``, ``walk_length``, ``p``, ``q``, ``dimensions``,
    ``window_size``, ``num_iter`` and ``workers`` are read from module level.

    Parameters
    ----------
    graph : graph accepted by ``BiasedRandomWalk``
        Graph to embed.
    name : str
        Label used only in the progress message.
    seed : int, optional
        When given, it is forwarded to the random walker and to Word2Vec so
        that runs can be repeated. With the default ``None`` the behaviour is
        unchanged. NOTE(review): gensim documents that fully deterministic
        training additionally needs a single worker — confirm if exact
        reproducibility is required.

    Returns
    -------
    callable
        ``get_embedding(u)`` which returns ``model.wv[u]`` for a node ``u``.
    """
    rw = BiasedRandomWalk(graph)
    walks = rw.run(
        graph.nodes(), n=num_walks, length=walk_length, p=p, q=q, seed=seed
    )
    print(f"Number of random walks for '{name}': {len(walks)}")

    word2vec_options = dict(
        vector_size=dimensions,
        window=window_size,
        min_count=0,  # keep every node, even rarely visited ones
        sg=1,
        workers=workers,
        epochs=num_iter,
    )
    # Only override Word2Vec's own default seed when the caller asked for one.
    if seed is not None:
        word2vec_options["seed"] = seed

    model = Word2Vec(walks, **word2vec_options)

    def get_embedding(u):
        return model.wv[u]

    return get_embedding

In [None]:
# Embedding lookup learned on graph_train; used for training and model selection.
embedding_train = node2vec_embedding(graph_train, "Train Graph")

In [None]:
# 1. link embeddings
def link_examples_to_features(link_examples, transform_node, binary_operator):
    """Build one feature value per link example.

    For every ``(src, dst)`` pair both endpoints are mapped through
    ``transform_node`` and the two results are combined with
    ``binary_operator``. Returns the results as a list, in input order.
    """
    features = []
    for src, dst in link_examples:
        src_vector = transform_node(src)
        dst_vector = transform_node(dst)
        features.append(binary_operator(src_vector, dst_vector))
    return features

# 2. training classifier
def train_link_prediction_model(
        link_examples, link_labels, get_embedding, binary_operator
):
    """Fit a link classifier on features derived from node embeddings.

    Each link example is turned into a feature vector by combining the
    embeddings of its start and end node with ``binary_operator``; the pipeline
    from ``link_prediction_classifier`` is then fitted on those features.
    Returns the fitted classifier.
    """
    # Combine the start- and end-node embeddings of every example.
    features = link_examples_to_features(
        link_examples, get_embedding, binary_operator
    )

    # This could be swapped for a different algorithm.
    classifier = link_prediction_classifier()
    # Fit the model on the link features.
    classifier.fit(features, link_labels)
    return classifier


def link_prediction_classifier(max_iter=4000):
    """Return an unfitted pipeline: standard scaling, then cross-validated logistic regression.

    ``max_iter`` is forwarded to ``LogisticRegressionCV``; the regression uses
    ``Cs=10``, ``cv=10`` and ROC AUC as the selection score.
    """
    scaler = StandardScaler()
    logistic_cv = LogisticRegressionCV(
        Cs=10, cv=10, scoring="roc_auc", max_iter=max_iter
    )
    return Pipeline(steps=[("sc", scaler), ("clf", logistic_cv)])


# 3. and 4. evaluate classifier
def evaluate_link_prediction_model(
        clf, link_examples_test, link_labels_test, get_embedding, binary_operator
):
    """Return the ROC AUC of ``clf`` on the given link examples.

    Features are built the same way as for training (embeddings combined with
    ``binary_operator``) and scored with ``evaluate_roc_auc``.
    """
    return evaluate_roc_auc(
        clf,
        link_examples_to_features(link_examples_test, get_embedding, binary_operator),
        link_labels_test,
    )


def evaluate_roc_auc(clf, link_features, link_labels):
    """Return the ROC AUC of ``clf`` using its probability for the positive class (label 1)."""
    class_probabilities = clf.predict_proba(link_features)

    # Find the probability column that belongs to label 1 instead of assuming its position.
    class_labels = list(clf.classes_)
    positive_scores = class_probabilities[:, class_labels.index(1)]
    return roc_auc_score(link_labels, positive_scores)

In [None]:
def operator_hadamard(u, v):
    """Combine two embeddings by multiplying them (``u * v``)."""
    product = u * v
    return product


def operator_l1(u, v):
    """Combine two embeddings by the absolute value of their difference."""
    difference = u - v
    return np.abs(difference)


def operator_l2(u, v):
    """Combine two embeddings by the square of their difference."""
    difference = u - v
    return difference ** 2


def operator_avg(u, v):
    """Combine two embeddings by averaging them."""
    total = u + v
    return total / 2.0


def run_link_prediction(binary_operator):
    """Train and score a link classifier for one embedding-combination operator.

    The classifier is fitted on the training examples and scored (ROC AUC) on
    the model-selection examples, both using the train-graph embeddings that
    live at module level.

    Returns a dict with the keys ``"classifier"``, ``"binary_operator"`` and
    ``"score"``.
    """
    classifier = train_link_prediction_model(
        examples_train, labels_train, embedding_train, binary_operator
    )
    selection_score = evaluate_link_prediction_model(
        classifier,
        examples_model_selection,
        labels_model_selection,
        embedding_train,
        binary_operator,
    )

    outcome = {
        "classifier": classifier,
        "binary_operator": binary_operator,
        "score": selection_score,
    }
    return outcome

# Candidate operators for combining two node embeddings into link features.
binary_operators = [operator_hadamard, operator_l1, operator_l2, operator_avg]

In [None]:
# Train and score one classifier per operator, then keep the highest scorer.
results = list(map(run_link_prediction, binary_operators))
best_result = max(results, key=lambda result: result["score"])

print(f"Best result from '{best_result['binary_operator'].__name__}'")

# Score table, one row per operator.
score_rows = [
    (result["binary_operator"].__name__, result["score"]) for result in results
]
pd.DataFrame(score_rows, columns=("name", "ROC AUC score")).set_index("name")

In [None]:
# Embedding lookup learned on graph_test; used for the final test evaluation.
embedding_test = node2vec_embedding(graph_test, "Test Graph")

In [None]:
# Score the selected classifier on the held-out test examples, with features
# built from the test-graph embeddings and the winning operator.
test_score = evaluate_link_prediction_model(
    best_result["classifier"],
    examples_test,
    labels_test,
    embedding_test,
    best_result["binary_operator"],
)
print(
    f"ROC AUC score on test set using '{best_result['binary_operator'].__name__}': {test_score}"
)

In [None]:
# PCA is imported in the notebook's import cell together with all other imports.

# Calculate edge features for test data
link_features = link_examples_to_features(
    examples_test, embedding_test, best_result["binary_operator"]
)

# Learn a projection of the link features down to 2 components
pca = PCA(n_components=2)
X_transformed = pca.fit_transform(link_features)

# Plot the 2-dimensional points: blue where the label is 1, red otherwise
fig, ax = plt.subplots(figsize=(16, 12))
ax.scatter(
    X_transformed[:, 0],
    X_transformed[:, 1],
    c=np.where(labels_test == 1, "b", "r"),
    alpha=0.5,
)
# A figure should be readable on its own when the notebook is skimmed.
ax.set(
    title="PCA projection of test link features (blue: label 1, red: other)",
    xlabel="Principal component 1",
    ylabel="Principal component 2",
)
plt.show()

In [None]:
user = 2 #322

# Pair the chosen user with every other node of the graph.
possible_connections = [(user, node) for node in G.nodes() if node != user]

In [None]:
# Build the edge lookup once. Calling G.edges() inside the loop rebuilt and
# scanned the whole edge list twice per candidate pair.
# Assumes G.edges() yields hashable (source, target) tuples — confirm against
# the StellarGraph documentation.
known_edges = set(G.edges())

# 1 when the pair is an edge of G in either orientation, otherwise 0.
existing_connections = {
    (start, end): int((start, end) in known_edges or (end, start) in known_edges)
    for (start, end) in possible_connections
}

In [None]:
best_binary_operator = best_result['binary_operator']
model = best_result['classifier']

# Same feature construction as for training: combine the two train-graph
# embeddings of every candidate pair with the selected operator.
emb_distance = link_examples_to_features(
    possible_connections, embedding_train, best_binary_operator
)

In [None]:
# Hard class predictions and per-class probabilities for every candidate pair.
preds = model.predict(emb_distance)
pred_prob = model.predict_proba(emb_distance)

In [None]:
# Create a DataFrame from the existing_connections dictionary:
# one row per candidate pair with its ground truth and the classifier outputs.
df = pd.DataFrame(list(existing_connections.items()), columns=['Connection', 'Real'])
df['Pred'] = preds
# Look up the probability column of the positive class (label 1) instead of
# assuming it is column 1, consistent with evaluate_roc_auc.
positive_column = list(model.classes_).index(1)
df['Prob'] = pred_prob[:, positive_column]

In [None]:
def calculate_accuracy(real, pred):
    """Return ``accuracy_score(real, pred)`` for ground-truth and predicted labels."""
    return accuracy_score(real, pred)


# Accuracy over this user's candidate pairs; the trailing ';' suppresses the cell output.
calculate_accuracy(df['Real'], df['Pred']);

In [None]:
def plot_roc_curve(predictions_df):
    """Plot the ROC curve of the predictions and show its AUC in the legend.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Must contain a 'Real' column with the ground-truth labels. The scores
        are taken from the 'Prob' column when present; frames that only carry
        hard predictions in 'Pred' are still accepted.
    """
    ground_truth_values = predictions_df['Real'].tolist()

    # A ROC curve sweeps a threshold over continuous scores. Feeding hard 0/1
    # predictions collapses it to a single operating point, so prefer 'Prob'.
    score_column = 'Prob' if 'Prob' in predictions_df.columns else 'Pred'
    scores = predictions_df[score_column].tolist()

    # Calculate the ROC curve
    fpr, tpr, _ = roc_curve(ground_truth_values, scores)

    # Calculate the AUC (Area Under the Curve)
    roc_auc = roc_auc_score(ground_truth_values, scores)

    # Plot the ROC curve together with the chance diagonal
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')
    plt.show()

# Example usage: ROC curve for the per-user predictions in df.
plot_roc_curve(df)

In [None]:
# Positive (label == 1) examples from the second split. Note that `examples` is
# the set before it was divided into training and model-selection parts, so
# both parts are included here.
was_in_train = examples[labels == 1]
# Positive (label == 1) examples of the test set.
was_in_test = examples_test[labels_test == 1]

In [None]:
# Normalise both edge collections to lists of (int, int) tuples.
was_in_train = [(int(src), int(dst)) for src, dst in was_in_train]
was_in_test = [(int(src), int(dst)) for src, dst in was_in_test]

In [None]:
def was_in_test_function(row, list):
    """Return 1 when the row's connection appears in ``list``, else 0.

    Parameters
    ----------
    row : mapping with a 'Connection' entry
        ``row['Connection']`` must unpack into ``(src, dest)``.
    list : iterable of 2-element edges
        Edges to look the connection up in; either orientation counts as a
        match. (The parameter name shadows the builtin and is kept only for
        compatibility with existing callers.)

    Returns
    -------
    int
        1 if ``(src, dest)`` or ``(dest, src)`` is among the edges, otherwise 0.
    """
    src, dest = row['Connection']

    # Compare tuples with tuples: a list such as [src, dest] never equals a
    # tuple, so the forward orientation could not match tuple edges before.
    known_edges = {tuple(edge) for edge in list}
    if (src, dest) in known_edges or (dest, src) in known_edges:
        return 1
    return 0

# Flag each candidate pair that was a positive example of the second split / the test split.
df['IncludedInTrain'] = df.apply(was_in_test_function, axis=1, args=(was_in_train,))
df['IncludedInTest'] = df.apply(was_in_test_function, axis=1, args=(was_in_test,))

In [None]:
# Accuracy on pairs flagged as positive examples of the second (train) split
in_train = df[df['IncludedInTrain'] == 1]
train_acc = calculate_accuracy(in_train['Real'], in_train['Pred'])

# Accuracy on pairs flagged as positive examples of the test split
in_test = df[df['IncludedInTest'] == 1]
test_acc = calculate_accuracy(in_test['Real'], in_test['Pred'])

# Accuracy on pairs that carry neither flag
in_neither = df[~((df['IncludedInTrain'] == 1) | (df['IncludedInTest'] == 1))]
not_in_any_acc = calculate_accuracy(in_neither['Real'], in_neither['Pred'])

# Accuracy over every candidate pair
accuracy = calculate_accuracy(df['Real'], df['Pred'])

print(f'Train Accuracy: {train_acc:.2%}')
print(f'Test Accuracy: {test_acc:.2%}')
print(f'Not in any Accuracy: {not_in_any_acc:.2%}')
# This line used to be labelled "Test Accuracy" a second time.
print(f'Overall Accuracy: {accuracy:.2%}')

In [None]:
# Candidate pairs ordered from most to least likely link.
df.sort_values(by='Prob', ascending=False)

In [None]:
# Ten most probable pairs that are predicted as links but are not edges of G.
is_new_link = (df['Real'] == 0) & (df['Pred'] == 1)
result = df[is_new_link].sort_values('Prob', ascending = False).head(10)

In [None]:
# Each connection is a (user, node) pair, so element 1 is the suggested friend.
result['recommended_friend'] = result['Connection'].apply(lambda x: x[1])

In [None]:
# Recommended connections as a plain list of pairs.
new_recomendations = list(result['Connection'])

In [None]:
def visualize_graph(graph, recommended_edges=None):
    """Draw ``graph`` with Plotly, optionally highlighting recommended edges in green.

    Parameters
    ----------
    graph : anything accepted by ``nx.Graph(...)``
        Graph data to draw.
    recommended_edges : sequence of (node, node) pairs, optional
        Extra edges drawn as thick green lines. Both endpoints must be nodes
        of ``graph`` because their positions are taken from its layout.
    """
    nx_graph = nx.Graph(graph)

    # Node positions from the spring layout
    pos = nx.spring_layout(nx_graph)

    # Collect coordinates in plain lists first; extending a trace's tuple
    # point by point copies the whole tuple on every step.
    edge_x, edge_y = [], []
    for src, dst in nx_graph.edges():
        x0, y0 = pos[src]
        x1, y1 = pos[dst]
        # The trailing None breaks the line between consecutive edges.
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

    node_x, node_y, node_text = [], [], []
    for node in nx_graph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        # hoverinfo='text' needs a label per node; it was left empty before.
        node_text.append(str(node))

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        text=node_text,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            size=10,
        )
    )

    traces = [edge_trace, node_trace]

    # Add recommended edges in green color
    if recommended_edges:
        recommended_x = []
        recommended_y = []
        recommended_text = []

        for edge in recommended_edges:
            x0, y0 = pos[edge[0]]
            x1, y1 = pos[edge[1]]
            recommended_x += [x0, x1, None]
            recommended_y += [y0, y1, None]
            # One text entry per plotted point (two endpoints plus the
            # separator), so the hover label matches the edge under the cursor.
            label = f"Edge: {edge}, Color: Green"
            recommended_text += [label, label, ""]

        traces.append(
            go.Scatter(
                x=recommended_x,
                y=recommended_y,
                text=recommended_text,
                line=dict(width=2, color='green'),
                hoverinfo='text',
                mode='lines'
            )
        )

    fig = go.Figure(data=traces,
                    layout=go.Layout(
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=0, l=0, r=0, t=0)
                    )
                    )

    fig.show()


# NetworkX version of the StellarGraph G, intended as input for visualize_graph.
G_nx = G.to_networkx()

# Example usage:
# visualize_graph(G_nx, recommended_edges=new_recomendations)

## Recommend For All

In [None]:
# Every ordered pair of distinct nodes. A comprehension keeps the loop
# variables local, so the `user` chosen in the single-user section is no
# longer overwritten with the last node of the graph.
all_nodes = G.nodes()
possible_connections = [
    (source, target)
    for source in all_nodes
    for target in all_nodes
    if target != source
]

In [None]:
# Build the edge lookup once. Calling G.edges() inside the loop rebuilt and
# scanned the whole edge list twice for each of the ~N^2 candidate pairs.
# Assumes G.edges() yields hashable (source, target) tuples — confirm against
# the StellarGraph documentation.
known_edges = set(G.edges())

# 1 when the pair is an edge of G in either orientation, otherwise 0.
existing_connections = {
    (start, end): int((start, end) in known_edges or (end, start) in known_edges)
    for (start, end) in possible_connections
}

In [None]:
best_binary_operator = best_result['binary_operator']
model = best_result['classifier']

# Same feature construction as for training: combine the two train-graph
# embeddings of every candidate pair with the selected operator.
emb_distance = link_examples_to_features(
    possible_connections, embedding_train, best_binary_operator
)

In [None]:
# Hard class predictions and per-class probabilities for every candidate pair.
preds = model.predict(emb_distance)
pred_prob = model.predict_proba(emb_distance)

# Create a DataFrame from the existing_connections dictionary
df = pd.DataFrame(list(existing_connections.items()), columns=['Connection', 'Real'])
df['Pred'] = preds
# Look up the probability column of the positive class (label 1) instead of
# assuming it is column 1, consistent with evaluate_roc_auc.
positive_column = list(model.classes_).index(1)
df['Prob'] = pred_prob[:, positive_column]

In [None]:
# Accuracy of the classifier over all ordered node pairs.
accuracy = calculate_accuracy(df['Real'], df['Pred'])

In [None]:
# Display the accuracy value.
accuracy

In [None]:
# Histogram of the predicted positive-class probabilities.
df['Prob'].hist()

In [None]:
# Class balance of the ground truth (1 = pair is an edge of G, 0 = it is not).
df['Real'].value_counts()

In [None]:
# Class balance of the classifier's predictions.
df['Pred'].value_counts()

In [None]:
# ROC curve for the predictions over all ordered node pairs.
plot_roc_curve(df)

In [None]:
# Ids of the ego networks whose edge files (facebook/<id>.edges) are summarised below.
ids = ['0', '107', '348', '414', '686', '698']

def get_path(id):
    """Return the relative path of the edge-list file for ego network ``id``."""
    return f'facebook/{id}.edges'

# Print node and edge counts for every ego network. Loop-local names are used
# on purpose: the loop previously overwrote the notebook-wide `G` and
# `edge_path` (leaving the last graph behind) and shadowed the builtin `id`.
for ego_id in ids:
    ego_nx_graph = nx.read_edgelist(
        get_path(ego_id), nodetype=int, create_using=nx.Graph()
    )
    ego_graph = sg.StellarGraph.from_networkx(ego_nx_graph)
    print(ego_id)
    print(f'Nodes: {len(ego_graph.nodes())}')
    print(f'Edges: {len(ego_graph.edges())}')
    print('+++++++++++++++++++++')
