In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Build an undirected similarity graph with one node per mathematical subject
# and one weighted edge per subject pair whose TF-IDF similarity exceeds
# `threshold`, then draw it with the edge weights as labels.
# Reads `df` and `tfidf_similarity` from earlier cells.
G = nx.Graph()

# Add nodes (mathematical subjects)
subjects = df["Subject"].tolist()
G.add_nodes_from(subjects)

# Add weighted edges based on TF-IDF similarity
threshold = 0.1  # Only add edges for similarity above this threshold
for i in range(len(subjects)):
    # Start at i + 1: the graph is undirected, so each unordered pair is
    # visited once and self-pairs are skipped.
    for j in range(i + 1, len(subjects)):
        # Store a plain Python float so the edge attribute has one uniform
        # type regardless of what scalar type the similarity matrix yields.
        weight = float(tfidf_similarity[i, j])
        if weight > threshold:  # Only add meaningful connections
            G.add_edge(subjects[i], subjects[j], weight=weight)

# Draw the graph
plt.figure(figsize=(8, 6))
# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every Restart & Run All.
pos = nx.spring_layout(G, seed=42)
edges = G.edges(data=True)

# Draw nodes and edges, then overlay each edge's weight rounded to 2 decimals
nx.draw(G, pos, with_labels=True, node_color="lightblue", edge_color="gray", node_size=2000, font_size=10)
nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): f"{d['weight']:.2f}" for u, v, d in edges}, font_size=8)

# Show the graph
plt.title("Graph Representation of Mathematical Subject Similarity")
plt.show()


In [None]:
# Learn 2-D node embeddings for the subject graph with a two-layer GCN.
# Needs the optional packages torch and torch_geometric; only the imports are
# guarded, so an unrelated ImportError raised later is not misreported.
# Reads `G`, `df`, `tfidf_matrix`, `pd` and `tools` from earlier cells.
try:
    import torch
    from torch_geometric.data import Data
    from torch_geometric.nn import GCNConv
    from torch.nn import functional as F
except ImportError:
    print("PyTorch Geometric is not available in this environment.")
else:
    torch.manual_seed(42)  # reproducible weight initialisation and training

    # Step 1: Convert NetworkX Graph to PyTorch Geometric Data Format
    # G's nodes are subject names (strings), which cannot be put in a tensor.
    # Map each name to its row position in df so edge endpoints index rows of
    # the feature matrix.
    # NOTE(review): assumes tfidf_matrix rows are in the same order as df rows.
    node_index = {subject: idx for idx, subject in enumerate(df["Subject"])}
    edge_pairs = [(node_index[u], node_index[v]) for u, v in G.edges]
    # Explicit (num_edges, 2) shape keeps a valid (2, 0) tensor when no pair
    # passed the similarity threshold.
    edge_index = torch.tensor(edge_pairs, dtype=torch.long).reshape(len(edge_pairs), 2).t().contiguous()
    # G lists every undirected edge once; edge_index is directed, so append the
    # reversed copy to let messages flow both ways.
    edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)

    # Node Features: Using TF-IDF matrix as initial node features
    node_features = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float)

    # Create a PyTorch Geometric data object
    data = Data(x=node_features, edge_index=edge_index)

    # Step 2: Define a Graph Convolutional Network (GCN) Model
    class GCN(torch.nn.Module):
        """Two-layer GCN: in_channels -> hidden_channels -> out_channels."""

        def __init__(self, in_channels, hidden_channels, out_channels):
            super(GCN, self).__init__()
            self.conv1 = GCNConv(in_channels, hidden_channels)
            self.conv2 = GCNConv(hidden_channels, out_channels)

        def forward(self, x, edge_index):
            """Return one out_channels-wide embedding row per node."""
            x = self.conv1(x, edge_index)
            x = F.relu(x)
            x = self.conv2(x, edge_index)
            return x

    # Step 3: Train the GNN Model
    model = GCN(in_channels=node_features.shape[1], hidden_channels=16, out_channels=2)  # 2D embedding
    # The 2-D embeddings and the TF-IDF features have different widths, so they
    # cannot be compared directly. A linear decoder maps embeddings back to
    # feature space, which makes the reconstruction loss well-defined.
    gcn_decoder = torch.nn.Linear(2, node_features.shape[1])
    optimizer = torch.optim.Adam(
        list(model.parameters()) + list(gcn_decoder.parameters()), lr=0.01, weight_decay=5e-4
    )

    def train():
        """Run one full-batch optimisation step and return the loss as a float."""
        model.train()
        gcn_decoder.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = F.mse_loss(gcn_decoder(out), data.x)  # Simple reconstruction loss
        loss.backward()
        optimizer.step()
        return loss.item()

    # Train for a few epochs
    for epoch in range(200):
        loss = train()

    # Step 4: Extract Learned Node Embeddings
    model.eval()
    with torch.no_grad():
        embeddings = model(data.x, data.edge_index).numpy()

    # Convert embeddings to DataFrame for visualization
    embeddings_df = pd.DataFrame(embeddings, index=df["Subject"], columns=["Dim1", "Dim2"])

    # Display learned embeddings
    tools.display_dataframe_to_user(name="GNN Learned Embeddings", dataframe=embeddings_df)


In [None]:
# Learn 4-D node embeddings with a Graph Attention Network (GAT), cluster them
# with k-means and plot a 2-D t-SNE projection coloured by cluster.
# Needs the optional packages torch, torch_geometric and scikit-learn; only the
# imports are guarded, so an unrelated ImportError raised later is not
# misreported.
# Reads `data`, `node_features`, `df`, `pd`, `plt` and `tools` from earlier cells.
try:
    import torch
    from torch.nn import functional as F  # imported here so the cell does not rely on an earlier cell for F
    from torch_geometric.nn import GATConv  # Using Graph Attention Networks (GAT)
    from sklearn.cluster import KMeans
    from sklearn.manifold import TSNE
    import numpy as np
except ImportError:
    print("PyTorch Geometric is not available in this environment.")
else:
    torch.manual_seed(42)  # reproducible weight initialisation and training

    # Define Graph Attention Network (GAT) Model
    class GAT(torch.nn.Module):
        """Two-layer GAT: a 4-head concatenated layer followed by a single-head output layer."""

        def __init__(self, in_channels, hidden_channels, out_channels):
            super(GAT, self).__init__()
            self.conv1 = GATConv(in_channels, hidden_channels, heads=4, concat=True)  # Multi-head attention
            # concat=True above yields hidden_channels * 4 features per node.
            self.conv2 = GATConv(hidden_channels * 4, out_channels, heads=1, concat=False)  # Output layer

        def forward(self, x, edge_index):
            """Return one out_channels-wide embedding row per node."""
            x = self.conv1(x, edge_index)
            x = F.elu(x)  # Exponential Linear Unit (ELU) activation
            x = self.conv2(x, edge_index)
            return x

    # Initialize the GAT model
    model = GAT(in_channels=node_features.shape[1], hidden_channels=16, out_channels=4)  # 4D embeddings
    # The 4-D embeddings and the TF-IDF features have different widths, so they
    # cannot be compared directly. A linear decoder maps embeddings back to
    # feature space, which makes the reconstruction loss well-defined.
    gat_decoder = torch.nn.Linear(4, node_features.shape[1])
    optimizer = torch.optim.Adam(
        list(model.parameters()) + list(gat_decoder.parameters()), lr=0.005, weight_decay=5e-4
    )

    def train_gat():
        """Run one full-batch optimisation step and return the loss as a float."""
        model.train()
        gat_decoder.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = F.mse_loss(gat_decoder(out), data.x)  # Reconstruction loss
        loss.backward()
        optimizer.step()
        return loss.item()

    # Train GAT for 300 epochs
    for epoch in range(300):
        loss = train_gat()

    # Extract learned node embeddings
    model.eval()
    with torch.no_grad():
        embeddings = model(data.x, data.edge_index).numpy()

    n_samples = embeddings.shape[0]

    # Step 2: Perform Clustering on Node Embeddings
    num_clusters = 3  # Assuming 3 clusters
    # k-means cannot form more clusters than there are samples; n_init is
    # pinned so results do not depend on the installed scikit-learn's default.
    kmeans = KMeans(n_clusters=min(num_clusters, n_samples), n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(embeddings)

    # Step 3: Project embeddings into 2D space using t-SNE
    # t-SNE requires perplexity < n_samples; the default of 30 raises on small
    # subject lists, so cap it (unchanged when there are more than 30 samples).
    tsne_perplexity = float(min(30, max(1, n_samples - 1)))
    tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
    tsne_embeddings = tsne.fit_transform(embeddings)

    # Create a DataFrame to store results
    embeddings_df = pd.DataFrame(tsne_embeddings, index=df["Subject"], columns=["TSNE_Dim1", "TSNE_Dim2"])
    embeddings_df["Cluster"] = cluster_labels

    # Display Clustering Results
    tools.display_dataframe_to_user(name="GAT Clustering Results", dataframe=embeddings_df)

    # Step 4: Visualize 2D embeddings with Clusters
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1], c=cluster_labels, cmap="viridis", s=100, alpha=0.8)
    # Label every point with its subject name
    for i, txt in enumerate(df["Subject"]):
        plt.annotate(txt, (tsne_embeddings[i, 0], tsne_embeddings[i, 1]), fontsize=10, ha='right')
    plt.colorbar(scatter, label="Cluster")
    plt.title("Mathematical Subject Embeddings (t-SNE Projection)")
    plt.xlabel("TSNE Dimension 1")
    plt.ylabel("TSNE Dimension 2")
    plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import networkx as nx

# Cluster the raw TF-IDF vectors, project them to 2-D with t-SNE, and suggest
# missing graph links via the Jaccard coefficient of neighbour sets.
# Reads `tfidf_matrix`, `df`, `G`, `pd`, `plt` and `tools` from earlier cells.

# Step 1: Perform Clustering on TF-IDF Feature Vectors
tfidf_dense = tfidf_matrix.toarray()  # densify once and reuse for k-means and t-SNE
n_samples = tfidf_dense.shape[0]

num_clusters = 3  # Assuming 3 clusters
# k-means cannot form more clusters than there are samples; n_init is pinned so
# results do not depend on the installed scikit-learn's default.
kmeans = KMeans(n_clusters=min(num_clusters, n_samples), n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(tfidf_dense)

# Step 2: Reduce Dimensionality Using t-SNE for Visualization
# t-SNE requires perplexity < n_samples; the default of 30 raises on small
# subject lists, so cap it (unchanged when there are more than 30 samples).
tsne_perplexity = float(min(30, max(1, n_samples - 1)))
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
tsne_embeddings = tsne.fit_transform(tfidf_dense)

# Convert to DataFrame for better readability
embeddings_df = pd.DataFrame(tsne_embeddings, index=df["Subject"], columns=["TSNE_Dim1", "TSNE_Dim2"])
embeddings_df["Cluster"] = cluster_labels

# Step 3: Link Prediction using Jaccard Similarity
# Called without an explicit pair list, jaccard_coefficient scores every node
# pair that has no edge yet, so a single call replaces the per-pair loop.
jaccard_threshold = 0.1  # Only keep meaningful links
predicted_links = [
    (u, v, score)
    for u, v, score in nx.jaccard_coefficient(G)
    if score > jaccard_threshold
]

# Convert to DataFrame for visualization
predicted_links_df = pd.DataFrame(predicted_links, columns=["Node 1", "Node 2", "Jaccard Similarity"])

# Display Results
tools.display_dataframe_to_user(name="Mathematical Subject Clusters", dataframe=embeddings_df)
tools.display_dataframe_to_user(name="Predicted Missing Links", dataframe=predicted_links_df)

# Step 4: Visualizing the t-SNE Embeddings with Clusters
plt.figure(figsize=(8, 6))
scatter = plt.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1], c=cluster_labels, cmap="coolwarm", s=100, alpha=0.8)
# Label every point with its subject name
for i, txt in enumerate(df["Subject"]):
    plt.annotate(txt, (tsne_embeddings[i, 0], tsne_embeddings[i, 1]), fontsize=10, ha='right')
plt.colorbar(scatter, label="Cluster")
plt.title("Mathematical Subject Embeddings (t-SNE Projection)")
plt.xlabel("TSNE Dimension 1")
plt.ylabel("TSNE Dimension 2")
plt.show()
