<i>The purpose of this notebook is to create some form of synthetic graph involving student organizations to allow the creation of an architecture which can close triangles and provide good information.</i>

It is entirely possible, if not expected, that the biases introduced in this notebook will show up later in our project as inferences. As such, this notebook will be tweaked over time, both to create more complex data to train on and to remove any bias we think misrepresents something that could actually exist.

In [1]:
import networkx as nx
import numpy as np

In [2]:
# Number of student and organization nodes to synthesize.
N_students = 300
N_orgs = 15

# Feature schemas. Each feature maps to:
#   dtype  - Python type the sampled value is cast to
#   values - candidate values to sample from
#   p      - optional unnormalized sampling weights, aligned with `values`
#            (features without `p` are sampled uniformly)
student_features = dict(
    type=dict(dtype=str, values=['student']),
    year=dict(
        dtype=int,
        # 1, 2, 3, 4, 5
        # Freshman, Sophomore, Junior, Senior, Grad Student
        values=np.arange(1, 6),
        # Weights 10, 9, 8, 7, 6: earlier years are slightly more common.
        p=np.arange(10, 5, -1),
    ),
    major=dict(
        dtype=str,
        values=['STEM', 'Arts', 'Pre-Med', 'Other'],
        p=np.array([10, 1, 2, 5]),
    ),
    commitment_limit=dict(
        dtype=int,
        values=[15],
    ),
)

org_features = dict(
    type=dict(dtype=str, values=['org']),
    hour_req=dict(
        dtype=int,
        # Required hours: 1 through 12, sampled uniformly.
        values=np.arange(1, 13),
    ),
    topic=dict(
        dtype=str,
        values=['STEM', 'Arts', 'Pre-Med', 'Entertainment', 'Other'],
        # A weight of 0 means no 'Pre-Med' org is ever generated.
        p=np.array([10, 2, 0, 8, 4]),
    ),
)

In [3]:
def synthesize_student(__studentname__=[1]):
    """Sample one synthetic student node.

    Every feature in ``student_features`` is drawn with ``np.random.choice``
    (weighted by the feature's ``p`` when present, uniform otherwise) and cast
    to the feature's ``dtype``.

    The mutable default argument is intentional: it acts as a counter shared
    across calls, so each student gets the next sequential id.

    Returns:
        A ``(name, attributes)`` tuple; ``attributes`` also carries the
        ``name`` and ``id`` keys.
    """
    student = {}
    for feature_name, spec in student_features.items():
        weights = spec.get('p', None)
        # Normalize the raw weights into a probability vector when given.
        probabilities = None if weights is None else weights / weights.sum()
        drawn = np.random.choice(spec['values'], p=probabilities)
        student[feature_name] = spec['dtype'](drawn)

    # Consume the current counter value, then advance it for the next call.
    serial = __studentname__[0]
    __studentname__[0] = serial + 1

    student['name'] = f'student_{serial}'
    student['id'] = serial

    return (student['name'], student)

def synthesize_students(N):
    """Return a list of ``N`` freshly synthesized ``(name, attributes)`` students."""
    batch = []
    for _ in range(N):
        batch.append(synthesize_student())
    return batch

def synthesize_org(__orgname__=[1]):
    """Sample one synthetic organization node.

    Every feature in ``org_features`` is drawn with ``np.random.choice``
    (weighted by the feature's ``p`` when present, uniform otherwise) and cast
    to the feature's ``dtype``.

    The mutable default argument is intentional: it acts as a counter shared
    across calls, so each org gets the next sequential id.

    Returns:
        A ``(name, attributes)`` tuple; ``attributes`` also carries the
        ``name`` and ``id`` keys.
    """
    org = {}
    for feature_name, spec in org_features.items():
        weights = spec.get('p', None)
        # Normalize the raw weights into a probability vector when given.
        probabilities = None if weights is None else weights / weights.sum()
        drawn = np.random.choice(spec['values'], p=probabilities)
        org[feature_name] = spec['dtype'](drawn)

    # Consume the current counter value, then advance it for the next call.
    serial = __orgname__[0]
    __orgname__[0] = serial + 1

    org['name'] = f'org_{serial}'
    org['id'] = serial

    return (org['name'], org)

def synthesize_orgs(N):
    """Return a list of ``N`` freshly synthesized ``(name, attributes)`` orgs."""
    batch = []
    for _ in range(N):
        batch.append(synthesize_org())
    return batch

In [4]:
# Students can connect to orgs in a number of ways
    # Preferential attachment for major matching
    # Preferential attachment for bigger orgs
    # Cannot go over their maximum commitment limit


# Can create a number of subgraphs
    # Orgs with shared students have an edge with properties like number of shared students

# Likewise students can be linked by the orgs they share

In [4]:
# Synthesize the node populations first (students, then orgs, so the random
# draws happen in the same order), then load both into one undirected graph.
# Each entry is a (name, attributes) tuple, which networkx stores as a node
# with that attribute dict. No edges exist yet.
students = synthesize_students(N_students)
orgs = synthesize_orgs(N_orgs)

G = nx.Graph()
for node_batch in (students, orgs):
    G.add_nodes_from(node_batch)

In [5]:
# Attach students to orgs.
#
# Each student joins orgs one at a time until they reach their commitment
# limit, no remaining org fits, or they randomly decide to stop.
# Every candidate org is scored as
#     l1 * degree + l2 * major_match + l3 * hour_req
# where degree is the org's current number of members (bigger orgs attract
# more students), major_match is 1 when the org topic equals the student's
# major, and the negative l3 penalizes orgs with a high hour requirement.
l1, l2, l3 = 0.5, 10, -0.1

for sid, s in students:
    hours_committed = 0
    while hours_committed < s['commitment_limit']:
        hours_left = s['commitment_limit'] - hours_committed

        # Candidates: orgs not joined yet whose hour requirement still fits in
        # the remaining budget, so a student can never exceed their limit.
        org_ids = [
            oid for oid, o in orgs
            if oid not in G[sid] and o['hour_req'] <= hours_left
        ]
        if not org_ids:
            break

        # Calculate probability for each org
        degree = np.array([G.degree[o] for o in org_ids])  # Bigger orgs
        majors = np.array([G.nodes[o]['topic'] == s['major'] for o in org_ids], dtype='int')
        hours = np.array([G.nodes[o]['hour_req'] for o in org_ids])

        p = degree * l1 + majors * l2 + hours * l3        # Calc p metric
        q = p - p.min() + 0.00001                         # Shift to strictly positive and prevent NaN
        k = q / q.sum()                                   # Normalize to a probability vector

        org = np.random.choice(org_ids, p=k)
        G.add_edge(sid, org)

        # Look the requirement up directly on the chosen node.
        hours_committed += G.nodes[org]['hour_req']

        if np.random.random() < 0.1 * hours_committed:
            # Student stops signing up for things even if they haven't reached
            # their absolute limit, because some have personal lives. The stop
            # probability grows with committed hours and is certain from 10 on.
            break

In [None]:
# ideas
#
#   passion_factor -> [0, inf) := the degree to which a student prefers their own major. multiplicative with major pref
#       rationale: low passion factor implies a student who likes to explore orgs outside of their own major. High implies the reverse

In [23]:
import itertools

def intersect(a1, a2):
    """Return the items of ``a1`` that also occur in ``a2``, as a list.

    Order and duplicates follow ``a1``; membership is tested with ``in``.
    """
    common = []
    for item in a1:
        if item in a2:
            common.append(item)
    return common

def create_shared_subgraph(G, type='org'):
    """Build a graph linking same-type nodes that share neighbors in ``G``.

    Args:
        G: graph whose nodes all carry a ``'type'`` attribute.
        type: value of the ``'type'`` attribute selecting which nodes to keep
            (``'org'`` by default).

    Returns:
        A new ``nx.Graph`` containing every selected node. Two nodes are
        connected when they have at least one common neighbor in ``G``; the
        edge attribute ``shared_neighbors`` lists those common neighbors.
    """
    SG = nx.Graph()

    # Keep only the nodes of the requested type.
    selected = [node for node, attrs in G.nodes(data=True) if attrs['type'] == type]
    SG.add_nodes_from(selected)

    for first, second in itertools.combinations(selected, 2):
        # Neighbors of `first` that are also adjacent to `second`.
        shared = [nbr for nbr in G[first] if nbr in G[second]]
        if shared:
            SG.add_edge(first, second, shared_neighbors=shared)

    return SG

In [24]:
# Build the shared-neighbor graph for the default node type ('org'): orgs are
# linked when at least one student belongs to both.
G2 = create_shared_subgraph(G)

In [27]:
# Materialize the edge view as a numpy array.
# presumably one row per edge with two endpoint names each - TODO confirm
edges = np.asarray(G2.edges)

In [38]:
# Scratch: draw 15 random integers from [0, edges.size), with replacement
# (np.random.choice's default).
# NOTE(review): edges.size counts array elements, not edges; if `edges` is
# (n_edges, 2) these values can exceed the valid row indices - confirm
# whether len(edges) was intended.
x = np.random.choice(np.arange(edges.size), 15)

In [39]:
# Sort the drawn values in place for easier inspection.
x.sort()

In [40]:
x

array([ 2, 23, 38, 47, 49, 60, 61, 68, 72, 75, 79, 82, 83, 84, 88])

In [None]:
# Basic Node Classification

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv


In [None]:
class NodeClassifier():
    """Small training wrapper around a two-layer GCN for node classification.

    Holds the model, an Adam optimizer and a cross-entropy loss, and exposes
    ``fit`` / ``eval`` helpers that operate on a full graph at once.
    """

    class GCN(torch.nn.Module):
        """Two ``GCNConv`` layers with ReLU and dropout in between."""

        def __init__(self, nfeatures, nclasses, nchannels=32, seed=1239587):
            """Create the layers.

            Args:
                nfeatures: size of each input node feature vector.
                nclasses: number of output classes (logits per node).
                nchannels: width of the hidden layer.
                seed: value passed to ``torch.manual_seed`` before the layers
                    are created, so weight initialization is reproducible.
                    Note this reseeds torch's global RNG.
            """
            super().__init__()

            torch.manual_seed(seed)
            self.c1 = GCNConv(nfeatures, nchannels)
            self.c2 = GCNConv(nchannels, nclasses)

        def forward(self, x, edge_index):
            """Return per-node class logits for features ``x`` on ``edge_index``."""
            x = self.c1(x, edge_index)
            x = x.relu()
            # Dropout is only active in training mode.
            x = F.dropout(x, p=0.5, training=self.training)
            x = self.c2(x, edge_index)
            return x

    def __init__(self, nfeatures, nclasses, nchannels=32, seed=1239587, lr=0.01):
        """Build the model, optimizer and loss.

        Args:
            nfeatures: size of each input node feature vector.
            nclasses: number of target classes.
            nchannels: hidden width forwarded to ``GCN``.
            seed: initialization seed forwarded to ``GCN``.
            lr: Adam learning rate.
        """
        self.model = self.GCN(nfeatures, nclasses, nchannels=nchannels, seed=seed)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = torch.nn.CrossEntropyLoss()

    def fit(self, x, y, edge_index, epochs=100):
        """Train on the whole graph for ``epochs`` steps, printing the loss.

        Args:
            x: node feature tensor passed to the model.
            y: target class index per node, as expected by CrossEntropyLoss.
            edge_index: graph connectivity passed to the GCN layers.
            epochs: number of full-batch optimization steps.
        """
        self.model.train()
        for e in range(epochs):
            self.optimizer.zero_grad()
            out = self.model(x, edge_index)
            loss = self.criterion(out, y)
            loss.backward()
            self.optimizer.step()
            print(f'Epoch {e:3d}: loss: {loss:.4f}')

    def eval(self, x, y, edge_index):
        """Print and return the fraction of nodes whose argmax class equals ``y``.

        Returns:
            The accuracy as a zero-dimensional tensor.
        """
        self.model.eval()
        # No gradients are needed for evaluation.
        with torch.no_grad():
            out = self.model(x, edge_index)
        pred = out.argmax(dim=1)
        acc = pred == y
        acc = acc.sum() / len(y)
        print(f'Accuracy: {acc:.2f}')
        return acc

In [None]:
    # Basic Link Prediction
from copy import deepcopy

In [None]:
# Take a graph and drop a random fraction of its edges, keeping the rest (e.g. to create a link-prediction training set)
def prune_edges(G, p_keep=0.8):
    """Randomly keep a fraction of the edges of ``G``.

    Args:
        G: object exposing an ``edges`` collection (e.g. a networkx graph).
        p_keep: fraction of edges to keep, in ``[0, 1]``. The number kept is
            ``floor(p_keep * n_edges)``.

    Returns:
        A ``(kept_edges, mask)`` tuple. ``mask`` is an integer array of the
        distinct row indices that were kept, and ``kept_edges`` is
        ``np.asarray(G.edges)[mask]``.

    Raises:
        ValueError: if ``p_keep`` is outside ``[0, 1]``.
    """
    if not 0 <= p_keep <= 1:
        raise ValueError(f'p_keep must be in [0, 1], got {p_keep}')

    edges = np.asarray(G.edges)
    # Count rows (edges), not array elements: `edges.size` would be twice the
    # number of edges and produce out-of-range indices.
    n_edges = len(edges)
    n_keep = int(np.floor(p_keep * n_edges))
    # Sample without replacement so no edge is kept twice.
    mask = np.random.choice(n_edges, size=n_keep, replace=False)
    return edges[mask], mask

