In [None]:
# 1. Setup: imports, mount Drive, load cleaned IPEDS data

import os
import pandas as pd
from google.colab import drive

# Project layout on Drive: inputs live under data/, embedding outputs under
# outputs/embeddings/. These are plain strings, so they can be defined before
# the mount happens.
PROJ = "/content/drive/MyDrive/dissertation"
DATA_DIR = PROJ + "/data"
OUT_DIR = PROJ + "/outputs/embeddings"

# Drive has to be mounted before anything under PROJ is created or read.
drive.mount('/content/drive', force_remount=True)
os.makedirs(OUT_DIR, exist_ok=True)

# Load cleaned IPEDS data (from 05a)
ipeds_path = DATA_DIR + "/ipeds_data.csv"
ipeds = pd.read_csv(ipeds_path)

print("Loaded IPEDS data:", ipeds.shape)
ipeds.head()


Mounted at /content/drive
Loaded IPEDS data: (11004, 21)


Unnamed: 0,unitid,year,state_abbr,longitude,latitude,region,sector,inst_control,urban_centric_locale,inst_size,...,headcount,enrolled_undergrad_fulltime,enrolled_undergrad_parttime,enrolled_graduate_fulltime,enrolled_graduate_parttime,cbsa,cbsa_type,inst_affiliation,completers,cips
0,100654,2017,AL,-86.568504,34.783367,Southeast: AL AR FL GA KY LA MS NC SC TN VA an...,Public four-year or above,Public,City midsize,5,...,6508.0,1,1,1,1,26620.0,Metropolitan statistical area,Public,847,"1.0999,1.1001,1.9999,10.0202,11.0101,13.0101,1..."
1,100654,2018,AL,-86.568504,34.783367,Southeast: AL AR FL GA KY LA MS NC SC TN VA an...,Public four-year or above,Public,City midsize,5,...,6686.0,1,1,1,1,26620.0,Metropolitan statistical area,Public,810,"01.0999,01.1001,01.9999,03.0599,04.0301,09.019..."
2,100654,2019,AL,-86.568504,34.783367,Southeast: AL AR FL GA KY LA MS NC SC TN VA an...,Public four-year or above,Public,City midsize,5,...,6560.0,1,1,1,1,26620.0,Metropolitan statistical area,Public,896,"01.0999,01.1001,01.9999,03.0599,04.0301,09.019..."
3,100654,2020,AL,-86.568504,34.783367,Southeast: AL AR FL GA KY LA MS NC SC TN VA an...,Public four-year or above,Public,City midsize,5,...,6628.0,1,1,1,1,26620.0,Metropolitan statistical area,Public,896,"01.0999,01.1001,01.9999,03.0599,04.0301,05.029..."
4,100654,2021,AL,-86.568502,34.783368,Southeast: AL AR FL GA KY LA MS NC SC TN VA an...,Public four-year or above,Public,City midsize,5,...,2689.0,1,1,1,1,26620.0,Metropolitan statistical area,Public,821,"01.0999,01.1001,01.9999,03.0599,04.0301,09.019..."


In [None]:
# 2. Build CIP lists and normalize CIP codes

# Split the comma-separated "cips" text into one Python list per row.
# Casting to str first means a missing value becomes the text "nan"
# (a one-element list ["nan"] after the split) instead of raising.
cips_as_text = ipeds["cips"].astype(str)
cips_without_spaces = cips_as_text.str.replace(" ", "")
ipeds["cips_list"] = cips_without_spaces.str.split(",")

# Normalization helpers: e.g. "1.1" -> "01.1000"
def normalize_cip(code: str):
    """Return a CIP code in zero-padded canonical form, or None if it is blank.

    The part before the dot is left-padded with zeros to at least two
    characters, and the part after the dot is right-padded with zeros and
    truncated to exactly four characters, e.g. "1.1" -> "01.1000".
    Codes without a dot get the same left padding ("1" -> "01", "99" -> "99"),
    so a code that lost its leading zero matches the padded spelling of the
    same code.

    Parameters
    ----------
    code : str
        Raw code; any value is accepted and converted with ``str()``.

    Returns
    -------
    str or None
        The normalized code, or None when the input is empty or the text
        "nan" (case-insensitive) after stripping surrounding whitespace.
    """
    code = str(code).strip()
    if not code or code.lower() == "nan":
        return None

    # partition() leaves `dot` empty when the code has no "." at all.
    series, dot, detail = code.partition(".")
    series = series.zfill(2)            # "1" -> "01"
    if not dot:
        # Undotted code: only the left padding applies.
        return series

    detail = detail.ljust(4, "0")[:4]   # pad, then clip, to exactly 4 places
    return f"{series}.{detail}"

def normalize_cip_list(cip_list):
    """Normalize every code in `cip_list`, dropping the ones that come back None.

    Each element is passed through `normalize_cip`; the surviving codes are
    returned as a new list in their original order (duplicates are kept).
    """
    normalized = (normalize_cip(raw_code) for raw_code in cip_list)
    return [code for code in normalized if code is not None]

# Normalize each row's list of codes element by element.
ipeds["cips_list_norm"] = ipeds["cips_list"].map(normalize_cip_list)

# Input for the graph + walks: one row per IPEDS record, keeping only the
# institution id and its normalized CIP list (copied so later edits to
# cip_train do not touch ipeds).
train_columns = ["unitid", "cips_list_norm"]
cip_train = ipeds.loc[:, train_columns].copy()
print("Prepared CIP sequences:", cip_train.shape)
cip_train.head()


Prepared CIP sequences: (11004, 2)


Unnamed: 0,unitid,cips_list_norm
0,100654,"[01.0999, 01.1001, 01.9999, 10.0202, 11.0101, ..."
1,100654,"[01.0999, 01.1001, 01.9999, 03.0599, 04.0301, ..."
2,100654,"[01.0999, 01.1001, 01.9999, 03.0599, 04.0301, ..."
3,100654,"[01.0999, 01.1001, 01.9999, 03.0599, 04.0301, ..."
4,100654,"[01.0999, 01.1001, 01.9999, 03.0599, 04.0301, ..."


In [None]:
# 3. Build CIP co-occurrence graph

import networkx as nx
from itertools import combinations
from tqdm import tqdm

G = nx.Graph()

# Count co-occurrences in a plain dict first, then add all edges in one call.
# This avoids a has_edge lookup plus an attribute update on the graph for
# every pair in every row.
pair_counts = {}

for cips in tqdm(cip_train["cips_list_norm"], desc="Building CIP graph"):
    # sorted() rather than list(set(...)): iteration order of a set of strings
    # changes between interpreter sessions (string hash randomization), which
    # would change node/neighbor order in G and make seeded walks
    # irreproducible. Sorting also makes every pair canonical (a < b), so
    # (a, b) and (b, a) can never be counted as different keys.
    unique_cips = sorted(set(cips))
    # combinations() yields nothing for fewer than 2 codes, so rows with
    # zero or one distinct CIP contribute no nodes or edges.
    for pair in combinations(unique_cips, 2):
        pair_counts[pair] = pair_counts.get(pair, 0.0) + 1.0

# Edge weight = number of rows in which the two codes appear together.
G.add_weighted_edges_from(
    (a, b, weight) for (a, b), weight in pair_counts.items()
)

print("Graph built:")
print("  Nodes:", G.number_of_nodes())
print("  Edges:", G.number_of_edges())


Building CIP graph: 100%|██████████| 11004/11004 [00:46<00:00, 238.69it/s]

Graph built:
  Nodes: 1585
  Edges: 630214





In [None]:
# 4. Generate DeepWalk-style random walks on the CIP graph

import random

def generate_walks(G, num_walks=10, walk_length=20, seed=None):
    """
    Simple DeepWalk-style random walks with uniform transitions.

    Makes `num_walks` passes over the nodes; in each pass the node order is
    shuffled and one walk is started from every node. At each step the next
    node is chosen uniformly among the current node's neighbors; edge
    attributes (such as weights) are not consulted.

    Parameters
    ----------
    G : graph-like
        Object providing `nodes()` and `neighbors(node)`.
    num_walks : int
        Number of walks started from each node.
    walk_length : int
        Maximum number of nodes per walk, including the start node. A walk is
        shorter if it reaches a node that has no neighbors.
    seed : optional
        If given, walks are drawn from a private `random.Random(seed)` so the
        same graph and seed always give the same walks. If None (default),
        the global `random` module is used, as before.

    Returns
    -------
    list[list]
        `num_walks * number_of_nodes` walks, each a list of node labels.
    """
    # With seed=None keep using the module-level RNG so callers that seed it
    # themselves via random.seed() still control the result.
    rng = random if seed is None else random.Random(seed)

    nodes = list(G.nodes())
    # Neighbor lists are materialized once per node instead of on every step;
    # the graph is not modified while walking, so the cached lists stay valid.
    neighbor_cache = {}
    walks = []

    for _ in range(num_walks):
        rng.shuffle(nodes)
        for start in nodes:
            walk = [start]
            current = start
            for _step in range(walk_length - 1):
                neighbors = neighbor_cache.get(current)
                if neighbors is None:
                    neighbors = list(G.neighbors(current))
                    neighbor_cache[current] = neighbors
                if not neighbors:
                    # Dead end (isolated node): stop this walk early.
                    break
                current = rng.choice(neighbors)
                walk.append(current)
            walks.append(walk)

    return walks

# Seed the global RNG so that, for the same graph, re-running this cell
# produces the same walks (generate_walks draws from the `random` module).
WALK_SEED = 42
random.seed(WALK_SEED)

walks = generate_walks(G, num_walks=10, walk_length=20)
print("Number of walks:", len(walks))
print("Example walk:", walks[0][:10])


Number of walks: 15850
Example walk: ['51.1006', '49.0104', '13.0411', '27.0301', '51.2314', '52.0205', '14.0901', '50.0699', '51.2301', '46']


In [None]:
# 5a. Install gensim for Word2Vec (run once per session)
!pip install gensim




In [None]:
# 5b. Train 64-D graph-based CIP embeddings with Word2Vec (skip-gram)

from gensim.models import Word2Vec
import numpy as np

# Train skip-gram Word2Vec on the walks, treating each walk as a "sentence"
# and each CIP code as a "word".
W2V_SEED = 42

w2v_model = Word2Vec(
    sentences=walks,
    vector_size=64,
    window=5,
    min_count=1,   # keep every CIP code, even ones that appear in few walks
    sg=1,          # 1 = skip-gram
    # A fixed seed plus a single worker thread removes the run-to-run jitter
    # that multi-threaded training introduces. NOTE(review): per the gensim
    # docs, bit-identical results across interpreter launches may additionally
    # require setting PYTHONHASHSEED - confirm if exact reproducibility matters.
    workers=1,
    seed=W2V_SEED,
    epochs=5,
)

# Build embedding table: one row per CIP code
cip_emb_graph = {
    node: w2v_model.wv[node]
    for node in w2v_model.wv.index_to_key
}

# Rows are keyed by CIP code; columns 0..vector_size-1 hold the embedding.
# The index is then moved into a regular "cip" column (no inplace mutation,
# so re-running the cell is safe).
cip_emb_graph_df = pd.DataFrame.from_dict(cip_emb_graph, orient="index")
cip_emb_graph_df.index.name = "cip"
cip_emb_graph_df = cip_emb_graph_df.reset_index()

print("Graph embedding shape:", cip_emb_graph_df.shape)
cip_emb_graph_df.head()


Graph embedding shape: (1585, 65)


Unnamed: 0,cip,0,1,2,3,4,5,6,7,8,...,54,55,56,57,58,59,60,61,62,63
0,45.1001,0.175457,-0.163499,0.081543,0.132906,-0.012698,-0.160591,0.075328,-0.178769,-0.129021,...,0.0565,-0.116326,0.08024,-0.14457,-0.177305,0.20134,-0.083285,-0.105705,0.025078,-0.040732
1,38.0101,0.194948,-0.175085,-0.034399,0.131991,-0.024849,-0.064553,0.002092,-0.154861,-0.129862,...,-0.011082,0.000113,0.115436,-0.211784,-0.144867,0.060004,0.011364,-0.101752,0.1809,-0.053448
2,40.0801,0.225312,-0.074358,0.002041,0.046958,0.044317,-0.136468,0.040754,-0.247589,-0.051645,...,0.031199,-0.020668,0.145756,-0.174186,-0.193057,0.095246,-0.015475,-0.075143,0.060219,-0.008833
3,52.0201,0.226452,-0.177377,-0.044606,0.076762,0.061169,-0.158346,0.004388,-0.227178,-0.1167,...,0.056166,-0.042559,0.042718,-0.223725,-0.1483,0.144098,-0.100957,-0.161234,0.0232,-0.118456
4,14.1901,0.184591,-0.188838,-0.009937,0.06027,0.091668,-0.118682,-0.005631,-0.182081,-0.147366,...,0.078644,0.006334,0.069567,-0.214794,-0.111651,0.170695,-0.06984,-0.091317,0.12738,0.052038


In [None]:
# 6. Save graph-based CIP embeddings (64D)

# The same table is written twice: once to the Colab session's local disk and
# once to the dissertation folder on Drive.
# NOTE: the "cip" column holds text such as "45.1001"; when reading the file
# back, pass dtype={"cip": str} so pandas does not parse the codes as numbers.
EMB_FILENAME = "cip_embeddings_graph_64.csv"
local_path = "/content/" + EMB_FILENAME
drive_path = OUT_DIR + "/" + EMB_FILENAME

save_targets = (
    ("Saved locally to:", local_path),
    ("Also saved to Drive:", drive_path),
)
for message, target_path in save_targets:
    cip_emb_graph_df.to_csv(target_path, index=False)
    print(message, target_path)


Saved locally to: /content/cip_embeddings_graph_64.csv
Also saved to Drive: /content/drive/MyDrive/dissertation/outputs/embeddings/cip_embeddings_graph_64.csv
