In [None]:
# ======================
# MULTI-MODEL MATCHING
# ======================

"""
All-in-one script that:
1) Loads the dataset, classifies students vs. alumni.
2) Builds profiles with clarifying text.
3) Generates embeddings from TWO different models (bge-large-en-v1.5 & all-mpnet-base-v2).
4) Combines the two similarity scores with weights.
5) Applies a graduated (stepped) keyword-overlap penalty + optional boost logic.
6) Outputs: all_matches.csv, top10_matches.csv, similarity_matrix.csv.
"""

import pandas as pd
from sentence_transformers import SentenceTransformer, util
from itertools import product
import re

#########################
# STEP 1: LOAD & PREPARE
#########################

# Survey export that feeds the whole pipeline.
file_path = r"C:\Users\Guill\Downloads\Help Guelph students_alum network with each other (Responses).xlsx"

# Read the responses and trim stray whitespace around the column headers.
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()

# Swap the verbose survey questions for short column names used below.
df = df.rename(
    columns={
        "Are you a student (looking for a job)": "is_student",
        "Are you an alum (looking to help UofG students)": "is_alum",
        "What is/was your major": "major",
        "What is your your field of interest/work  (separate with commas)": "field",
        "What job title do you wanna (or already) have  (separate with commas)": "job_title",
        "Submit your UOFG or WORK email (if possible)": "email",
    }
)

# A flag is True only when the trimmed, lower-cased answer is exactly "yes".
for flag_col in ("is_student", "is_alum"):
    df[flag_col] = df[flag_col].str.strip().str.lower().eq("yes")

# Missing emails become empty strings; everything is lower-cased.
df["email"] = df["email"].fillna("").astype(str).str.lower()

# Classification logic
def classify(row):
    """Label one survey response as ``"student"`` or ``"alum"``.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            truthy/falsy ``is_student`` and ``is_alum`` flags and an
            ``email`` string.

    Returns:
        ``"alum"`` when only the alum flag is set. When both flags are set,
        ``"student"`` if ``"@uoguelph.ca"`` occurs in the email (the check is
        case-sensitive) and ``"alum"`` otherwise. ``"student"`` in every
        other case, including when neither flag is set.
    """
    is_student = row["is_student"]
    is_alum = row["is_alum"]
    email = row["email"]

    if not is_alum:
        # Student-only responses and responses with neither box ticked are
        # both treated as students.
        return "student"
    if not is_student:
        return "alum"
    # Both boxes ticked: a university address decides in favour of "student".
    return "student" if "@uoguelph.ca" in email else "alum"

# Apply classification
# Label every response, then drop anything marked for exclusion.
df["classification"] = df.apply(classify, axis=1)
df = df.loc[df["classification"].ne("exclude")].copy()

# Separate the two populations into independent frames.
students = df.loc[df["classification"].eq("student")].copy()
alumni = df.loc[df["classification"].eq("alum")].copy()

# Blank answers get an explicit placeholder in both frames.
for frame in (students, alumni):
    for col in ["major", "field", "job_title"]:
        frame[col] = frame[col].fillna("Not specified")

# Keep one row per alum email: after the descending sort on job title and
# field, the first row for each email is the one retained.
alumni = (
    alumni.sort_values(by=["job_title", "field"], ascending=False)
    .drop_duplicates(subset=["email"], keep="first")
)

#########################
# STEP 2: BUILD PROFILES
#########################

def build_profile(row, role="student"):
    """Render one respondent's answers as a short descriptive paragraph.

    Args:
        row: Mapping-like record with ``major``, ``job_title`` and ``field``.
        role: ``"student"`` selects the student wording; any other value
            selects the alum wording.

    Returns:
        A three-sentence profile string.
    """
    major, job_title, field = row["major"], row["job_title"], row["field"]

    if role != "student":
        return (
            f"This is an alum. Their major was {major}. "
            f"They work as: {job_title}. "
            f"Their field of work includes: {field}."
        )

    return (
        f"This is a student. Their major is {major}. "
        f"They want to work as: {job_title}. "
        f"Their interests are: {field}."
    )

# Turn every row into its natural-language profile.
students["profile"] = students.apply(build_profile, axis=1, role="student")
alumni["profile"] = alumni.apply(build_profile, axis=1, role="alum")

############################
# STEP 3: MULTI-MODEL SETUP
############################

# Two independent sentence-embedding models; their scores are blended later.
model_bge = SentenceTransformer("BAAI/bge-large-en-v1.5")
model_mpnet = SentenceTransformer("all-mpnet-base-v2")

# Plain lists of profile strings, in row order.
student_profiles = students["profile"].tolist()
alum_profiles = alumni["profile"].tolist()

# Embed students first, then alumni, with each model in turn.
student_emb_bge, alum_emb_bge = (
    model_bge.encode(texts, convert_to_tensor=True)
    for texts in (student_profiles, alum_profiles)
)
student_emb_mpnet, alum_emb_mpnet = (
    model_mpnet.encode(texts, convert_to_tensor=True)
    for texts in (student_profiles, alum_profiles)
)

#########################
# STEP 4: PENALTY SYSTEM
#########################

def get_keywords(text):
    """Return the set of distinct alphabetic words in *text*, lower-cased.

    Digits, punctuation and whitespace all act as separators.
    """
    lowered = text.lower()
    return {match.group() for match in re.finditer(r"[a-zA-Z]+", lowered)}

def apply_penalty_and_boost(student_row, alum_row, base_score):
    """Adjust a similarity score by how many keywords the two rows share.

    Keywords are the alphabetic words found in each row's ``major``,
    ``field`` and ``job_title``.

    Args:
        student_row: Mapping-like record for the student.
        alum_row: Mapping-like record for the alum.
        base_score: Numeric score before adjustment.

    Returns:
        The adjusted score, rounded and clamped to the range 0-100.
    """

    def row_keywords(row):
        # Union of the words from the three descriptive columns.
        words = set()
        for column in ("major", "field", "job_title"):
            words |= get_keywords(row[column])
        return words

    shared = row_keywords(student_row) & row_keywords(alum_row)

    # Stepped penalty: the fewer shared words, the larger the deduction.
    shared_count = len(shared)
    if shared_count >= 5:
        penalty = 0
    elif shared_count >= 2:
        penalty = 5
    else:
        penalty = 10

    adjusted = base_score - penalty

    # Flat bonus when at least one of the highlighted keywords is shared.
    if shared & {"data", "finance", "marketing", "health", "research"}:
        adjusted += 5

    return max(0, min(100, round(adjusted)))

#########################
# STEP 5: COMPUTE MATCHES
#########################

results = []

# util.cos_sim on two embedding batches returns the full
# (students x alumni) similarity matrix, so each model is scored once
# instead of once per pair inside the loop.
sim_bge = util.cos_sim(student_emb_bge, alum_emb_bge).tolist()
sim_mpnet = util.cos_sim(student_emb_mpnet, alum_emb_mpnet).tolist()

# Embeddings were produced in row order, so positions from enumerate() line
# up with the matrix rows/columns (and stay correct even if index labels
# repeat, which index.get_loc does not handle).
alum_rows = [alum_row for _, alum_row in alumni.iterrows()]

for s_idx, (_, student_row) in enumerate(students.iterrows()):
    for a_idx, alum_row in enumerate(alum_rows):
        score_bge = sim_bge[s_idx][a_idx]
        score_mpnet = sim_mpnet[s_idx][a_idx]

        # Weighted combination of the two models
        combined_sim = 0.65 * score_bge + 0.35 * score_mpnet
        # Map cosine similarity from [-1, 1] onto a 0-100 scale
        base_score = ((combined_sim + 1) / 2) * 100

        # Final score with penalty and boost
        final_score = apply_penalty_and_boost(student_row, alum_row, base_score)

        results.append({
            "student_email": student_row["email"],
            "alum_email": alum_row["email"],
            "match_score": final_score,
            "student_profile": student_row["profile"],
            "alum_profile": alum_row["profile"]
        })

# Explicit columns keep the frame well-formed (and sortable) even when there
# are no pairs at all.
match_df = pd.DataFrame(
    results,
    columns=["student_email", "alum_email", "match_score", "student_profile", "alum_profile"],
)

# Sort best-first, then keep only the best row for each student/alum pair
match_df = match_df.sort_values(by="match_score", ascending=False)
match_df = match_df.drop_duplicates(subset=["student_email", "alum_email"], keep="first")

#########################
# STEP 6: SAVE OUTPUTS
#########################

# Folder that receives every CSV written by this script.
downloads_path = r"C:\Users\Guill\Downloads"

# Full list of scored pairs.
match_df.to_csv(f"{downloads_path}\\all_matches.csv", index=False)

# Ten best-scoring alumni for each student.
top10_df = (
    match_df.sort_values(by=["student_email", "match_score"], ascending=[True, False])
    .groupby("student_email")
    .head(10)
)
top10_df.to_csv(f"{downloads_path}\\top10_matches.csv", index=False)

# Student-by-alum score grid; "max" resolves any repeated pair.
matrix_df = pd.pivot_table(
    match_df,
    index="student_email",
    columns="alum_email",
    values="match_score",
    aggfunc="max",
)
matrix_df.to_csv(f"{downloads_path}\\similarity_matrix.csv")

print("✅ Multi-model matching complete! Check your Downloads folder.")

In [None]:

# Alumni present anywhere in the match table vs. those in some Top 10.
all_alums = set(match_df["alum_email"].unique())
top10_alums = set(top10_df["alum_email"].unique())

# Alumni that never make it into any student's Top 10.
missing_alums = all_alums.difference(top10_alums)

# Report the outcome.
if missing_alums:
    print(f"⚠️ {len(missing_alums)} alumni do NOT appear in any top 10 list.")
    print("Emails of missing alumni:")
    print(missing_alums)
else:
    print("✅ Every alum appears in at least one student's Top 10.")

In [None]:
# Number of Top 10 lists each alum shows up in.
alum_popularity = (
    top10_df["alum_email"]
    .value_counts()
    .rename_axis("alum_email")
    .reset_index(name="times_appeared")
)

# Attach each alum's major, job title and field for context.
alum_details = alumni[["email", "major", "job_title", "field"]].rename(columns={"email": "alum_email"})
alum_ranked = alum_popularity.merge(alum_details, on="alum_email", how="left")

# Most frequently listed alumni first.
alum_ranked = alum_ranked.sort_values(by="times_appeared", ascending=False)

In [None]:
# Bare expression: evaluates the ranked alumni table so it is shown as the cell's output.
alum_ranked

In [None]:
# Show full cell text instead of truncating long profile strings.
pd.set_option("display.max_colwidth", None)


# Alum whose matches are being inspected.
alum_email = "cdervaric@sylvite.ca"
alum_matches = match_df.loc[match_df["alum_email"].eq(alum_email)]

# The 30 highest-scoring students for that alum.
top_matches = alum_matches.sort_values(by="match_score", ascending=False).iloc[:30]
top_matches.loc[:, ["student_email", "match_score", "student_profile"]]


In [None]:
import matplotlib.pyplot as plt

# Histogram of all match scores, labelled through the returned axes object.
score_axes = match_df["match_score"].hist(bins=30)
score_axes.set_title("Match Score Distribution")
score_axes.set_xlabel("Match Score")
score_axes.set_ylabel("Frequency")
plt.show()


In [None]:
import seaborn as sns
# Same score distribution drawn with seaborn: 30 bins plus a KDE curve overlay.
sns.histplot(match_df["match_score"], bins=30, kde=True)


In [None]:
# Per-student summary of match scores, plus the max-min spread.
scores_by_student = match_df.groupby("student_email")["match_score"]
spread_check = (
    scores_by_student.agg(["min", "max", "mean", "std"])
    .assign(range=lambda stats: stats["max"] - stats["min"])
    .sort_values(by="range", ascending=True)
)
print("📉 Students with lowest score range:")
spread_check

In [None]:
# Standard deviation of each student's Top 10 scores; a low value means a flat list.
flat_check = (
    top10_df.groupby("student_email")["match_score"]
    .std()
    .reset_index(name="top10_std")
)
print("Students with flat top 10s (low std dev):")
print(flat_check.nsmallest(10, "top10_std"))


In [None]:
# Number of alumni shown; the plot title is derived from it so the two
# cannot drift apart (the title previously said 5 while 10 were plotted).
top_n = 10

# Alumni with the most rows in the match table.
popular_alums = match_df["alum_email"].value_counts().head(top_n).index.tolist()
alum_subset = match_df[match_df["alum_email"].isin(popular_alums)]

# One box per alum showing the spread of their match scores.
plt.figure(figsize=(10, 5))
sns.boxplot(x="alum_email", y="match_score", data=alum_subset)
plt.title(f"📦 Score Spread for Top {top_n} Most Frequent Alumni")
plt.xlabel("Alum Email")
plt.ylabel("Match Score")
plt.xticks(rotation=45)  # email labels are long; tilt them to limit overlap
plt.show()


In [None]:
# === Step 1: Average score, number of matches and best score per alum ===
alum_score_stats = (
    match_df.groupby("alum_email")["match_score"]
    .agg(avg_score="mean", times_matched="count", peak_score="max")
    .reset_index()
)

# === Step 2: How many Top 10 lists each alum appears in ===
top10_counts = (
    top10_df["alum_email"]
    .value_counts()
    .rename_axis("alum_email")
    .reset_index(name="top10_count")
)

# === Step 3: Combine; alumni absent from every Top 10 get a count of 0 ===
alum_analysis = alum_score_stats.merge(top10_counts, on="alum_email", how="left").fillna(0)
alum_analysis["top10_count"] = alum_analysis["top10_count"].astype(int)

# === Step 4: Attach major / job title / field ===
alum_info = alumni[["email", "major", "job_title", "field"]].rename(columns={"email": "alum_email"})
alum_analysis = alum_analysis.merge(alum_info, on="alum_email", how="left")

# === Step 5: Most-listed alumni first; within a tie, lowest average first ===
alum_analysis = alum_analysis.sort_values(by=["top10_count", "avg_score"], ascending=[False, True])

# === Step 6: Save and show ===
alum_analysis.to_csv(f"{downloads_path}\\alum_popularity_quality_full.csv", index=False)

print("📊 Alumni with high top 10 counts but low average scores (possible filler matches):")
frequently_listed = alum_analysis["top10_count"] >= 5
print(alum_analysis[frequently_listed].head(10))

print("\n🌟 Alumni with best single matches (peak score):")
print(alum_analysis.sort_values(by='peak_score', ascending=False).head(10))


In [None]:
# Top 10 frequency against average score, one point per alum.
plt.figure(figsize=(10, 6))
scatter_axes = sns.scatterplot(data=alum_analysis, x="top10_count", y="avg_score")
scatter_axes.set_title("🎯 Alumni: Frequency in Top 10 vs. Average Match Score")
scatter_axes.set_xlabel("Times in Top 10 Lists")
scatter_axes.set_ylabel("Average Match Score")
# Horizontal reference line at a score of 70.
scatter_axes.axhline(70, color="red", linestyle="--", label="Score Threshold")
scatter_axes.legend()
plt.show()


In [None]:
# Best single match score achieved by each alum.
alum_peak_scores = (
    match_df.groupby("alum_email")["match_score"]
    .max()
    .reset_index(name="peak_match_score")
)

# Attach major / job title / field, then rank by the peak score.
alum_info = alumni[["email", "major", "job_title", "field"]].rename(columns={"email": "alum_email"})
alum_peak_scores = (
    alum_peak_scores.merge(alum_info, on="alum_email", how="left")
    .sort_values(by="peak_match_score", ascending=False)
)

# Show the ten highest.
print("🌟 Alumni with highest single match scores:")
print(alum_peak_scores.head(10))


In [None]:
# === Step 1: Three best-scoring alumni for each student ===
top3_df = (
    match_df.sort_values(by=["student_email", "match_score"], ascending=[True, False])
    .groupby("student_email")
    .head(3)
)

# === Step 2: How often each alum appears across those Top 3 lists ===
top3_popularity = (
    top3_df["alum_email"]
    .value_counts()
    .rename_axis("alum_email")
    .reset_index(name="top3_count")
)

# Attach every alumni column for context.
top3_popular_alums = top3_popularity.merge(
    alumni.rename(columns={"email": "alum_email"}), on="alum_email", how="left"
)

print("🏅 Most popular alumni in Top 3 lists:")
print(top3_popular_alums.head(10))


In [None]:
# Alumni present anywhere in the match table vs. those in some student's Top 3.
all_alums = set(match_df["alum_email"].unique())
# Read the emails from top3_df into a separate name so the
# top3_popular_alums DataFrame is not overwritten and the cell can be re-run.
top3_alums = set(top3_df["alum_email"].unique())

# Alumni that never make it into any student's Top 3
missing_alums = all_alums - top3_alums

# Output result (messages name the Top 3, which is what is being checked)
if not missing_alums:
    print("✅ Every alum appears in at least one student's Top 3.")
else:
    print(f"⚠️ {len(missing_alums)} alumni do NOT appear in any top 3 list.")
    print("Emails of missing alumni:")
    print(missing_alums)

In [None]:
def find_min_top_x(match_df, alumni_emails, max_x=20):
    """Find the smallest per-student Top-X cut-off that includes every alum.

    Args:
        match_df: DataFrame with ``student_email``, ``alum_email`` and
            ``match_score`` columns.
        alumni_emails: Iterable of alum emails that must all be covered.
        max_x: Largest cut-off to try (inclusive).

    Returns:
        ``(x, top_x_df)`` for the first ``x`` in ``1..max_x`` whose per-student
        top-``x`` rows contain every email in ``alumni_emails``, where
        ``top_x_df`` holds those rows; ``(None, None)`` if no such ``x`` exists.
    """
    # The ranking does not depend on x, so sort and group once up front
    # instead of repeating the same sort on every iteration.
    ranked = match_df.sort_values(by=["student_email", "match_score"], ascending=[True, False])
    per_student = ranked.groupby("student_email")
    required = set(alumni_emails)

    for x in range(1, max_x + 1):
        top_x_df = per_student.head(x)
        matched_alums = set(top_x_df["alum_email"].unique())

        if required <= matched_alums:
            print(f"✅ Minimum top X where all alumni are included: Top {x}")
            return x, top_x_df  # also return the DataFrame so you can save it

    print(f"❌ No X ≤ {max_x} includes all alumni.")
    return None, None


In [None]:
# Every distinct alum email in the deduplicated alumni table.
all_alumni_emails = alumni["email"].unique()

# Run search: smallest Top-X (up to 20) in which all of those alumni appear.
# Both results are None when no cut-off up to max_x covers everyone.
top_x_value, top_x_df = find_min_top_x(match_df, all_alumni_emails, max_x=20)

In [None]:
# === Step 1: Five best-scoring alumni for each student ===
top5_df = (
    match_df.sort_values(by=["student_email", "match_score"], ascending=[True, False])
    .groupby("student_email")
    .head(5)
)

# === Step 2: How often each alum appears across those Top 5 lists ===
top5_popularity = (
    top5_df["alum_email"]
    .value_counts()
    .rename_axis("alum_email")
    .reset_index(name="top5_count")
)

# Attach every alumni column for context.
top5_popular_alums = top5_popularity.merge(
    alumni.rename(columns={"email": "alum_email"}), on="alum_email", how="left"
)

# NOTE(review): the heading says "most popular", but tail(20) prints the LAST
# 20 rows of a table ordered most-listed first - confirm this is intended.
print("🏅 Most popular alumni in Top 5 lists:")
print(top5_popular_alums.tail(20))


In [None]:
from collections import defaultdict
import os

# Build one plain-text email per student from their Top 5 matches (top5_df,
# not a balanced assignment) and write each one to its own .txt file.
grouped_matches = top5_df.groupby("student_email")

# Collected {"student_email", "email_body"} records, one per student.
email_bodies = []

for student_email, group in grouped_matches:
    # Best match first.
    matches = group.sort_values(by="match_score", ascending=False)

    # Compose match details: one bullet per alum with profile, score and email
    match_lines = []
    for _, row in matches.iterrows():
        match_lines.append(
            f"- {row['alum_profile'].strip()} \n  → Match Score: {row['match_score']}%\n  → Email: {row['alum_email']}"
        )

    # Blank line between bullets.
    matches_text = "\n\n".join(match_lines)
    subject = "Your Top 5 Alumni Matches – UofG Memes Networking Initiative"

    # Email body. The first two lines ("To:" / "Subject:") act as a header
    # inside the saved file; the text always says "5 alumni" regardless of
    # how many rows the group actually holds.
    body = f"""To: {student_email}
Subject: {subject}

Hi,

I've finished the alumni matching system based on your responses and a semantic AI-based algorithm. You've been matched with 5 alumni based on your major, career interests, and goals. Match scores range from 0 to 100 — with 100 being the best possible match.

Here are your top 5 matches:

{matches_text}

🎯 What to do next:
You should reach out to the people above.

If you're unsure how to reach out, here’s a message you can use:

---
Hi [Name],

We were matched through the UofG Memes initiative. I saw that you work in [insert company or field], and I’m currently studying [your major] and interested in [your career goal or field].

I was wondering if you’d be open to a quick 15-minute call to share a bit about what you do, your journey, and how you got into your current role. I’d really appreciate your insights.

Thanks so much!
---

⚠️ If none of these people reply within a week or if the matches don’t seem relevant, just let me know and I can send your full top 10 matches so you can explore more options.

📌 Just a quick heads-up about expectations:

Most of the people who filled out this form and were matched with you probably graduated in the last 5 years. That means it's unlikely they can get you a job right away — and that’s totally normal.

The point of this is networking. It’s about starting conversations, learning how people broke into your field, and having people know who you are. If they get to know you, they might think of you when an opportunity comes up, or they might connect you with someone more senior at their company.

But don’t go in expecting job offers. Think of this as planting seeds for the future — the kind that lead to opportunities down the line.

Let me know if you need anything else or want to try out more matches later on.

—
Memes
"""

    email_bodies.append({"student_email": student_email, "email_body": body})

# Save each email as a separate .txt file (the folder is created if missing)
output_folder = os.path.join(downloads_path, "student_emails_txt")
os.makedirs(output_folder, exist_ok=True)

for email in email_bodies:
    # File name is the student's address with "@" replaced by "_at_".
    filename = os.path.join(output_folder, f"{email['student_email'].replace('@', '_at_')}.txt")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(email["email_body"])

print(f"✅ Saved all student emails with 'To' and 'Subject' to: {output_folder}")


In [None]:
import os
import pandas as pd

# Build one plain-text thank-you email per alum who appears in at least one
# student's Top 5, and write each one to its own .txt file.
alum_groups = top5_df.groupby("alum_email")
alum_emails = []

# Output folder for .txt files (created if missing)
alum_email_folder = os.path.join(downloads_path, "alumni_emails_txt")
os.makedirs(alum_email_folder, exist_ok=True)

for alum_email, group in alum_groups:
    # Best-scoring students first.
    student_matches = group.sort_values(by="match_score", ascending=False)
    total_matches = len(student_matches)

    # Format matched students: one bullet per student with profile, score and email
    student_lines = []
    for _, row in student_matches.iterrows():
        student_lines.append(
            f"- {row['student_profile']} \n  → Match Score: {row['match_score']}%\n  → Email: {row['student_email']}"
        )

    # NOTE(review): student_text is assembled but never interpolated into
    # the body below, even though the body refers to "the following
    # student(s)". Confirm whether the list was left out on purpose.
    student_text = "\n\n".join(student_lines)

    # Subject line
    subject = "Thank you for joining the UofG Memes Networking Initiative"

    # Full email body; the "To:" / "Subject:" lines act as a header inside the file
    body = f"""To: {alum_email}
Subject: {subject}

Hi,

Thank you so much for volunteering to support UofG students!

You’ve been matched with **{total_matches} student{'s' if total_matches > 1 else ''}** based on the interests, fields, and job titles you listed in your response. I built a matching system that uses semantic AI models to compare your answers to students' responses. It looks at the meaning behind what you wrote — your major, your job, and your industry — and finds students with similar goals or experiences.

🎯 **What this means** is that the following student(s) had profiles that were most similar to yours in terms of interests, career paths, or fields.


---

📋 **Note on Profile Expansion**

Some responses were a bit short or vague. In those cases, I used what was already written to expand the descriptions a little, just to make sure the matches made sense and that no one got overlooked due to missing info.

---

💬 **What happens next?**

It’s the student’s responsibility to reach out to you. When they do, all I ask is that you try to make time for a short chat — even just 15 minutes — to help them better understand what you do, how you got there, or what advice you might have.

🤝 **If possible**, please try to respond within a week — even if it’s just a quick message letting them know your availability.

😅 **Feeling overwhelmed?** That’s totally okay. If you’re too busy right now, you can always reply with a note asking them to follow up later — in a week, two weeks, or even a month. They’ll appreciate your honesty and it helps set clear expectations.

---

This project only works because of people like you who are willing to take a little time to support others. Even a quick chat can make a huge difference to someone just starting out.

Thank you again — I really appreciate it.

—
Memes
"""

    alum_emails.append({
        "alum_email": alum_email,
        "email_body": body
    })

    # Save each email as .txt file, named after the address with "@" -> "_at_"
    filename = os.path.join(alum_email_folder, f"{alum_email.replace('@', '_at_')}.txt")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(body)

# All emails gathered in one table for an optional CSV backup; nothing is
# written to disk from this frame here.
alum_email_df = pd.DataFrame(alum_emails)

print(f"✅ Saved individual emails to: {alum_email_folder}")


In [None]:
# Alphabetical list of every alum that appears in at least one Top 5.
all_alum_emails = sorted(top5_df["alum_email"].unique())

# One address per line.
for email in all_alum_emails:
    print(email)


In [None]:
# import os
# import base64
# from email.mime.text import MIMEText
# from google.oauth2.credentials import Credentials
# from google_auth_oauthlib.flow import InstalledAppFlow
# from googleapiclient.discovery import build

# # ================
# # SETUP GMAIL API
# # ================
# SCOPES = ["https://www.googleapis.com/auth/gmail.send"]
# CLIENT_SECRET_FILE = 
# TOKEN_FILE = "token.json"  # Saved token after first login

# creds = None
# if os.path.exists(TOKEN_FILE):
#     creds = Credentials.from_authorized_user_file(TOKEN_FILE, SCOPES)
# else:
#     flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, SCOPES)
#     creds = flow.run_local_server(port=0)
#     with open(TOKEN_FILE, "w") as token:
#         token.write(creds.to_json())

# service = build("gmail", "v1", credentials=creds)

# # ==========================
# # READ & SEND TXT EMAIL FILES
# # ==========================
# email_folder = os.path.join(os.path.expanduser("~"), "Downloads", "student_emails_txt")

# for filename in os.listdir(email_folder):
#     if filename.endswith(".txt"):
#         file_path = os.path.join(email_folder, filename)
#         with open(file_path, "r", encoding="utf-8") as f:
#             lines = f.read().splitlines()

#         # Parse To and Subject
#         to_line = lines[0].strip()
#         subject_line = lines[1].strip()
#         body = "\n".join(lines[3:])  # skip 'To:' and 'Subject:' and one blank line

#         to_email = to_line.replace("To:", "").strip()
#         subject = subject_line.replace("Subject:", "").strip()

#         # Build MIME message
#         message = MIMEText(body)
#         message["to"] = to_email
#         message["subject"] = subject
#         raw_message = base64.urlsafe_b64encode(message.as_bytes()).decode()

#         try:
#             send_result = service.users().messages().send(
#                 userId="me", body={"raw": raw_message}
#             ).execute()
#             print(f"✅ Sent to: {to_email}")
#         except Exception as e:
#             print(f"❌ Failed to send to: {to_email} → {str(e)}")


In [None]:
import os
import base64
from email.mime.text import MIMEText
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# ================
# SETUP GMAIL API
# ================
# Send-only scope: the token can send mail but not read the mailbox.
SCOPES = ["https://www.googleapis.com/auth/gmail.send"]
CLIENT_SECRET_FILE = r"C:\Users\Guill\Downloads\client_secret_803122994384-i9niq835sd64kmvm4uft57fe04qq49ff.apps.googleusercontent.com.json"
TOKEN_FILE = "token.json"

# Reuse a saved token when one exists; otherwise run the local OAuth flow
# once and save the resulting token for later runs.
creds = None
if os.path.exists(TOKEN_FILE):
    creds = Credentials.from_authorized_user_file(TOKEN_FILE, SCOPES)
else:
    flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, SCOPES)
    creds = flow.run_local_server(port=0)
    with open(TOKEN_FILE, "w") as token:
        token.write(creds.to_json())

service = build("gmail", "v1", credentials=creds)

# ==========================
# SEND FOLLOW-UP TO STUDENTS
# ==========================
# Recipients are taken from the "To:" header line of each saved student email.
email_folder = os.path.join(os.path.expanduser("~"), "Downloads", "student_emails_txt")

# The follow-up is identical for every recipient, so build it once.
subject = "Email confirmation?"
body = """Hey — Some people have told me they didnt get the last email with their matches. Im just checking if you got the alumni matches I sent earlier.

Can you reply “yes” if you got it, or “no” if you didn’t?

Thanks!"""

for filename in os.listdir(email_folder):
    if not filename.endswith(".txt"):
        continue

    # A dedicated name avoids overwriting the spreadsheet's global file_path.
    email_path = os.path.join(email_folder, filename)
    with open(email_path, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()
    if not lines:
        continue

    # First line contains the address, e.g. "To: student@email.com"
    to_line = lines[0].strip()
    if not to_line.lower().startswith("to:"):
        continue
    # Slice the prefix off instead of replace("To:", ""): it matches the
    # case-insensitive check above and only touches the start of the line.
    to_email = to_line[len("to:"):].strip()
    if not to_email:
        # Nothing to send to; skip instead of issuing a request that must fail.
        print(f"⚠️ Skipped {filename}: no recipient address")
        continue

    # Build MIME message and encode it the way the Gmail API expects
    message = MIMEText(body)
    message["to"] = to_email
    message["subject"] = subject
    raw_message = base64.urlsafe_b64encode(message.as_bytes()).decode()

    # One failed send is reported and does not stop the remaining emails.
    try:
        service.users().messages().send(
            userId="me", body={"raw": raw_message}
        ).execute()
        print(f"✅ Sent follow-up to: {to_email}")
    except Exception as e:
        print(f"❌ Failed to send to: {to_email} → {str(e)}")


In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Only the 300 highest-scoring pairs are drawn, to keep the picture readable.
top_matches = match_df.sort_values(by="match_score", ascending=False).head(300)

# Graph with one node per person and one weighted edge per match.
G = nx.Graph()

for _, row in top_matches.iterrows():
    # "S:" / "A:" prefixes keep student and alum node names distinct.
    student = f"S: {row['student_email']}"
    alum = f"A: {row['alum_email']}"
    score = row['match_score']

    G.add_nodes_from([(student, {"type": 'student'}), (alum, {"type": 'alum'})])
    G.add_edge(student, alum, weight=score)

# Draw graph
plt.figure(figsize=(14, 12))
pos = nx.spring_layout(G, k=0.25)  # Force-directed layout

# Students in blue, alumni in salmon; edge width follows the match score.
node_colors = []
for n in G.nodes():
    node_colors.append('skyblue' if n.startswith("S:") else 'salmon')
edges = G.edges(data=True)
weights = [attrs['weight'] / 10 for (_, _, attrs) in edges]  # Scale down for visual

nx.draw(
    G,
    pos,
    node_color=node_colors,
    edge_color='gray',
    width=weights,
    with_labels=False,
    alpha=0.8,
    node_size=100,
)
plt.title("Student-Alumni Match Network", fontsize=16)
plt.show()


In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import numpy as np
from matplotlib.patches import Patch

# -------------------------
# STEP 1: Top Matches Subset
# -------------------------
# Only the 300 highest-scoring pairs are drawn.
top_matches = match_df.sort_values(by="match_score", ascending=False).head(300)

# -------------------------
# STEP 2: Embed & Cluster Alumni
# -------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
# Distinct alumni in the subset, in order of first appearance; the profiles
# are looked up in that same order so embeddings and emails stay aligned.
top_alum_emails = top_matches["alum_email"].unique()
alum_profiles = alumni.set_index("email").loc[top_alum_emails]["profile"].tolist()
alum_embeddings = model.encode(alum_profiles)

# KMeans cannot form more clusters than there are samples, so cap the
# cluster count for small subsets (it stays 6 whenever 6+ alumni are present).
n_clusters = min(6, len(alum_profiles))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_ids = kmeans.fit_predict(alum_embeddings)

alum_cluster_map = dict(zip(top_alum_emails, cluster_ids))

# NOTE(review): these names are hand-assigned to cluster ids; nothing in this
# cell derives them from cluster contents - confirm they match the clusters.
cluster_labels = {
    0: "Finance / Data",
    1: "Marketing / Business",
    2: "Science / Health",
    3: "Engineering / Tech",
    4: "Education / Social",
    5: "Environment / Law"
}

# -------------------------
# STEP 3: Build Graph
# -------------------------
G = nx.Graph()

for _, row in top_matches.iterrows():
    # "S:" / "A:" prefixes keep student and alum node names distinct.
    student = f"S: {row['student_email']}"
    alum = f"A: {row['alum_email']}"
    score = row['match_score']

    G.add_node(student, type='student', cluster=None)
    G.add_node(alum, type='alum', cluster=alum_cluster_map[row['alum_email']])
    G.add_edge(student, alum, weight=score)

# -------------------------
# STEP 4: Draw Graph
# -------------------------
plt.figure(figsize=(14, 12))
pos = nx.spring_layout(G, k=0.25, seed=42)  # fixed seed: same layout every run

# Colors: students in black, alumni coloured by cluster from the Dark2 palette
node_colors = []
for n, attr in G.nodes(data=True):
    if attr['type'] == 'student':
        node_colors.append("#000000")  # Black
    else:
        cluster_id = attr['cluster']
        cluster_color = plt.cm.Dark2(cluster_id % 8)  # Dark2 has 8 colours
        node_colors.append(cluster_color)

# Edge weights – thinner
weights = [d['weight'] / 35 for (_, _, d) in G.edges(data=True)]

# Draw
nx.draw(
    G, pos, node_color=node_colors, edge_color='gray', width=weights,
    with_labels=False, node_size=120, alpha=0.9
)

# Legend: one entry for students plus one per cluster that was actually fitted
legend_elements = [
    Patch(facecolor="#000000", edgecolor='k', label='Student')
] + [
    Patch(facecolor=plt.cm.Dark2(i % 8), edgecolor='k', label=label)
    for i, label in cluster_labels.items()
    if i < n_clusters
]

plt.legend(handles=legend_elements, title="Node Type / Cluster", loc="upper left")
plt.title("Student-Alumni Match Network with Thematic Clusters", fontsize=16)
plt.axis("off")
plt.tight_layout()
plt.show()
