## **Importing Dependincies**

In [1]:
import os
import json
import torch
import random
from tqdm import tqdm
from transformers import BertModel, BertTokenizer
from pymilvus import MilvusClient, DataType

2024-12-28 13:46:40.224365: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-28 13:46:40.233685: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-28 13:46:40.245142: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-28 13:46:40.248610: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-28 13:46:40.258237: I tensorflow/core/platform/cpu_feature_guar

## **Configuration and Global Variables**

In [2]:
# Data paths
INPUT_DATA_PATH_STUDENTS = r"mock dataset generation/datasets/refined_final_dataset_student.json"
INPUT_DATA_PATH_MENTOR = r"mock dataset generation/datasets/refined_final_dataset_mentor.json"
OUTPUT_DATA_PATH_STUDENTS = r"mock dataset generation/datasets/enriched_dataset_student.json"
OUTPUT_DATA_PATH_MENTOR = r"mock dataset generation/datasets/enriched_dataset_mentor.json"

# Vector DB configs
STUDENT_COLLECTION = "student_collection"
MENTOR_COLLECTION = "mentor_collection"
VECTOR_DIM = 768
BERT_MODEL_NAME = "bert-base-uncased"

## **Loading Data**

In [3]:
with open(INPUT_DATA_PATH_STUDENTS, 'r') as student_file, open(INPUT_DATA_PATH_MENTOR, 'r') as mentor_file:
    student_data = json.load(student_file)
    mentor_data = json.load(mentor_file)

## **Feature Engineering**

#### **Helper functions**

In [4]:
def normalize_availability(availability):
    if 'weekend' in availability.lower() or 'evening' in availability.lower():
        return 'semi-flexible'
    elif 'daily' in availability.lower() or 'hours' in availability.lower():
        return 'flexible'
    else:
        return 'rigid'

#### **Parsing datasets and adding features**

In [5]:
for student_id, attributes in student_data.items():
    strengths = attributes.get("strengths", [])
    weaknesses = attributes.get("weaknesses", [])
    interests = attributes.get("interests", [])
    learning_style = attributes.get("learning_style", "")
    learning_challenges = attributes.get("learning_challenges", [])
    goals = attributes.get("goals", [])
    availability = attributes.get("avail", "")

    attributes['skill_level'] = 'advanced' if len(strengths) > len(weaknesses) else 'beginner'
    attributes['preferred_mentor_expertise'] = interests
    attributes['engagement_preference'] = 'group' if 'collaborative' in learning_style.lower() else 'one-on-one'
    attributes['challenge_index'] = len(learning_challenges)
    attributes['career_focus'] = 'STEM' if any(g in ['engineering', 'data science', 'math'] for g in goals) else 'Other'
    attributes['learning_time_flexibility'] = normalize_availability(availability)

In [6]:
for mentor_id, attributes in mentor_data.items():
    expertise = attributes.get("Expertise", [])
    weaknesses = attributes.get("Weaknesses", [])
    interests = attributes.get("Interests", [])
    teaching_style = attributes.get("Teaching Style", "")
    professional_goals = attributes.get("Professional Goals", [])
    availability = attributes.get("Availability", "")

    attributes['compatibility_index'] = 'high' if 'hands-on' in teaching_style.lower() else 'medium'
    attributes['mentorship_focus'] = 'career progression' if any(g in ['career', 'progress'] for g in professional_goals) else 'skill-building'
    attributes['subject_breadth'] = len(expertise)
    attributes['teaching_adaptability'] = 'high' if len(weaknesses) > 2 else 'low'
    attributes['session_availability_density'] = normalize_availability(availability)

#### **Saving updated files**

In [7]:
with open(OUTPUT_DATA_PATH_STUDENTS, 'w') as student_output, open(OUTPUT_DATA_PATH_MENTOR, 'w') as mentor_output:
    json.dump(student_data, student_output, indent=4)
    json.dump(mentor_data, mentor_output, indent=4)

print("Enriched datasets have been successfully generated.")

Enriched datasets have been successfully generated.


## **Establishing Vector Database**

#### **Helper functions**

In [8]:
def get_milvus_client():
    client = MilvusClient("vector_database.db")
    print("Connected to Milvus.")
    return client

In [9]:
def create_collection(client, collection_name):    
    schema = MilvusClient.create_schema(
        auto_id=False,
        enable_dynamic_field=True,
    )

    schema.add_field(field_name="id", datatype=DataType.INT64, is_nullable=True, is_primary=True)    
    schema.add_field(field_name="orig", datatype=DataType.VARCHAR, max_length=40)
    schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=VECTOR_DIM)

    index_params = client.prepare_index_params()
    
    index_params.add_index(
        field_name="vector", 
        index_type="AUTOINDEX",
        metric_type="COSINE"
    )
    
    # Create collection
    client.create_collection(
        collection_name=collection_name,
        schema=schema,
        index_params=index_params
    )
    print(f"Collection {collection_name} created with id and vector fields.")

In [10]:
def load_bert_encoder():
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
    model = BertModel.from_pretrained(BERT_MODEL_NAME)
    return tokenizer, model

In [11]:
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

In [12]:
def format_student_text(student_data):
    attributes = student_data.get('attributes', {})
    
    core_text = f"Strengths: {attributes.get('Strengths', [])} | " \
                f"Weaknesses: {attributes.get('Weaknesses', [])} | " \
                f"Interests: {attributes.get('Interests', [])} | " \
                f"Learning Style: {attributes.get('Learning Style', '')} | " \
                f"Learning Challenges: {attributes.get('Learning Challenges', [])} | " \
                f"Goals: {attributes.get('Goals', [])} | " \
                f"Availability: {attributes.get('Availability', '')}"
    
    additional_text = f"Skill Level: {student_data.get('skill_level', '')} | " \
                     f"Preferred Mentor Expertise: {student_data.get('preferred_mentor_expertise', [])} | " \
                     f"Engagement Preference: {student_data.get('engagement_preference', '')} | " \
                     f"Challenge Index: {student_data.get('challenge_index', '')} | " \
                     f"Career Focus: {student_data.get('career_focus', '')} | " \
                     f"Learning Time Flexibility: {student_data.get('learning_time_flexibility', '')}"
    
    return f"{core_text} | {additional_text}"

def format_mentor_text(mentor_data):
    core_text = f"Expertise: {mentor_data.get('Expertise', [])} | " \
                f"Weaknesses: {mentor_data.get('Weaknesses', [])} | " \
                f"Interests: {mentor_data.get('Interests', [])} | " \
                f"Teaching Style: {mentor_data.get('Teaching Style', '')} | " \
                f"Professional Goals: {mentor_data.get('Professional Goals', [])} | " \
                f"Availability: {mentor_data.get('Availability', '')}"
    
    additional_text = f"Compatibility Index: {mentor_data.get('compatibility_index', '')} | " \
                     f"Mentorship Focus: {mentor_data.get('mentorship_focus', '')} | " \
                     f"Subject Breadth: {mentor_data.get('subject_breadth', '')} | " \
                     f"Teaching Adaptability: {mentor_data.get('teaching_adaptability', '')} | " \
                     f"Session Availability Density: {mentor_data.get('session_availability_density', '')}"
    
    return f"{core_text} | {additional_text}"

In [13]:
def insert_data(client, collection_name, data, tokenizer, model):
    format_func = format_student_text if collection_name == STUDENT_COLLECTION else format_mentor_text
    
    entities = []
    for idx, (id_key, attributes) in enumerate(tqdm(data.items())):
        text = format_func(attributes)
        vector = encode_text(text, tokenizer, model)
        
        entities.append({
            "id":idx,
            "orig": str(id_key),
            "vector": vector.tolist(),
        })
    
    batch_size = 50
    for i in range(0, len(entities), batch_size):
        batch = entities[i:i + batch_size]
        client.insert(
            collection_name=collection_name,
            data=batch
        )
    
    print(f"Inserted {len(entities)} records into {collection_name}.")

In [14]:
def search_similar(client, collection_name, query_vector, top_k):
    results = client.search(
        collection_name=collection_name,
        data=[query_vector],
        output_fields=["orig"],
        limit=top_k,
        search_params={"metric_type":"COSINE"}
    )
    
    print(f"Search results from {collection_name}:")
    for hits in results:
        for hit in hits:
            print(hit)

#### **Inserting data to DB**

In [15]:
client = get_milvus_client()

create_collection(client, STUDENT_COLLECTION)
create_collection(client, MENTOR_COLLECTION)

with open(OUTPUT_DATA_PATH_STUDENTS, "r") as student_file:
    student_data = json.load(student_file)
with open(OUTPUT_DATA_PATH_MENTOR, "r") as mentor_file:
    mentor_data = json.load(mentor_file)

tokenizer, model = load_bert_encoder()

insert_data(client, STUDENT_COLLECTION, student_data, tokenizer, model)
insert_data(client, MENTOR_COLLECTION, mentor_data, tokenizer, model)

Connected to Milvus.
Collection student_collection created with id and vector fields.
Collection mentor_collection created with id and vector fields.


100%|██████████| 300/300 [00:34<00:00,  8.76it/s]


Inserted 300 records into student_collection.


100%|██████████| 300/300 [00:33<00:00,  9.04it/s]


Inserted 300 records into mentor_collection.


## **Evaluation and Testing**

#### **Helper functions**

In [16]:
def get_random_student_vector(student_data, tokenizer, model):
    random_student_id = random.choice(list(student_data.keys()))
    student_full_data = student_data[random_student_id]
    student_attributes = student_full_data['attributes']
    student_text = format_student_text(student_full_data)
    student_vector = encode_text(student_text, tokenizer, model)
    return random_student_id, student_vector, student_full_data

#### **TEST 1: Random student**

In [17]:
def test_basic_matching(top_k):
    random_student_id, student_vector, student_full_data = get_random_student_vector(student_data, tokenizer, model)
    
    print("Selected Student Details:")
    print(f"Student ID: {random_student_id}")
    print("\nCore Attributes:")
    for key, value in student_full_data['attributes'].items():
        print(f"{key}: {value}")
    
    print("\nAdditional Information:")
    additional_fields = ['skill_level', 'engagement_preference', 'challenge_index', 
                        'career_focus', 'learning_time_flexibility']
    for field in additional_fields:
        print(f"{field}: {student_full_data.get(field, 'Not specified')}")
    
    print("\nFinding matching mentors...")
    results = search_similar(client, MENTOR_COLLECTION, student_vector, top_k=top_k)
    return results

test_basic_matching(top_k=3)

Selected Student Details:
Student ID: student_178

Core Attributes:
Strengths: ['Good at teamwork', 'Strong in languages', 'Creative']
Weaknesses: ['Struggles with math', 'Weak in science']
Interests: ['Languages', 'Art', 'Travel']
Learning Style: Auditory
Learning Challenges: ['Math anxiety']
Goals: ['Learn a new language', 'Improve math skills']
Availability: 1 hour daily

Additional Information:
skill_level: beginner
engagement_preference: one-on-one
challenge_index: 0
career_focus: Other
learning_time_flexibility: rigid

Finding matching mentors...
Search results from mentor_collection:
{'id': 294, 'distance': 0.9691210985183716, 'entity': {'orig': 'mentor_295'}}
{'id': 34, 'distance': 0.9690457582473755, 'entity': {'orig': 'mentor_35'}}
{'id': 59, 'distance': 0.9687049388885498, 'entity': {'orig': 'mentor_60'}}


#### **TEST 2: Challenged Student**

In [18]:
def test_challenge_focused_matching(top_k):
    challenged_students = {
        k: v for k, v in student_data.items() 
        if len(v.get('attributes', {}).get('Learning Challenges', [])) > 0 and 
        v.get('challenge_index', 0) > 0
    }
    
    if not challenged_students:
        print("No students found with learning challenges.")
        return
    
    random_student_id = random.choice(list(challenged_students.keys()))
    student_full_data = challenged_students[random_student_id]
    student_text = format_student_text(student_full_data)
    student_vector = encode_text(student_text, tokenizer, model)
    
    print("\nSelected Student with Learning Challenges:")
    print(f"Student ID: {random_student_id}")
    print("\nLearning Profile:")
    print(f"Learning Challenges: {student_full_data['attributes']['Learning Challenges']}")
    print(f"Learning Style: {student_full_data['attributes']['Learning Style']}")
    print(f"Challenge Index: {student_full_data.get('challenge_index', 'Not specified')}")
    print(f"Skill Level: {student_full_data.get('skill_level', 'Not specified')}")
    
    print("\nFinding mentors with matching teaching adaptability...")
    return search_similar(client, MENTOR_COLLECTION, student_vector, top_k=top_k)

test_challenge_focused_matching(top_k=3)

No students found with learning challenges.


#### **TEST 3: Based on _Career Focus_**

In [19]:
def test_career_focused_matching(top_k):
    student_id = random.choice(list(student_data.keys()))
    student_full_data = student_data[student_id]
    student_text = format_student_text(student_full_data)
    student_vector = encode_text(student_text, tokenizer, model)
    
    print("\nSelected Student Career Profile:")
    print(f"Student ID: {student_id}")
    print(f"Career Focus: {student_full_data.get('career_focus', 'Not specified')}")
    print(f"Goals: {student_full_data['attributes'].get('Goals', [])}")
    print(f"Skill Level: {student_full_data.get('skill_level', 'Not specified')}")
    print(f"Preferred Mentor Expertise: {student_full_data.get('preferred_mentor_expertise', [])}")
    
    print("\nFinding mentors with relevant expertise and career alignment...")
    return search_similar(client, MENTOR_COLLECTION, student_vector, top_k=top_k)

test_career_focused_matching(top_k=3)


Selected Student Career Profile:
Student ID: student_85
Career Focus: Other
Goals: ['Win a debating competition', 'Improve math skills']
Skill Level: beginner
Preferred Mentor Expertise: []

Finding mentors with relevant expertise and career alignment...
Search results from mentor_collection:
{'id': 59, 'distance': 0.9665457010269165, 'entity': {'orig': 'mentor_60'}}
{'id': 64, 'distance': 0.9654986262321472, 'entity': {'orig': 'mentor_65'}}
{'id': 219, 'distance': 0.9652574062347412, 'entity': {'orig': 'mentor_220'}}
