## **Importing Dependincies**

In [1]:
import os
import json
import torch
import random
from tqdm import tqdm
from transformers import BertModel, BertTokenizer
from pymilvus import MilvusClient

2024-12-27 20:26:24.161359: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-27 20:26:24.170995: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-27 20:26:24.182218: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-27 20:26:24.185616: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-27 20:26:24.194599: I tensorflow/core/platform/cpu_feature_guar

## **Configuration and Global Variables**

In [2]:
# Data paths
INPUT_DATA_PATH_STUDENTS = r"mock dataset generation/datasets/refined_final_dataset_student.json"
INPUT_DATA_PATH_MENTOR = r"mock dataset generation/datasets/refined_final_dataset_mentor.json"
OUTPUT_DATA_PATH_STUDENTS = r"mock dataset generation/datasets/enriched_dataset_student.json"
OUTPUT_DATA_PATH_MENTOR = r"mock dataset generation/datasets/enriched_dataset_mentor.json"

# Vector DB configs
STUDENT_COLLECTION = "student_collection"
MENTOR_COLLECTION = "mentor_collection"
VECTOR_DIM = 512
BERT_MODEL_NAME = "bert-base-uncased"

## **Loading Data**

In [3]:
with open(INPUT_DATA_PATH_STUDENTS, 'r') as student_file, open(INPUT_DATA_PATH_MENTOR, 'r') as mentor_file:
    student_data = json.load(student_file)
    mentor_data = json.load(mentor_file)

## **Feature Engineering**

#### **Helper functions**

In [4]:
def normalize_availability(availability):
    if 'weekend' in availability.lower() or 'evening' in availability.lower():
        return 'semi-flexible'
    elif 'daily' in availability.lower() or 'hours' in availability.lower():
        return 'flexible'
    else:
        return 'rigid'

#### **Parsing datasets and adding features**

In [5]:
for student_id, attributes in student_data.items():
    strengths = attributes.get("strengths", [])
    weaknesses = attributes.get("weaknesses", [])
    interests = attributes.get("interests", [])
    learning_style = attributes.get("learning_style", "")
    learning_challenges = attributes.get("learning_challenges", [])
    goals = attributes.get("goals", [])
    availability = attributes.get("avail", "")

    attributes['skill_level'] = 'advanced' if len(strengths) > len(weaknesses) else 'beginner'
    attributes['preferred_mentor_expertise'] = interests
    attributes['engagement_preference'] = 'group' if 'collaborative' in learning_style.lower() else 'one-on-one'
    attributes['challenge_index'] = len(learning_challenges)
    attributes['career_focus'] = 'STEM' if any(g in ['engineering', 'data science', 'math'] for g in goals) else 'Other'
    attributes['learning_time_flexibility'] = normalize_availability(availability)

In [6]:
for mentor_id, attributes in mentor_data.items():
    expertise = attributes.get("Expertise", [])
    weaknesses = attributes.get("Weaknesses", [])
    interests = attributes.get("Interests", [])
    teaching_style = attributes.get("Teaching Style", "")
    professional_goals = attributes.get("Professional Goals", [])
    availability = attributes.get("Availability", "")

    attributes['compatibility_index'] = 'high' if 'hands-on' in teaching_style.lower() else 'medium'
    attributes['mentorship_focus'] = 'career progression' if any(g in ['career', 'progress'] for g in professional_goals) else 'skill-building'
    attributes['subject_breadth'] = len(expertise)
    attributes['teaching_adaptability'] = 'high' if len(weaknesses) > 2 else 'low'
    attributes['session_availability_density'] = normalize_availability(availability)

#### **Saving updated files**

In [7]:
with open(OUTPUT_DATA_PATH_STUDENTS, 'w') as student_output, open(OUTPUT_DATA_PATH_MENTOR, 'w') as mentor_output:
    json.dump(student_data, student_output, indent=4)
    json.dump(mentor_data, mentor_output, indent=4)

print("Enriched datasets have been successfully generated.")

Enriched datasets have been successfully generated.


## **Establishing Vector Database**

#### **Helper functions**

In [8]:
def get_milvus_client():
    client = MilvusClient("vector_database.db")
    print("Connected to Milvus.")
    return client

In [9]:
def create_collection(client, collection_name):
    # CHECK
    if collection_name in client.list_collections():
        print(f"Collection {collection_name} already exists.")
        return

    client.create_collection(
        collection_name=collection_name,
        dimension=VECTOR_DIM,
        primary_field_name="id",
        vector_field_name="vector",
        other_fields=[
            {"name": "original_id", "type": "varchar", "max_length": 100}
        ]
    )
    print(f"Collection {collection_name} created.")

In [10]:
def load_bert_encoder():
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
    model = BertModel.from_pretrained(BERT_MODEL_NAME)
    return tokenizer, model

In [11]:
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

In [12]:
def format_student_text(student_data):
    attributes = student_data.get('attributes', {})
    
    core_text = f"Strengths: {attributes.get('Strengths', [])} | " \
                f"Weaknesses: {attributes.get('Weaknesses', [])} | " \
                f"Interests: {attributes.get('Interests', [])} | " \
                f"Learning Style: {attributes.get('Learning Style', '')} | " \
                f"Learning Challenges: {attributes.get('Learning Challenges', [])} | " \
                f"Goals: {attributes.get('Goals', [])} | " \
                f"Availability: {attributes.get('Availability', '')}"
    
    additional_text = f"Skill Level: {student_data.get('skill_level', '')} | " \
                     f"Preferred Mentor Expertise: {student_data.get('preferred_mentor_expertise', [])} | " \
                     f"Engagement Preference: {student_data.get('engagement_preference', '')} | " \
                     f"Challenge Index: {student_data.get('challenge_index', '')} | " \
                     f"Career Focus: {student_data.get('career_focus', '')} | " \
                     f"Learning Time Flexibility: {student_data.get('learning_time_flexibility', '')}"
    
    return f"{core_text} | {additional_text}"

def format_mentor_text(mentor_data):
    core_text = f"Expertise: {mentor_data.get('Expertise', [])} | " \
                f"Weaknesses: {mentor_data.get('Weaknesses', [])} | " \
                f"Interests: {mentor_data.get('Interests', [])} | " \
                f"Teaching Style: {mentor_data.get('Teaching Style', '')} | " \
                f"Professional Goals: {mentor_data.get('Professional Goals', [])} | " \
                f"Availability: {mentor_data.get('Availability', '')}"
    
    additional_text = f"Compatibility Index: {mentor_data.get('compatibility_index', '')} | " \
                     f"Mentorship Focus: {mentor_data.get('mentorship_focus', '')} | " \
                     f"Subject Breadth: {mentor_data.get('subject_breadth', '')} | " \
                     f"Teaching Adaptability: {mentor_data.get('teaching_adaptability', '')} | " \
                     f"Session Availability Density: {mentor_data.get('session_availability_density', '')}"
    
    return f"{core_text} | {additional_text}"

In [13]:
def insert_data(client, collection_name, data, tokenizer, model):
    format_func = format_student_text if collection_name == STUDENT_COLLECTION else format_mentor_text
    
    entities = []
    for idx, (id_key, attributes) in enumerate(tqdm(data.items())):
        text = format_func(attributes)
        vector = encode_text(text, tokenizer, model)
        
        entities.append({
            "id": idx,
            "vector": vector.tolist(),
            "original_id": id_key
        })
    
    batch_size = 50
    for i in range(0, len(entities), batch_size):
        batch = entities[i:i + batch_size]
        client.insert(
            collection_name=collection_name,
            data=batch
        )
    
    print(f"Inserted {len(entities)} records into {collection_name}.")

In [14]:
def search_similar(client, collection_name, query_vector, top_k=5):
    results = client.search(
        collection_name=collection_name,
        data=[query_vector.tolist()],  # MilvusClient requires list format
        output_fields=["original_id"],
        limit=top_k,
        search_params={
            "metric_type": "IP",
            "params": {"nprobe": 10}
        }
    )
    
    print(f"Search results from {collection_name}:")
    for hit in results[0]:
        print(f"Original ID: {hit.entity.get('original_id')}, Similarity: {hit.score}")

#### **Inserting data to DB**

In [15]:
client = get_milvus_client()

create_collection(client, STUDENT_COLLECTION)
create_collection(client, MENTOR_COLLECTION)

with open(OUTPUT_DATA_PATH_STUDENTS, "r") as student_file:
    student_data = json.load(student_file)
with open(OUTPUT_DATA_PATH_MENTOR, "r") as mentor_file:
    mentor_data = json.load(mentor_file)

tokenizer, model = load_bert_encoder()

insert_data(client, STUDENT_COLLECTION, student_data, tokenizer, model)
insert_data(client, MENTOR_COLLECTION, mentor_data, tokenizer, model)

Connected to Milvus.
Collection student_collection already exists.
Collection mentor_collection already exists.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/bert-base-uncased/68d45e234eb4a928074dfd868cead0219ab85354cc53d20e772753c6bb9169d3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1735568963&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNTU2ODk2M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9iZXJ0LWJhc2UtdW5jYXNlZC82OGQ0NWUyMzRlYjRhOTI4MDc0ZGZkODY4Y2VhZDAyMTlhYjg1MzU0Y2M1M2QyMGU3NzI3NTNjNmJiOTE2OWQzP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=dlO7J7iEFzpZNuiRrFYOCgYvb1nyxpFikXoEcab-LRFDjq72KwmaVHeey6OyINaiLKA9-m6cB3jvw9L4z67jJVfHLurYxSt0f-05fOEMAbhSwKelIR2qCd12cFJ5NbstNM0T-zV9OC3xyonTkygjeMHfWZgvHhCN3pJDdCfiqHmQXDxslJVdkEPfnxjKq3%7E0tu6CQOqSbgVxhnpfXgcphAKe64fVwjb5dSuHLNyE9FiMGMxQKc0HFZl8vc54Na9UDsWYkpr56TpdAfFR9ROWozYlxOKUWIZFNZyv%7EEGBJD0zSWl1nnrRgnrImmruJJ2tgc0mpJgAGQ7G0YmLl4OboA__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnectionPool(host='cdn-l

model.safetensors:  14%|#4        | 62.9M/440M [00:00<?, ?B/s]

100%|██████████| 300/300 [00:08<00:00, 34.01it/s]


Inserted 300 records into student_collection.


100%|██████████| 300/300 [00:27<00:00, 10.72it/s]


Inserted 300 records into mentor_collection.


## **Evaluation and Testing**

#### **Helper functions**

In [None]:
def get_random_student_vector(student_data, tokenizer, model):
    random_student_id = random.choice(list(student_data.keys()))
    student_full_data = student_data[random_student_id]
    student_attributes = student_full_data['attributes']
    student_text = format_student_text(student_full_data)
    student_vector = encode_text(student_text, tokenizer, model)
    return random_student_id, student_vector, student_full_data

#### **TEST 1: Random student**

In [None]:
def test_basic_matching(top_k):
    random_student_id, student_vector, student_full_data = get_random_student_vector(student_data, tokenizer, model)
    
    print("Selected Student Details:")
    print(f"Student ID: {random_student_id}")
    print("\nCore Attributes:")
    for key, value in student_full_data['attributes'].items():
        print(f"{key}: {value}")
    
    print("\nAdditional Information:")
    additional_fields = ['skill_level', 'engagement_preference', 'challenge_index', 
                        'career_focus', 'learning_time_flexibility']
    for field in additional_fields:
        print(f"{field}: {student_full_data.get(field, 'Not specified')}")
    
    print("\nFinding matching mentors...")
    results = search_similar(client, MENTOR_COLLECTION, student_vector, top_k=top_k)
    return results

test_basic_matching(top_k=3)

2024-12-27 20:41:28,981 [ERROR][handler]: RPC error: [search], <MilvusException: (code=1100, message=fail to search: metric type not match: invalid [expected=COSINE][actual=IP]: invalid parameter)>, <Time:{'RPC start': '2024-12-27 20:41:28.978410', 'RPC error': '2024-12-27 20:41:28.981024'}> (decorators.py:140)
2024-12-27 20:41:28,981 [ERROR][search]: Failed to search collection: mentor_collection (milvus_client.py:412)


Total number of students: 300
Sample of keys: ['student_1', 'student_2', 'student_3']
Sample of first student data: {'attributes': {'Strengths': ['Strong analytical skills', 'Good grasp of mathematical concepts', 'Quick at solving puzzles'], 'Weaknesses': ['Finds grammar rules difficult to remember', 'Struggles with word problems'], 'Interests': ['Astronomy', 'Video games', 'Space exploration'], 'Learning Style': 'Visual', 'Learning Challenges': ['Short attention span'], 'Goals': ['Become a software engineer', 'Learn coding'], 'Availability': '2 hours daily on weekdays'}, 'study_plan': {'Introduction': ['The student has not provided specific information regarding their strengths, weaknesses, interests, learning style, learning challenges, goals, or availability. Therefore, the study plan will be generalized to cover a broad range of potential areas of interest and development.'], 'Workshops': [{'name': 'Time Management and Study Skills', 'description': 'A workshop designed to help stud

#### **TEST 2: Challenged Student**

In [None]:
def test_challenge_focused_matching(top_k):
    challenged_students = {
        k: v for k, v in student_data.items() 
        if len(v.get('attributes', {}).get('Learning Challenges', [])) > 0 and 
        v.get('challenge_index', 0) > 0
    }
    
    if not challenged_students:
        print("No students found with learning challenges.")
        return
    
    random_student_id = random.choice(list(challenged_students.keys()))
    student_full_data = challenged_students[random_student_id]
    student_text = format_student_text(student_full_data)
    student_vector = encode_text(student_text, tokenizer, model)
    
    print("\nSelected Student with Learning Challenges:")
    print(f"Student ID: {random_student_id}")
    print("\nLearning Profile:")
    print(f"Learning Challenges: {student_full_data['attributes']['Learning Challenges']}")
    print(f"Learning Style: {student_full_data['attributes']['Learning Style']}")
    print(f"Challenge Index: {student_full_data.get('challenge_index', 'Not specified')}")
    print(f"Skill Level: {student_full_data.get('skill_level', 'Not specified')}")
    
    print("\nFinding mentors with matching teaching adaptability...")
    return search_similar(client, MENTOR_COLLECTION, student_vector, top_k=top_k)

test_challenge_focused_matching(top_k=3)

#### **TEST 3: Based on _Career Focus_**

In [None]:
def test_career_focused_matching(top_k):
    student_id = random.choice(list(student_data.keys()))
    student_full_data = student_data[student_id]
    student_text = format_student_text(student_full_data)
    student_vector = encode_text(student_text, tokenizer, model)
    
    print("\nSelected Student Career Profile:")
    print(f"Student ID: {student_id}")
    print(f"Career Focus: {student_full_data.get('career_focus', 'Not specified')}")
    print(f"Goals: {student_full_data['attributes'].get('Goals', [])}")
    print(f"Skill Level: {student_full_data.get('skill_level', 'Not specified')}")
    print(f"Preferred Mentor Expertise: {student_full_data.get('preferred_mentor_expertise', [])}")
    
    print("\nFinding mentors with relevant expertise and career alignment...")
    return search_similar(client, MENTOR_COLLECTION, student_vector, top_k=top_k)

test_career_focused_matching(top_k=3)