<a href="https://colab.research.google.com/github/e1y4d/topera/blob/main/topera_last_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence_transformers
!pip install annoy

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transform

In [2]:
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sentence_transformers import SentenceTransformer, util
from annoy import AnnoyIndex
from google.colab import drive

  from tqdm.autonotebook import tqdm, trange


In [3]:
##--uncomment if not using colab
''' with open('user_response.json', 'r') as json_file:
    jsondata = json.load(json_file)
    print("Original data:", jsondata) '''

##--comment if not using colab
drive.mount('/content/drive')
with open('/content/drive/MyDrive/user_response.json', 'r') as json_file:
    jsondata = json.load(json_file)

Mounted at /content/drive


In [9]:
le = LabelEncoder()
data = pd.DataFrame(jsondata).dropna().copy()
global recommendations, k

In [10]:
def recommend_collaborators(user_id, k=5):

    recommendations = {}
    if user_id not in data['UserId'].tolist():
        print(f"User with ID {user_id} not found in data.")
        return recommendations  # Empty dictionary

    user_technology = data[data["UserId"] == user_id][["TechnologyOfInterest"]]
    for technology in user_technology["TechnologyOfInterest"].explode().unique():
        filtered_users = data[data['TechnologyLearned'] == technology]
        filtered_users = filtered_users[filtered_users['UserId'] != user_id]

        if len(filtered_users) == 0:
            recommendations[technology] = "No users found for this technology"
            continue

        filtered = filtered_users.drop(columns=['TechnologyLearned', 'TechnologyOfInterest']).copy()
        filtered['UserId'] = filtered['UserId'].astype(str)  # Ensure string user IDs
        encoded_data = filtered.drop(columns=['UserId'])
        encoded_data = encoded_data.apply(le.fit_transform, axis=0)
        similarity_matrix = euclidean_distances(encoded_data)
        similar_df = pd.DataFrame({'UserId': filtered['UserId'], 'Score': similarity_matrix[0]})
        similar_df = similar_df.sort_values(by='Score', ascending=True)
        similar_df = similar_df['UserId'].tolist()  # Select only User IDs
        recommendations.setdefault(technology, similar_df)[:k] = similar_df[:k]

    return recommendations

# Example usage
recommend_collaborators('e17bda85-98e1-4dc5-a7e5-3fd790116fcf')

{'React': ['6a42e8d3-3e26-41b7-9a5c-0a1f1f2a5ec0',
  'f63b22fc-2de8-41a1-a45a-c34da9e547a5']}

In [25]:
print(data.head())

  PreferredLanguage  TrackLearned TechnologyLearned    TrackLevel  \
0           English    Full-stack         .NET Core  Intermediate   
1            Arabic      Frontend             React      Advanced   
2           English  Data Science            Python      Beginner   
3           English       Backend         .NET Core  Intermediate   
4           English    Full-stack           Angular      Advanced   

  ReferralSource EmploymentStatus BasicProgrammingLanguagesKnowledge  \
0   Social media         Employed                     C#, JavaScript   
1         Friend       Unemployed              HTML, CSS, JavaScript   
2  Search engine          Student                             Python   
3   Social media         Employed                            C#, SQL   
4         Friend         Employed  JavaScript, TypeScript, HTML, CSS   

  ProficientProgrammingLanguages PreferredLearningStyle LearningFrequency  \
0                             C#                 Videos             Daily  

In [27]:
#method 2.
#data['TechnologyOfInterest'] = data['TechnologyOfInterest'].apply(lambda x: x.split())

# --- Sentence Embedding ---
model = SentenceTransformer('all-mpnet-base-v2')
all_technologies = set(t for techs in data['TechnologyOfInterest'] for t in techs)
technology_embeddings = model.encode(list(all_technologies))

dim = technology_embeddings.shape[1]
ann_index = AnnoyIndex(dim, 'angular')
for i, embedding in enumerate(technology_embeddings):
    ann_index.add_item(i, embedding)
ann_index.build(10)

# --- Recommendation Function ---
def recommend_collaborators(user_id, data, k=5):
    if user_id not in data['UserId'].tolist():
        print(f"User with ID {user_id} not found in data.")
        return {}

    user_technologies = data[data["UserId"] == user_id]["TechnologyOfInterest"].tolist()[0]
    other_users = data[data['UserId'] != user_id]

    recommendations = {}
    for technology in user_technologies:
        user_embedding = model.encode([technology])
        potential_match_indices, distances = ann_index.get_nns_by_vector(user_embedding[0], 10, include_distances=True)
        potential_matches = other_users.iloc[potential_match_indices]
        other_user_embeddings = model.encode(potential_matches['TechnologyOfInterest'].apply(lambda x: ' '.join(x)).tolist())
        similarities = util.cos_sim(user_embedding, other_user_embeddings)[0].tolist()
        similar_df = pd.DataFrame({'UserId': potential_matches['UserId'], 'Score': similarities}).sort_values(by='Score', ascending=False)
        top_users = similar_df['UserId'].tolist()[:k]
        recommendations[technology] = top_users if top_users else ["No users found for this technology"]

    return recommendations

# --- Example Usage ---
recommend_collaborators('e17bda85-98e1-4dc5-a7e5-3fd790116fcf', data)




{'React': ['ad70ff26-d4d2-4e46-a789-305fc4f6b647',
  'e46d51bf-3680-4e38-9ee9-9721787317b4',
  'd72c4a24-5f9c-49d0-b8cb-d993cb47ef8b',
  '2e84d2b1-2317-4b9b-a4d1-1ef47770f4dc',
  '6a42e8d3-3e26-41b7-9a5c-0a1f1f2a5ec0']}