In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Generating 1000 user IDs
user_ids = [f'U{i}' for i in range(1, 1001)]

In [3]:
data = pd.read_csv('../backend/Skillset.csv')
skills = data['skills'].tolist()

In [4]:
data.head()

Unnamed: 0,skills
0,Python
1,Machine Learning
2,Data Analysis
3,Statistical Modeling
4,Data Visualization


In [5]:
# Remove duplicates (if any)
skills = list(set(skills))

In [6]:
# Initialize DataFrame with 0s
data = pd.DataFrame(0, index=user_ids, columns=skills)

In [7]:
# Assign each user between 10 and 30 random skills to increase overlap
np.random.seed(0)  # For reproducibility
for user_id in user_ids:
    num_skills = np.random.randint(10, 50)
    assigned_skills = np.random.choice(skills, num_skills, replace=False)
    data.loc[user_id, assigned_skills] = 1

In [8]:
# Function to get skills assigned to a specific user
def get_user_skills(user_id):
    assigned_skills = data.columns[data.loc[user_id] == 1].tolist()
    return assigned_skills

In [9]:
# Compute cosine similarity
cosine_sim = cosine_similarity(data)
cosine_sim_df = pd.DataFrame(cosine_sim, index=user_ids, columns=user_ids)

In [10]:
# Create a matrix showing the similarity percentage between each user pair
similarity_percentage_matrix = cosine_sim_df * 100

In [11]:
# Convert the matrix to a DataFrame for better readability
similarity_percentage_df = pd.DataFrame(similarity_percentage_matrix, index=user_ids, columns=user_ids)


In [12]:
# Save DataFrames to pickle files
with open('current_skills.pkl', 'wb') as f:
    pickle.dump(data, f)

with open('similarity_scores.pkl', 'wb') as f:
    pickle.dump(similarity_percentage_df, f)

In [13]:
# Function to get similar users with unique scores for a specific user
def get_unique_similar_users(user_id, threshold=20.0):
    similarity_percentages = similarity_percentage_df[user_id]
    similar_users = similarity_percentages[similarity_percentages >= threshold].to_dict()
    if user_id in similar_users:
        del similar_users[user_id]  # Remove the user itself from the dictionary
    return {user_id: similar_users}

In [14]:
# Function to recommend skills to a user based on users with at least a specified similarity threshold
def recommend_skills(user_id, similarity_threshold=20.0):
    similar_users_with_scores = get_unique_similar_users(user_id, similarity_threshold)
    
    # Print similar users with scores
    print(f"Similar users with scores for user {user_id}: {similar_users_with_scores}")

    user_skills = set(get_user_skills(user_id))
    print(f"Current skills of user {user_id}: {user_skills}")

    recommended_skills = set()
    user_scores = {}

    for similar_user, scores in similar_users_with_scores[user_id].items():
        similar_user_skills = set(get_user_skills(similar_user))
        new_skills = similar_user_skills - user_skills
        print(f"New skills from user {similar_user} to recommend: {new_skills}")
        recommended_skills.update(new_skills)
        # Store the score for each similar user
        user_scores[similar_user] = scores

    return recommended_skills, user_scores

In [15]:
# Example: Recommend skills for user U300
recommended_skills, user_scores = recommend_skills('U104', similarity_threshold=21.0)
print(f"Recommended skills for user : {recommended_skills}")
print(f"Scores of similar users : {user_scores}")

Similar users with scores for user U104: {'U104': {'U36': 23.159177829329273, 'U58': 21.053798026662975, 'U292': 21.290467263536573, 'U366': 23.159177829329273, 'U955': 21.053798026662975}}
Current skills of user U104: {'AWS', 'Succession Planning', 'Onboarding', 'Go-to-Market Strategy', 'Data Aggregation', 'Power BI', 'CCPA', 'User Research', 'CQRS', 'Benchmarking', 'Tableau', 'Qualitative Research', 'Data Transformation', 'Artificial Intelligence', 'Data Cleaning', 'Backlog Refinement', 'Gantt Charts', 'Data Sharing', 'Reporting', 'Agricultural Analytics', 'Social Media Analytics', 'Recommender Algorithms', 'Hybrid Filtering', 'Cybersecurity', 'Operations Research', 'Notebooks', 'Data Augmentation', 'Root Cause Analysis', 'SQL', 'Product Management', 'Data Integrity', 'CI/CD', 'Ethical Business Practices', 'Metrics', 'Workplace Culture', 'Compliance Management', 'SWOT Analysis', 'UI Design', 'Fraud Detection', 'Cost-Benefit Analysis', 'Database Sharding', 'Collaborations', 'R Program

In [16]:
pickle.load(open('current_skills.pkl','rb'))

Unnamed: 0,Data Integration,Scalability,HIPAA,Onboarding,Research and Development,Knowledge Base,Recommendation Systems,Risk Register,Machine Learning,User Stories,...,Knowledge Graphs,Supply Chain Analytics,Design of Experiments,Customer Segmentation,Trend Analysis,Project Scope,Media Relations,Talent Management,Data Synthesis,Alerting
U1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
U2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
U5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
U997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
U998,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
U999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
pickle.load(open('similarity_scores.pkl','rb'))

Unnamed: 0,U1,U2,U3,U4,U5,U6,U7,U8,U9,U10,...,U991,U992,U993,U994,U995,U996,U997,U998,U999,U1000
U1,100.000000,0.000000,0.000000,5.063697,8.451543,0.000000,0.000000,0.000000,5.345225,9.225312,...,0.000000,12.649111,6.324555,5.129892,0.000000,13.187609,16.035675,6.593805,0.000000,5.270463
U2,0.000000,100.000000,11.322770,18.490007,11.572751,0.000000,10.825318,17.407766,2.439750,8.421519,...,9.622504,11.547005,2.886751,9.365858,0.000000,6.019293,19.518001,3.009646,8.703883,9.622504
U3,0.000000,11.322770,100.000000,9.421114,0.000000,4.902903,0.000000,5.913124,6.629935,2.860648,...,9.805807,3.922323,11.766968,9.544271,8.770580,4.089304,3.314968,4.089304,11.826248,9.805807
U4,5.063697,18.490007,9.421114,100.000000,4.279605,4.003204,0.000000,14.484136,5.413320,4.671418,...,8.006408,3.202563,6.405126,2.597622,7.161149,10.016708,2.706660,13.355611,0.000000,16.012815
U5,8.451543,11.572751,0.000000,4.279605,100.000000,6.681531,0.000000,0.000000,0.000000,3.898406,...,13.363062,5.345225,10.690450,0.000000,0.000000,0.000000,4.517540,5.572782,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U996,13.187609,6.019293,4.089304,10.016708,0.000000,0.000000,10.425721,0.000000,3.524537,3.041495,...,6.950480,8.340577,8.340577,3.382550,0.000000,100.000000,7.049074,4.347826,0.000000,3.475240
U997,16.035675,19.518001,3.314968,2.706660,4.517540,0.000000,0.000000,5.096472,8.571429,7.396705,...,2.817181,10.141851,0.000000,24.678382,3.779645,7.049074,100.000000,10.573611,5.096472,5.634362
U998,6.593805,3.009646,4.089304,13.355611,5.572782,0.000000,10.425721,0.000000,7.049074,0.000000,...,0.000000,0.000000,8.340577,3.382550,0.000000,4.347826,10.573611,100.000000,12.573892,6.950480
U999,0.000000,8.703883,11.826248,0.000000,0.000000,7.537784,0.000000,0.000000,0.000000,8.795990,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.096472,12.573892,100.000000,0.000000


In [18]:
# Function to add a new user and recommend skills based on existing users
def add_new_user(new_user_id, current_skills, similarity_threshold=90.0):
    # Create a row for the new user in the existing DataFrame with current skills
    new_user_row = pd.Series(0, index=data.columns)
    for skill in current_skills:
        if skill in new_user_row.index:
            new_user_row[skill] = 1
    data.loc[new_user_id] = new_user_row

    # Recompute cosine similarity with the new user added
    cosine_sim = cosine_similarity(data)
    cosine_sim_df = pd.DataFrame(cosine_sim, index=data.index, columns=data.index)
    similarity_percentage_matrix = cosine_sim_df * 100
    similarity_percentage_df = pd.DataFrame(similarity_percentage_matrix, index=data.index, columns=data.index)
    
    # Save updated DataFrames to pickle files
    with open('current_skills.pkl', 'wb') as f:
        pickle.dump(data, f)
    
    with open('similarity_scores.pkl', 'wb') as f:
        pickle.dump(similarity_percentage_df, f)

    # Recommend skills for the new user
    recommended_skills, user_scores = recommend_skills(new_user_id, similarity_threshold)
    return recommended_skills, user_scores


In [19]:
# Example: Adding a new user U1001 with some skills
new_user_id = 'U104'
new_user_skills = ['Python', 'Data Visualization']
recommended_skills, user_scores = add_new_user(new_user_id, new_user_skills, similarity_threshold=21.0)

Similar users with scores for user U104: {'U104': {'U36': 23.159177829329273, 'U58': 21.053798026662975, 'U292': 21.290467263536573, 'U366': 23.159177829329273, 'U955': 21.053798026662975}}
Current skills of user U104: {'Python', 'Data Visualization'}
New skills from user U36 to recommend: {'Behavioral Analytics', 'Computational Statistics', 'CCPA', 'Data Loading', 'Backup and Recovery', 'Recommendation Systems', 'Benchmarking', 'Synthetic Data Generation', 'Qualitative Research', 'Ad Tech', 'Blue Teaming', 'Pharmacovigilance', 'Unani', 'Event Sourcing', 'Stream Processing', 'Social Media Analytics', 'Cybersecurity', 'Data Augmentation', 'User Feedback', 'Corporate Social Responsibility', 'Employee Wellness', 'Flowcharts', 'Diversity and Inclusion', 'Vulnerability Assessment', 'Usability Testing', 'Marketing Automation', 'Board of Directors', 'Batch Processing', 'Fault Tolerance', 'Ethical Hacking', 'Lab Experiments', 'Workplace Culture', 'Data Synthesis', 'Generative Adversarial Netwo

In [20]:
print(f"Recommended skills for new user {new_user_id}: {recommended_skills}")
print(f"Scores of similar users for new user {new_user_id}: {user_scores}")

Recommended skills for new user U104: {'Azure', 'Scalability', 'Power BI', 'Mediation', 'Recommendation Systems', 'Unsupervised Learning', 'Problem Management', 'Agricultural Analytics', 'Ethics in AI', 'Metadata Management', 'Data Transparency', 'Statistics', 'Data Augmentation', 'Load Testing', 'Flowcharts', 'Continuous Deployment', 'Vulnerability Assessment', 'Prototyping', 'Data Management Tools', 'Marketing Automation', 'Lab Experiments', 'Ethical Hacking', 'Smart Cities', 'Generative Adversarial Networks', 'Supervised Learning', 'Semi-supervised Learning', 'Experiment Tracking', 'Collaborations', 'Technical Writing', 'Content Analytics', 'Process Optimization', 'Business Rules', 'Documentation', 'Normalization', 'Time Series Analysis', 'Intellectual Property', 'Budgeting', 'Oracle', 'Go-to-Market Strategy', 'Data Aggregation', 'Data Scaling', 'Data Loading', 'Regression Analysis', 'Benchmarking', 'Ad Tech', 'Penetration Testing', 'Unani', 'Feature Extraction', 'Data Masking', 'Ma