In [1]:
# List the working directory (long format) to see which files sit next to the notebook
!ls -l

total 56
-rw-rw-r-- 1 david david 37020 Jul 11 11:08 'Mentor_Mentee Tracker (4_20_23) - Sheet1.csv'
-rw-rw-r-- 1 david david 13458 Jul 12 09:40  VDSML_Mentoring_Program.ipynb


In [2]:
import pandas as pd
import operator
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the exported mentor/mentee tracker sheet and preview its first row
mentors = pd.read_csv(
    filepath_or_buffer='Mentor_Mentee Tracker (4_20_23) - Sheet1.csv',
)
mentors.head(n=1)

Unnamed: 0,preferred name,submitted on,Mentor action,Mentee action,Mentor assigned,When was last manual contact made?,Who made last contact?,linkedin profile address,email,phone,...,How many years of Data Science and Machine Learning education do you have,Technical education years?,How many years of Data Science and Machine Learning work experience do you have,Technical employment years?,what are you looking for in a mentor,what industries are you most interested in,what use cases are you most interested in,Was there something more specific missing from the above categories,Have you looked at Veterati for data science and machine learning mentoring,Admin Notes
0,field_0 field_1,4/24/2023,,,,,,field_2,field_3@gmail.com,(444) 444-4444,...,field_8,,field_9,,field_10,Business/Administration,Analytics/Statistics,field_13,"I’m not on Veterati yet, but I will sign up.",


In [4]:
# Replace the long survey questions with short, code-friendly column names
mentors = mentors.rename(
    mapper={
        'preferred name': 'Name',
        'submitted on': 'Date_Joined',
        'Mentor action': 'Mentor_Action',
        'Mentee action': 'Mentee_Action',
        'Mentor assigned': 'Mentor_Assigned',
        'When was last manual contact made?': 'Date_Last_Contact',
        'Who made last contact?': 'VDSML_Coordinator',
        'linkedin profile address': 'LinkedIn',
        'email': 'Email',
        'phone': 'Phone',
        'how did you serve': 'Service',
        'Where are you in your Data Science and Machine Learning journey': 'Journey',
        'Journey Level': 'Level',
        'why did you decide to pursue data science and machine learning': 'Why',
        'How many years of Data Science and Machine Learning education do you have': 'Education',
        'Technical education years?': 'Education_Years',
        'How many years of Data Science and Machine Learning work experience do you have': 'Employment',
        'Technical employment years?': 'Employment_Years',
        'what are you looking for in a mentor': 'Mentor_Wants',
        'what industries are you most interested in': 'Industries',
        'what use cases are you most interested in': 'Use_Cases',
        'Was there something more specific missing from the above categories': 'Missing',
        'Have you looked at Veterati for data science and machine learning mentoring': 'Veterati',
        'Admin Notes': 'Notes',
    },
    axis='columns',
)
# Show the resulting column names
mentors.columns.tolist()

['Name',
 'Date_Joined',
 'Mentor_Action',
 'Mentee_Action',
 'Mentor_Assigned',
 'Date_Last_Contact',
 'VDSML_Coordinator',
 'LinkedIn',
 'Email',
 'Phone',
 'Service',
 'Journey',
 'Level',
 'Why',
 'Education',
 'Education_Years',
 'Employment',
 'Employment_Years',
 'Mentor_Wants',
 'Industries',
 'Use_Cases',
 'Missing',
 'Veterati',
 'Notes']

In [5]:
# One-off clean-up: remove the test submission and every private or unneeded column
mentors = (
    mentors
    .drop(index=0)  # row label 0 holds test data
    .drop(columns=[
        'Date_Last_Contact',
        'VDSML_Coordinator',
        'LinkedIn',
        'Email',
        'Phone',
        'Journey',
        'Why',
        'Education',
        'Employment',
        'Mentor_Wants',
        'Missing',
        'Notes',
    ])
)

mentors.head(n=1)

Unnamed: 0,Name,Date_Joined,Mentor_Action,Mentee_Action,Mentor_Assigned,Service,Level,Education_Years,Employment_Years,Industries,Use_Cases,Veterati
1,David Ramirez,4/20/2023,Paired with mentees; Available for additional ...,Awaiting mentor,,Marine Corps,3.0,8.0,11.0,"Defense/Military,Aerospace/Telemetry/Sensors,G...","Data Capture/Serving,Pattern Detection,Modelin...","Yes, I signed up for both!"


In [7]:
# Expand each comma-separated multi-select answer into one 0/1 indicator column per option
service_1hot, industries_1hot, usecases_1hot = (
    mentors[multi_select].str.get_dummies(sep=',')
    for multi_select in ('Service', 'Industries', 'Use_Cases')
)

In [8]:
# Parse Veterati question for status
# The regex captures the first "Yes" or "not" found in the free-text answer; answers
# containing neither stay missing (NaN).
# The captured text is translated in a single map() pass instead of assigning booleans
# into the extracted string Series, so the result never mixes strings with booleans.
veterati_status = (
    mentors['Veterati']
    .str.extract("(Yes|not)")[0]
    .map({'Yes': True, 'not': False})
)

In [9]:
# Store the parsed Veterati status and remove the multi-select columns,
# which now live in their own one-hot encoded frames
mentors = (
    mentors
    .assign(Veterati=veterati_status)
    .drop(columns=['Service', 'Industries', 'Use_Cases'])
)
mentors.head(n=1)

Unnamed: 0,Name,Date_Joined,Mentor_Action,Mentee_Action,Mentor_Assigned,Level,Education_Years,Employment_Years,Veterati
1,David Ramirez,4/20/2023,Paired with mentees; Available for additional ...,Awaiting mentor,,3.0,8.0,11.0,True


In [None]:
# Build the interests table: the three one-hot frames joined on their shared row index,
# re-indexed by person name, with the 'Undecided/Unknown' column removed
interests = (
    service_1hot
    .merge(industries_1hot, left_index=True, right_index=True)
    .merge(usecases_1hot, left_index=True, right_index=True)
    .set_index(mentors['Name'])
    .drop(columns=['Undecided/Unknown'])
)
interests.head(n=1)

In [None]:
# Define function for calculating percentage match between mentor and mentee
def calc_percent_match(mentor,mentee,interests):
    """Return the overlap between two people's interests as a whole-number percent.

    The shared interests are counted once, then expressed both as a share of
    the mentor's interests and as a share of the mentee's interests; the larger
    of the two percentages is returned.

    Parameters
    ----------
    mentor, mentee : str
        Index labels of the two people in ``interests``.
    interests : pandas.DataFrame
        One row per person and one 0/1 column per interest.

    Returns
    -------
    int
        The match percentage. A person with no interests selected contributes
        0 rather than triggering a division by zero.

    Raises
    ------
    AssertionError
        If ``mentor`` or ``mentee`` is not a string.
    """
    assert isinstance(mentor, str)
    assert isinstance(mentee, str)

    mentor_interests = interests.loc[mentor]
    mentee_interests = interests.loc[mentee]

    # Rows are 0/1, so a sum of exactly 2 marks an interest both people selected
    match_count = int(((mentor_interests + mentee_interests) == 2).sum())

    def _percent_of(total):
        # Nothing selected: report 0 instead of dividing by zero (0/0 -> NaN -> int() fails)
        if total == 0:
            return 0
        # Scale to percent BEFORE rounding. Rounding the fraction to two places and
        # multiplying afterwards can land just below a whole number
        # (0.29 * 100 == 28.999...), which int() would then truncate to 28.
        return int(round(100 * match_count / int(total)))

    mentor_percent = _percent_of(mentor_interests.sum())
    mentee_percent = _percent_of(mentee_interests.sum())
    return max(mentor_percent, mentee_percent)

In [None]:
# Get paired mentors and mentees
# A row counts as paired when its Mentor_Assigned cell is filled in; the mask is computed once
is_paired = mentors['Mentor_Assigned'].notna()
paired_mentors = mentors.loc[is_paired, 'Mentor_Assigned']
paired_mentees = mentors.loc[is_paired, 'Name']
edges = pd.DataFrame(
    {'Mentor': paired_mentors,
     'Mentee': paired_mentees,
    })
# Discard rows whose mentor is the literal text 'NaN' (not caught by notna())
edges = edges.loc[edges['Mentor'] != 'NaN']
# Score every pair. A list comprehension is used instead of DataFrame.apply(axis=1)
# because apply on an empty frame hands back a DataFrame, which cannot be stored
# in a single column; the list form simply yields no weights when nobody is paired.
edges = edges.assign(weight=[
    calc_percent_match(mentor_name, mentee_name, interests)
    for mentor_name, mentee_name in zip(edges['Mentor'], edges['Mentee'])
])
edges

In [None]:
# Initialize the directed mentor -> mentee graph; every edge carries its "weight" column
Paired_Mentors = nx.from_pandas_edgelist(
    edges,
    source='Mentor',
    target='Mentee',
    edge_attr='weight',
    create_using=nx.DiGraph,
)

In [None]:
# Define function for plotting directional acyclic graph
def plot_graph(G):
    """Draw ``G`` with its nodes arranged by topological generation.

    Every node is tagged with a ``"layer"`` attribute holding the number of the
    generation it appears in, as yielded by ``nx.topological_generations``. The
    attribute is written onto ``G`` itself and then used as the subset key of
    the multipartite layout. The figure is shown with ``plt.show()``; nothing
    is returned.
    """
    # Tag one generation at a time so each node records the layer it belongs to
    for layer_number, generation in enumerate(nx.topological_generations(G)):
        nx.set_node_attributes(G, dict.fromkeys(generation, layer_number), name="layer")

    positions = nx.multipartite_layout(G, subset_key="layer")

    figure, axes = plt.subplots()
    nx.draw_networkx(G, pos=positions, ax=axes)
    axes.set_title("VDSML Mentoring in topological order")
    figure.tight_layout()
    plt.show()

In [None]:
# Draw the graph of existing mentor -> mentee pairings
plot_graph(Paired_Mentors)

In [None]:
# Find available MENTORS
# Treat a blank Mentor_Action as an empty string so the phrase searches return plain booleans
mentors['Mentor_Action'] = mentors['Mentor_Action'].fillna('')

# One case-insensitive search per status phrase, in this order:
# already paired, declined to mentor, too junior, willing to take more mentees
already_mentor, wont_mentor, too_junior, mentor_more = (
    mentors['Mentor_Action'].str.contains(status_phrase, case=False)
    for status_phrase in (
        "Paired with mentee",
        "Won't",
        "Too junior",
        "Available for additional mentee",
    )
)

# Keep anyone open to more mentees, plus everyone who is not in an unavailable group;
# then remove the columns that are not needed for mentor selection
available_mentors = (
    mentors
    .loc[mentor_more | ~(already_mentor | wont_mentor | too_junior)]
    .drop(columns=['Date_Joined', 'Mentee_Action', 'Mentor_Assigned'])
)

# Put temp pairs in here for testing
#pending_mentors = ['John', 'Jane']
#for name in pending_mentors:
#    available_mentors = available_mentors[available_mentors['Name'] != name]

available_mentors

In [None]:
# Find unpaired MENTEES
# na=False makes a blank Mentee_Action count as "phrase not present", so both masks are
# plain booleans and people with no recorded action stay in the unpaired list.
# The Mentee_Action column itself is left untouched.
got_mentor = mentors['Mentee_Action'].str.contains("Paired with mentor", case=False, na=False)
not_mentee = mentors['Mentee_Action'].str.contains("Not looking for mentor", case=False, na=False)
# Combine already paired mentees with those not looking for a mentor, and invert to find unpaired mentees
unpaired_mentees = mentors[~(got_mentor | not_mentee)]
unpaired_mentees = unpaired_mentees.drop(axis='columns', labels=['Mentor_Action'])

# Put temp pairs in here for testing
#pending_mentees = ['John', 'Jane']
#for name in pending_mentees:
#    unpaired_mentees = unpaired_mentees[unpaired_mentees['Name'] != name]

unpaired_mentees

In [None]:
# Search for an available mentor for each unpaired mentee

# One record per proposed pairing; the dataframe is built once after the loop
# instead of being re-concatenated on every iteration
new_edge_records = []

for _, mentee_row in unpaired_mentees.iterrows():
    mentee_name = mentee_row['Name']

    # Candidate mentor name -> match score for this mentee
    matches = {}

    for _, mentor_row in available_mentors.iterrows():
        mentor_name = mentor_row['Name']

        # Nobody mentors themselves. Names are compared by value (==); an identity
        # check (is) only works when both rows happen to share one string object.
        if mentor_name == mentee_name:
            continue

        # The mentor must be at the same level as the mentee or above.
        # A missing (NaN) level on either side fails the comparison, so the pair is skipped.
        if not (mentor_row['Level'] >= mentee_row['Level']):
            continue

        # NOTE(review): calc_percent_match returns the HIGHER of the mentor-side and
        # mentee-side overlap percentages. If the goal is to discount people who select
        # nearly every interest, the lower of the two would be needed - confirm intent.
        matches[mentor_name] = calc_percent_match(mentor_name, mentee_name, interests)

    # Report the best-scoring mentor (first one wins on ties) and record the pairing
    print('Mentee: '+mentee_name)
    if matches:
        best_mentor_name = max(matches, key=matches.get)
        percent_match = matches[best_mentor_name]
        new_edge_records.append({"Mentor": best_mentor_name,
                                 "Mentee": mentee_name,
                                 "weight": percent_match})
        print('  Best Mentor: '+best_mentor_name)
        print('  Match Percent: '+str(percent_match))
    print('')

# Explicit columns keep the frame well-formed (and the graph buildable) even when
# no mentee found a match
new_edges = pd.DataFrame(new_edge_records, columns=['Mentor', 'Mentee', 'weight'])
Potential_Pairs = nx.from_pandas_edgelist(new_edges, 'Mentor', "Mentee", "weight", create_using=nx.DiGraph)