In [1]:
import pandas as pd
import pickle
import random
from collections import OrderedDict
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# to open file
def getData(df_name, data_dir='../FounderDating/data'):
    """Read one exported CSV table into a DataFrame.

    Parameters
    ----------
    df_name : str
        File name without the ``.csv`` extension.
    data_dir : str, optional
        Directory holding the CSV files. The default is the location the
        notebook originally hard-coded, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``<data_dir>/<df_name>.csv``.
    """
    return pd.read_csv('%s/%s.csv' % (data_dir, df_name))

In [3]:
# Primary skill sets: drop the assignment column, key rows by user and expose
# the skill-set string under the shared column name `topics`.
co_primary_df = (
    getData('co_primary')
    .drop('assigned_user_id', axis=1)
    .rename(columns={'id': 'user_id', 'primary_skillsets': 'topics'})
    .set_index('user_id')
)

# Followed tags: the aggregated column is named after its SQL expression.
following_tags_df = (
    getData('following_tags')
    .rename(columns={'GROUP_CONCAT(name)': 'topics'})
    .set_index('user_id')
)

# The advisee/advisor exports share one aggregated column name.
tag_column_rename = {'group_concat(tags.name)': 'topics'}

advisees_df = getData('advisees').rename(columns=tag_column_rename).set_index('user_id')
advisors_df = getData('advisors').rename(columns=tag_column_rename).set_index('user_id')

# unapproved and approved
advisors_un_df = (
    getData('advisors_plus_unapproved')
    .rename(columns=tag_column_rename)
    .set_index('user_id')
)

In [4]:
# Stack both advisor exports and collapse to one row per user.
# NOTE(review): min() on the `topics` strings keeps the smallest string per
# user rather than the union of both exports -- confirm this is intended.
advisors_stacked = pd.concat([advisors_df, advisors_un_df], axis=0)
advisors_all_df = advisors_stacked.groupby(level='user_id').min()

In [5]:
def get_dummies(df):
    """Turn the comma-separated ``topics`` column into indicator columns.

    Topic names are lower-cased first, so differently cased spellings of the
    same topic land in one column. The result keeps the index of ``df`` and
    has one column per distinct topic.
    """
    lowered_topics = df['topics'].str.lower()
    indicator_df = lowered_topics.str.get_dummies(sep=',')
    return indicator_df

In [6]:
# Replace every raw frame by its topic-indicator version.
(co_primary_df,
 following_tags_df,
 advisees_df,
 advisors_df,
 advisors_all_df) = map(get_dummies, (co_primary_df,
                                      following_tags_df,
                                      advisees_df,
                                      advisors_df,
                                      advisors_all_df))

In [7]:
# Rename primary skill-set columns to the tag names used in MySQL.
skillset_to_tag = {
    'classroom teacher': 'education',
    'engineering - biomedical': 'biomedical engineering',
    'engineering - electrical': 'electrical engineering',
    'engineering - mechanical': 'mechanical engineering',
    'ui/design': 'user interface design',
}
co_p_matched_df = co_primary_df.rename(columns=skillset_to_tag)

In [8]:
# Fold "engineering - other" into the generic "engineering" indicator.
engineering_sum = co_p_matched_df['engineering'] + co_p_matched_df['engineering - other']
# clip(upper=1) caps the sum so the column stays a 0/1 flag when a user has
# both skill sets. It replaces the chained-indexing assignment
# `df.engineering[mask] = 1`, which may write to a temporary copy.
co_p_matched_df.loc[:, 'engineering'] = engineering_sum.clip(upper=1)
co_p_matched_df = co_p_matched_df.drop('engineering - other', axis=1)

In [9]:
# Row-wise stack of all four indicator frames; topics a source does not have
# come out as NaN after the concat and are zero-filled.
tdf_co_sources = [advisees_df, advisors_df, following_tags_df, co_p_matched_df]
tdf_co = pd.concat(tdf_co_sources, axis=0).fillna(0)

In [10]:
# Give every frame the column set (and order) of the master frame.
# reindex() inserts columns a frame lacks as NaN, which are then zero-filled;
# `.loc[:, labels]` raises KeyError on current pandas when a label is missing.
master_columns = tdf_co.columns
co_primary_df = co_primary_df.reindex(columns=master_columns).fillna(0)
following_tags_df = following_tags_df.reindex(columns=master_columns).fillna(0)
advisees_df = advisees_df.reindex(columns=master_columns).fillna(0)
advisors_df = advisors_df.reindex(columns=master_columns).fillna(0)
advisors_all_df = advisors_all_df.reindex(columns=master_columns).fillna(0)

In [11]:
# User-by-topic matrix built from the advisee, advisor and following frames.
tdf_sources = [advisees_df, advisors_df, following_tags_df]
tdf = pd.concat(tdf_sources, axis=0).fillna(0)

In [12]:
# Down-weight very popular topics: for every topic column whose total exceeds
# `y_num`, zero that column in a random sample of rows.
# NOTE: this cell reads `tdf` and rebinds it, so running it twice applies the
# down-sampling twice.
DOWNSAMPLE_SEED = 42  # fixed seed so Restart & Run All reproduces the same matrix
y_num = 500
downsample_rng = random.Random(DOWNSAMPLE_SEED)

tdf_reset = tdf.reset_index()
# every column except the first one, which holds the former index (user_id)
tdf_cols = tdf_reset.columns[1:]
n_rows = len(tdf_reset)
for col in tdf_cols:
    col_total = tdf_reset[col].sum()
    if col_total > y_num:
        ratio = n_rows / float(col_total)
        rows_keep = y_num * ratio
        rows_rand = int(max(0, (n_rows - rows_keep) * .8))
        # after reset_index() the row labels are 0..n_rows-1, so sampling
        # from range(n_rows) yields valid labels for .loc
        idx_to_0 = downsample_rng.sample(range(n_rows), rows_rand)
        tdf_reset.loc[idx_to_0, col] = 0
tdf = tdf_reset.set_index('user_id')

In [13]:
# unique following

# All-zero frame with the same index and columns as the following-tags frame.
temp_df = pd.DataFrame(0,
                       index=following_tags_df.index,
                       columns=following_tags_df.columns)

In [14]:
# Independent working copies (DataFrame.copy() is deep by default).
advisees_temp = advisees_df.copy()
advisors_all_temp = advisors_all_df.copy()

In [15]:
# Align both frames to the users in `temp_df`; users without an advisee or
# advisor row become all-zero rows. reindex() replaces the removed `.ix`
# indexer and tolerates missing labels.
# NOTE(review): reindex() requires unique user_ids in the frame being
# aligned -- confirm the advisee export has one row per user.
advisees_temp = advisees_temp.reindex(temp_df.index).fillna(0)
advisors_all_temp = advisors_all_temp.reindex(temp_df.index).fillna(0)

In [16]:
# Per user and topic: advisee flag plus advisor flag.
both_df = advisees_temp.add(advisors_all_temp)

In [17]:
# Start from an independent copy of the following-tags matrix.
follow_unique_df = following_tags_df.copy()

In [18]:
# Keep only topics a user follows that are not already covered by an advisee
# or advisor relationship. The raw difference is negative where a user
# advises on a topic without following it; clipping at 0 stops those cells
# from cancelling genuine follows when rows are summed. Computing from
# `following_tags_df` also makes the cell safe to re-run.
follow_unique_df = (following_tags_df - both_df).clip(lower=0)

In [19]:
# Row totals: one value per user.
user_follows = follow_unique_df.sum(axis=1)

In [20]:
# Index labels of the users whose row total is positive.
has_follows = (user_follows > 0).values
temp_idx = user_follows.index[has_follows]

In [21]:
# Keep only the selected users. `.loc` is the label-based replacement for the
# removed `.ix` indexer; every label in temp_idx exists in the frame.
follow_unique_df = follow_unique_df.loc[temp_idx]

In [22]:
# One sample per topic (row of the transposed matrix), one feature per tdf row.
topic_matrix = tdf.transpose().values
pca = PCA(n_components=200)
pca.fit(topic_matrix)
# Projection of every topic into the fitted 200-component space.
pca_total = pca.transform(topic_matrix)

In [23]:
def SuggestionsUnfollowedDF(user_id, df, n_candidates=100, n_suggestions=20):
    """Suggest topics for one user, skipping topics the user already follows.

    The user's positive topics in ``df`` select columns of the global ``tdf``.
    Summing those columns row-wise gives one value per ``tdf`` row; that
    vector is projected with the fitted global ``pca`` and compared by cosine
    similarity with every topic projection in the global ``pca_total``.

    Parameters
    ----------
    user_id : hashable
        Index label of the user in ``df``.
    df : pandas.DataFrame
        User-by-topic indicator frame whose columns exist in ``tdf``.
    n_candidates : int, optional
        Number of top-ranked topics considered before followed topics are
        removed (default 100, the original hard-coded value).
    n_suggestions : int, optional
        Maximum number of topics returned (default 20, as before).

    Returns
    -------
    list or None
        Topic names ordered by decreasing similarity, or ``None`` when
        ``user_id`` is not in ``df.index``.
    """
    if user_id not in df.index:
        return None

    # topics the user likes
    user_row = df.loc[user_id]
    user_topics = user_row[user_row > 0].index.tolist()

    # per tdf row, the sum over the columns belonging to the user's topics
    user_space = tdf.loc[:, user_topics].sum(axis=1).values.tolist()

    # scikit-learn expects 2-D input (n_samples, n_features); wrap the single
    # sample instead of passing a flat list
    pca_vector = pca.transform([user_space])

    # similarity of the user vector to every topic projection
    cos_type = cosine_similarity(pca_vector, pca_total).tolist()[0]

    # rank topics by similarity, best first, and keep the leading candidates
    ranked_topics = sorted(zip(tdf_cols, cos_type),
                           key=lambda pair: pair[1], reverse=True)
    candidates = [topic for topic, _ in ranked_topics[:n_candidates]]

    # drop topics the user already follows (set for constant-time lookups)
    if user_id in following_tags_df.index:
        followed_row = following_tags_df.loc[user_id]
        followed = set(followed_row[followed_row > 0].index)
        candidates = [topic for topic in candidates if topic not in followed]

    return candidates[:n_suggestions]

In [24]:
# Parenthesised form prints the same text on Python 2 and parses on Python 3.
print("starting to make suggestions")

starting to make suggestions


In [25]:
# Suggestions for every user in the combined advisor frame.
advisors_all_dict = {
    advisor_id: SuggestionsUnfollowedDF(advisor_id, advisors_all_df)
    for advisor_id in advisors_all_df.index
}

In [26]:
# Persist the advisor suggestions; print() form also parses on Python 3.
with open('suggestions_advisors_all.pickle', 'wb') as handle:
    pickle.dump(advisors_all_dict, handle)
print("finished file from advisor")

finished file from advisor


In [27]:
# Suggestions for every user in the advisee frame.
advisees_dict = {
    advisee_id: SuggestionsUnfollowedDF(advisee_id, advisees_df)
    for advisee_id in advisees_df.index
}

In [28]:
# Persist the advisee suggestions; print() form also parses on Python 3.
with open('suggestions_advisees.pickle', 'wb') as handle:
    pickle.dump(advisees_dict, handle)
print("finished file from advisee")

finished file from advisee


In [29]:
# Suggestions for every user kept in the unique-following frame.
follow_unique_dict = {
    follower_id: SuggestionsUnfollowedDF(follower_id, follow_unique_df)
    for follower_id in follow_unique_df.index
}

In [30]:
# Persist the following-based suggestions; print() form also parses on Python 3.
with open('suggestions_following.pickle', 'wb') as handle:
    pickle.dump(follow_unique_dict, handle)
print("finished file from following topics")

finished file from following topics


In [31]:
# Co Primary: suggestions for users who appear only in the primary skill-set data

In [87]:
# Topic catalogue with per-topic advisor and advisee counts.
topics_df = pd.read_csv('../FounderDating/data/topics.csv')
topics_df['total'] = topics_df['advisor_all_count'] + topics_df['number of advisees']
# Group ascending, most-used topics first within each group.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
topics_df = topics_df.sort_values(by=['group', 'total'], ascending=[True, False])

In [88]:
def topic_dict(list_of_groups):
    """Map the topics of the given groups to their ``total`` count.

    Rows are read from the global ``topics_df``, group by group in the order
    of ``list_of_groups``. If a topic occurs more than once, the count seen
    last wins. The result is an ``OrderedDict`` sorted by count, largest first.
    """
    totals_by_topic = {}
    for group_name in list_of_groups:
        group_rows = topics_df[topics_df.group == group_name]
        for topic, total in zip(group_rows.topics, group_rows.total):
            totals_by_topic[topic] = total

    ranked_pairs = sorted(totals_by_topic.items(),
                          key=lambda pair: pair[1], reverse=True)
    return OrderedDict(ranked_pairs)

In [89]:
# Topic groups (values of the `group` column in topics_df) to draw
# suggestions from, one list per primary skill set.

# sales / business
ad_sales_list = ['sales', 'advertising']
enterprise_sales_list = ['enterprise']
business_development_list = ['business development']
fundraising_list = ['fundraising']
marketing_list = ['marketing', 'growth']
product_list = ['product management']

# engineering
engineering_list = ['engineering']
biomedical_engineering_list = ['healthcare', 'science']
electrical_engineering_list = ['hardware']
mechanical_engineering_list = ['hardware']

# design
industrial_design_list = ['hardware', 'design']
user_interface_design_list = ['design']

# education
education_list = ['education']

In [90]:
# Ranked topic -> count mapping for every primary skill set.
(ad_sales,
 business_development,
 education,
 engineering,
 biomedical_engineering,
 electrical_engineering,
 mechanical_engineering,
 enterprise_sales,
 fundraising,
 industrial_design,
 marketing,
 product,
 user_interface_design) = map(topic_dict, (ad_sales_list,
                                           business_development_list,
                                           education_list,
                                           engineering_list,
                                           biomedical_engineering_list,
                                           electrical_engineering_list,
                                           mechanical_engineering_list,
                                           enterprise_sales_list,
                                           fundraising_list,
                                           industrial_design_list,
                                           marketing_list,
                                           product_list,
                                           user_interface_design_list))

In [91]:
# Primary skill set -> ranked topic names.
# list(...) materialises the keys: on Python 3 `.keys()` is a view that
# cannot be sliced, and make_suggestions slices these values.
groups_dict = {
    'ad sales': list(ad_sales.keys()),
    'business development': list(business_development.keys()),
    'education': list(education.keys()),
    'engineering': list(engineering.keys()),
    'biomedical engineering': list(biomedical_engineering.keys()),
    'electrical engineering': list(electrical_engineering.keys()),
    'mechanical engineering': list(mechanical_engineering.keys()),
    'enterprise sales': list(enterprise_sales.keys()),
    'fundraising': list(fundraising.keys()),
    'industrial design': list(industrial_design.keys()),
    'marketing': list(marketing.keys()),
    'product': list(product.keys()),
    'user interface design': list(user_interface_design.keys()),
}

In [92]:
def make_suggestions(user_id, total_topics=30):
    """Suggest topics for a user from their primary skill sets.

    Reads the user's row from the global ``cos_following`` frame. Every
    column with a positive value is a primary skill set, and each one
    contributes the first ``total_topics // <number of skill sets>`` entries
    of its ``groups_dict`` list.

    Parameters
    ----------
    user_id : hashable
        Index label of the user in ``cos_following``.
    total_topics : int, optional
        Budget split evenly across the user's skill sets (default 30, the
        original hard-coded value).

    Returns
    -------
    list
        De-duplicated topic names in sorted order; empty when the user has
        no primary skill set.

    Raises
    ------
    KeyError
        If a primary skill set has no entry in ``groups_dict``.
    """
    # .loc is the label-based replacement for the removed .ix indexer
    user_co = cos_following.loc[user_id]
    user_primary = user_co[user_co > 0].index
    if len(user_primary) == 0:
        return []

    # floor division keeps the slice bound an int on Python 3 as well
    per_skill = total_topics // len(user_primary)
    suggestion_list = []
    for pri in user_primary:
        # list(...) so slicing also works if the stored value is a keys view
        for topic in list(groups_dict[pri])[:per_skill]:
            if topic not in suggestion_list:
                suggestion_list.append(topic)
    return sorted(suggestion_list)

In [96]:
# Users present in the primary skill-set frame but in none of the following,
# advisee or advisor frames.
co_following_idx = set(co_p_matched_df.index).difference(
    following_tags_df.index,
    advisees_df.index,
    advisors_df.index,
)

In [98]:
# Skill-set rows of the selected users, ordered by user_id. `.loc` replaces
# the removed `.ix`; the set is turned into a sorted list because current
# pandas rejects a set as an indexer.
cos_following = co_p_matched_df.loc[sorted(co_following_idx)]
cos_following = cos_following.sort_index()

In [99]:
# user_id -> suggested topics, filled in by the next cell
user_dict = dict()

In [100]:
# Compute suggestions for every user selected into cos_following.
user_dict.update(
    (primary_user_id, make_suggestions(primary_user_id))
    for primary_user_id in cos_following.index
)

In [101]:
# Persist the primary skill-set suggestions; print() form also parses on Python 3.
with open('suggestions_primary.pickle', 'wb') as handle:
    pickle.dump(user_dict, handle)
print("finished file from primary")