In [1]:
import implicit
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import math

In [5]:
df = pd.read_csv('members.csv', encoding = 'latin-1')

In [6]:
def get_city(city):
    """Take in the acronym of the city and return a dataframe containing only members and groups from that city."""
    
    #defense coding
    if city not in ['SF', 'NYC', 'CHI']:
        raise ValueError('Invalid City')
        
    #separating the members and groups by their corresponding city
    
    #SF
    sf = ['San Francisco', 'san francisco', 'South San Francisco']
    df_sf = df[df.city.isin(sf)]
    
    #NY
    ny = ['New York', 'West New York', 'New York Mills']
    df_ny = df[df.city.isin(ny)]
    
    #CHI
    chi = ['Chicago','Chicago Heights','West Chicago','Chicago Ridge','East Chicago','North Chicago','Chicago Park']
    df_chi = df[df.city.isin(chi)]

    
    if city == 'SF':
        return df_sf
    elif city == 'NYC':
        return df_ny
    elif city == 'CHI':
        return df_chi

In [41]:
#reading in SF user data
df_sf = get_city('SF')

In [42]:
#dropping unecessary groups from members dataframe
df_sf = df_sf.drop(['bio','country','hometown','lat','link','lon','member_name','state','member_status'], axis = 1)

#changing these two columns to datetime
df_sf['joined'] = pd.to_datetime(df_sf['joined'])
df_sf['visited'] = pd.to_datetime(df_sf['visited'])

#taking the time difference between vistited and joined and setting that timedelta by Months
df_sf['delta'] = (df_sf['visited'] - df_sf['joined']).astype('timedelta64[M]')

#groups that had less than 1 month of interaction will be set to 1 as needed to create the preference matrix
df_sf['delta'] = df_sf['delta'].replace(to_replace = 0.0, value = 1.0)

#setting up the user and item interaction with binary interaction
df_sf['dummy'] = 1

#selecting a subset of users
members = np.random.choice(df_sf['member_id'].unique(), size = 40000, replace = False)
df1 = df_sf[df_sf['member_id'].isin(members)]

#turning the member_id and group_id to category and giving it an idx
df1['m_code'] = df1['member_id'].astype('category').cat.codes
df1['g_code'] = df1['group_id'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [43]:
df1.head()

Unnamed: 0,member_id,city,joined,visited,group_id,delta,dummy,m_code,g_code
120,428,San Francisco,2012-01-16 00:15:20,2012-01-16 00:15:20,1060260,1.0,1,0,376
121,428,San Francisco,2010-01-12 21:12:55,2010-01-12 22:48:15,1225993,1.0,1,0,440
122,428,San Francisco,2013-02-19 22:31:34,2014-02-10 22:04:34,1593190,11.0,1,0,791
123,428,San Francisco,2011-09-15 01:42:35,2013-02-25 16:37:11,1691571,17.0,1,0,930
124,428,San Francisco,2014-01-21 02:03:38,2017-05-17 13:41:41,1788730,39.0,1,0,1092


In [44]:
#creating two sparse matrix in item by user format with timedelta and binary interaction
item_user = csr_matrix((df1['delta'], (df1['g_code'], df1['m_code'])))
dummy_item_user = csr_matrix((df1['dummy'], (df1['g_code'], df1['m_code'])))

In [65]:
def train_test_split(matrix, perc, n_interactions):
    """Take in a sparse matrix and a percentage of the data desired to train with. Only users that participated in 
        more than n_interactions groups will be masked. Returns two sets of sparse matrix for train, test and a dictionary
        where the key(user_id) value(list of item_ids) pairs reference interactions that have been masked.
       
        Arguments:
       
        matrix: A sparse matrix with the format of item by user format.
        
        perc: A float that determines the percentage of users that satisfies of being in n_interactions 
              groups or more to be masked.
        
        Returns:
        
        Training Set: user by item sparse matrix
        
        Test Set: The original matrix
        
        mask: a dictionary with key value pairs of iteractions that were masked. Key = user, Value = list of items 
        
        """
    #list to hold all the idx of users that are in more than 15 groups
    users = []
    #dictionary that contains the key value pairs to reference back what interactions were masked
    mask = {}
    
    #test set is the original matrix
    test = matrix.copy()
    test[test !=0] = 1 # set to the binary preference
    
    #turning the sparse matrix into user x item format
    temp = matrix.transpose()
    
    #finding users who are in more than n_interactions groups by storing the index of those users
    for idx in range(temp.shape[0]):
        if temp[idx].count_nonzero() > n_interactions:
            users.append(idx)
            
    random.seed(0)          
    #get random users to mask
    n_samples = math.ceil(len(users)*perc)
    users_idx = random.sample(users, n_samples)
    
    #iterate through the list of users_idx to mask iteractions in the matrix
    for i in users_idx:
        mask[i] = list(random.sample(list(temp[i].nonzero()[1]), 5))
        temp[i,mask[i]] = 0
        
    return temp, test, mask

In [66]:
train, test, ref = train_test_split(item_user, .2, 15)

In [67]:
ref

{7: [143, 1261, 1147, 1236, 321],
 31: [4158, 3199, 51, 836, 838],
 46: [1236, 1068, 2234, 3991, 3026],
 81: [286, 4406, 507, 1554, 3342],
 138: [1417, 344, 2225, 3937, 2182],
 248: [1682, 4848, 3259, 1993, 226],
 276: [2557, 2512, 4068, 4114, 4591],
 301: [1282, 3545, 2368, 4197, 3316],
 385: [1149, 838, 2043, 303, 59],
 426: [1014, 1393, 1012, 1614, 1421],
 445: [2904, 3242, 732, 836, 727],
 468: [3127, 1394, 3620, 836, 31],
 473: [3041, 331, 872, 1304, 293],
 497: [658, 573, 3515, 1211, 357],
 520: [1958, 593, 89, 433, 294],
 579: [101, 930, 201, 322, 121],
 619: [1149, 299, 2323, 679, 425],
 620: [796, 391, 1197, 18, 847],
 645: [1356, 5060, 1560, 2274, 4471],
 649: [3858, 1889, 992, 878, 1327],
 657: [3179, 2406, 1540, 1586, 2722],
 819: [295, 649, 39, 421, 317],
 851: [4104, 4930, 1558, 5325, 4918],
 952: [198, 310, 3658, 2576, 3342],
 1022: [3320, 2221, 1658, 1462, 727],
 1060: [3007, 796, 840, 745, 1471],
 1091: [2776, 2760, 3316, 2221, 376],
 1133: [4035, 4444, 5403, 4240, 522

In [68]:
#setting up the model
model = implicit.als.AlternatingLeastSquares(factors = 10, regularization = 10, iterations = 50)

#setting alpha = 1
alpha = 1

#confidence matrix
confidence = train.transpose() * alpha

#model fit
model.fit(confidence)

#list holding precentile for every user in test
avg_percentile = []

#iterating through all users that had interactions hidden
for idx in tqdm(list(ref.keys())):
    ranked_items = model.rank_items(idx, train, list(df1['g_code'].unique()))#create a list of rank items for all items

    ranked = []
    #unpackaging the list of tuples to just a list of ranked item ids for the specific user
    for item_ids, scores in ranked_items:
        ranked.append(item_ids)
    
    #counter
    percentile = 0
    
    #looping through each hidden interaction and finding its percentile ranking in the ranked list
    for item_id in ref[idx]:
        for count, item in enumerate(ranked):
            if item_id == item:
                percentile += count/len(ranked)
                
    avg_percentile.append(percentile/5)

100%|██████████| 615/615 [00:44<00:00, 13.91it/s]


In [69]:
np.mean(avg_percentile)

0.12816903052588571

In [49]:
df2 = get_city('SF')

In [50]:
df2.groupby('group_id').count().sort_values(by ='member_id', ascending = False).head()

Unnamed: 0_level_0,member_id,bio,city,country,hometown,joined,lat,link,lon,member_name,state,member_status,visited
group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1569779,12306,12306,12306,12306,12304,12306,12306,12306,12306,12296,12306,12306,12306
2148441,8714,8712,8714,8714,8711,8714,8714,8714,8714,8694,8714,8714,8714
1619955,8571,8571,8571,8571,8569,8571,8571,8571,8571,8557,8571,8571,8571
18401856,8410,8410,8410,8410,8409,8410,8410,8410,8410,8401,8410,8410,8410
1615633,8255,8254,8255,8255,8254,8255,8255,8255,8255,8239,8255,8255,8255


In [59]:
pop = list(df2.groupby('group_id').count().sort_values(by ='member_id', ascending = False).head(10).index)
pop

[1569779,
 2148441,
 1619955,
 18401856,
 1615633,
 3483762,
 9226282,
 1060260,
 1625447,
 389014]

In [60]:
pop_code = list(df1[df1['group_id'].isin(pop)]['g_code'].unique())
pop_code

[376, 1462, 838, 754, 836, 2108, 1232, 3153, 200, 844]

In [63]:
#list holding precentile for every user in test
avg_percentile = []

#iterating through all users that had interactions hidden
for idx in tqdm(list(ref.keys())):
    ranked_items = model.rank_items(idx, train, list(df1['g_code'].unique()))#create a list of rank items for all items

    ranked = []
    #unpackaging the list of tuples to just a list of ranked item ids for the specific user
    for item_ids, scores in ranked_items:
        ranked.append(item_ids)
    
    #counter
    percentile = 0
    
    #looping through each hidden interaction and finding its percentile ranking in the ranked list
    for item_id in pop_code:
        for count, item in enumerate(ranked):
            if item_id == item:
                percentile += count/len(ranked)
                
    avg_percentile.append(percentile/5)

100%|██████████| 645/645 [00:15<00:00, 42.16it/s]


In [58]:
#MPR for most popular items by users
np.mean(avg_percentile)

0.18499959957595771