In [1]:
import implicit
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import math

In [5]:
df = pd.read_csv('members.csv', encoding = 'latin-1')

In [6]:
def get_city(city):
    """Take in the acronym of the city and return a dataframe containing only members and groups from that city."""
    
    #defense coding
    if city not in ['SF', 'NYC', 'CHI']:
        raise ValueError('Invalid City')
        
    #separating the members and groups by their corresponding city
    
    #SF
    sf = ['San Francisco', 'san francisco', 'South San Francisco']
    df_sf = df[df.city.isin(sf)]
    
    #NY
    ny = ['New York', 'West New York', 'New York Mills']
    df_ny = df[df.city.isin(ny)]
    
    #CHI
    chi = ['Chicago','Chicago Heights','West Chicago','Chicago Ridge','East Chicago','North Chicago','Chicago Park']
    df_chi = df[df.city.isin(chi)]

    
    if city == 'SF':
        return df_sf
    elif city == 'NYC':
        return df_ny
    elif city == 'CHI':
        return df_chi

In [7]:
#reading in SF user data
df_sf = get_city('SF')

In [8]:
#dropping unecessary groups from members dataframe
df_sf = df_sf.drop(['bio','country','hometown','lat','link','lon','member_name','state','member_status'], axis = 1)

#changing these two columns to datetime
df_sf['joined'] = pd.to_datetime(df_sf['joined'])
df_sf['visited'] = pd.to_datetime(df_sf['visited'])

#taking the time difference between vistited and joined and setting that timedelta by Months
df_sf['delta'] = (df_sf['visited'] - df_sf['joined']).astype('timedelta64[M]')

#groups that had less than 1 month of interaction will be set to 1 as needed to create the preference matrix
df_sf['delta'] = df_sf['delta'].replace(to_replace = 0.0, value = 1.0)

#setting up the user and item interaction with binary interaction
df_sf['dummy'] = 1

#selecting a subset of users
members = np.random.choice(df_sf['member_id'].unique(), size = 40000, replace = False)
df1 = df_sf[df_sf['member_id'].isin(members)]

#turning the member_id and group_id to category and giving it an idx
df1['m_code'] = df1['member_id'].astype('category').cat.codes
df1['g_code'] = df1['group_id'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
df1.head()

Unnamed: 0,member_id,city,joined,visited,group_id,delta,dummy,m_code,g_code
200,1230,San Francisco,2013-12-15 19:59:59,2014-03-04 22:28:34,1240980,2.0,1,0,420
201,1230,San Francisco,2016-08-07 03:36:54,2016-08-24 06:37:46,2057811,1.0,1,0,1154
202,1230,San Francisco,2012-02-01 17:32:00,2013-12-16 21:18:29,2960812,22.0,1,0,1311
203,1230,San Francisco,2016-08-07 03:36:35,2016-08-07 03:36:35,8086692,1.0,1,0,1950
204,1230,San Francisco,2016-08-07 03:36:59,2016-12-06 00:20:04,9546582,3.0,1,0,2091


In [10]:
#creating two sparse matrix in item by user format with timedelta and binary interaction
item_user = csr_matrix((df1['delta'], (df1['g_code'], df1['m_code'])))
dummy_item_user = csr_matrix((df1['dummy'], (df1['g_code'], df1['m_code'])))

In [11]:
def train_test_split(matrix, perc, n_interactions):
    """Take in a sparse matrix and a percentage of the data desired to train with. Only users that participated in 
        more than n_interactions groups will be masked. Returns two sets of sparse matrix for train, test and a dictionary
        where the key(user_id) value(list of item_ids) pairs reference interactions that have been masked.
       
        Arguments:
       
        matrix: A sparse matrix with the format of item by user format.
        
        perc: A float that determines the percentage of users that satisfies of being in n_interactions 
              groups or more to be masked.
        
        Returns:
        
        Training Set: user by item sparse matrix
        
        Test Set: The original matrix
        
        mask: a dictionary with key value pairs of iteractions that were masked. Key = user, Value = list of items 
        
        """
    #list to hold all the idx of users that are in more than 15 groups
    users = []
    #dictionary that contains the key value pairs to reference back what interactions were masked
    mask = {}
    
    #test set is the original matrix
    test = matrix.copy()
    test[test !=0] = 1 # set to the binary preference
    
    #turning the sparse matrix into user x item format
    temp = matrix.transpose()
    
    #finding users who are in more than n_interactions groups by storing the index of those users
    for idx in range(temp.shape[0]):
        if temp[idx].count_nonzero() > n_interactions:
            users.append(idx)
            
    random.seed(0)          
    #get random users to mask
    n_samples = math.ceil(len(users)*perc)
    users_idx = random.sample(users, n_samples)
    
    #iterate through the list of users_idx to mask iteractions in the matrix
    for i in users_idx:
        mask[i] = list(random.sample(list(temp[i].nonzero()[1]), 3))
        temp[i,mask[i]] = 0
        
    return temp, test, mask

In [12]:
train, test, ref = train_test_split(item_user, .2, 15)

In [13]:
ref

{4: [1585, 275, 3123],
 16: [2746, 672, 2084],
 31: [173, 4951, 96],
 55: [3354, 4321, 2107],
 82: [4023, 292, 268],
 201: [3093, 2719, 1105],
 212: [756, 796, 3033],
 222: [1616, 87, 1308],
 303: [2874, 1927, 2298],
 361: [1090, 17, 1620],
 374: [1333, 2466, 1432],
 378: [4207, 390, 712],
 383: [351, 909, 433],
 414: [3919, 137, 4444],
 444: [1662, 673, 855],
 506: [2278, 216, 160],
 545: [1551, 69, 1211],
 552: [2049, 704, 2091],
 581: [1185, 1371, 3540],
 582: [3902, 410, 4065],
 606: [625, 1484, 1281],
 749: [721, 156, 251],
 768: [1684, 741, 2416],
 937: [691, 904, 146],
 958: [3799, 625, 1729],
 982: [4736, 1333, 1159],
 1041: [924, 1740, 760],
 1131: [3704, 3975, 2379],
 1138: [3477, 973, 3860],
 1140: [1236, 1427, 1625],
 1153: [2298, 3290, 1868],
 1157: [465, 625, 1071],
 1188: [2057, 1781, 347],
 1217: [889, 48, 2120],
 1226: [3536, 1762, 1122],
 1235: [5189, 5234, 4832],
 1254: [828, 3291, 416],
 1263: [3777, 1252, 3921],
 1281: [3854, 840, 390],
 1289: [4526, 1872, 4055],
 

In [25]:
#setting up the model
model = implicit.als.AlternatingLeastSquares(factors = 10, regularization = 10, iterations = 50)

#setting alpha = 1
alpha = 1

#confidence matrix
confidence = train.transpose() * alpha

#model fit
model.fit(confidence)

#list holding precentile for every user in test
avg_percentile = []

#iterating through all users that had interactions hidden
for idx in tqdm(list(ref.keys())):
    ranked_items = model.rank_items(idx, train, list(df1['g_code'].unique()))#create a list of rank items for all items

    ranked = []
    #unpackaging the list of tuples to just a list of ranked item ids for the specific user
    for item_ids, scores in ranked_items:
        ranked.append(item_ids)
    
    #counter
    percentile = 0
    
    #looping through each hidden interaction and finding its percentile ranking in the ranked list
    for item_id in ref[idx]:
        for count, item in enumerate(ranked):
            if item_id == item:
                percentile += count/len(ranked)
                
    avg_percentile.append(percentile/3)

100%|██████████| 639/639 [00:29<00:00, 21.30it/s]


In [27]:
np.mean(avg_percentile)

0.11191311847368744