In [19]:
import implicit
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import math

In [3]:
df = pd.read_csv('members.csv', encoding = 'latin-1')

In [4]:
def get_city(city):
    """Take in the acronym of the city and return a dataframe containing only members and groups from that city."""
    
    #defense coding
    if city not in ['SF', 'NYC', 'CHI']:
        raise ValueError('Invalid City')
        
    #separating the members and groups by their corresponding city
    
    #SF
    sf = ['San Francisco', 'san francisco', 'South San Francisco']
    df_sf = df[df.city.isin(sf)]
    
    #NY
    ny = ['New York', 'West New York', 'New York Mills']
    df_ny = df[df.city.isin(ny)]
    
    #CHI
    chi = ['Chicago','Chicago Heights','West Chicago','Chicago Ridge','East Chicago','North Chicago','Chicago Park']
    df_chi = df[df.city.isin(chi)]

    
    if city == 'SF':
        return df_sf
    elif city == 'NYC':
        return df_ny
    elif city == 'CHI':
        return df_chi

In [5]:
#get the chicago city members
df_chi = get_city('CHI')

#dropping unecessary groups from members dataframe
df_chi = df_chi.drop(['bio','country','hometown','lat','link','lon','member_name','state','member_status'], axis = 1)

In [6]:
#changing these two columns to datetime
df_chi['joined'] = pd.to_datetime(df_chi['joined'])
df_chi['visited'] = pd.to_datetime(df_chi['visited'])

In [7]:
df_chi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1206544 entries, 135 to 5893879
Data columns (total 5 columns):
member_id    1206544 non-null int64
city         1206544 non-null object
joined       1206544 non-null datetime64[ns]
visited      1206544 non-null datetime64[ns]
group_id     1206544 non-null int64
dtypes: datetime64[ns](2), int64(2), object(1)
memory usage: 55.2+ MB


In [10]:
#taking the time delta between vistited and joined and setting that timedelta by Months
df_chi['delta'] = (df_chi['visited'] - df_chi['joined']).astype('timedelta64[M]')

#groups that had less than 1 month of interaction will be set to 1 as needed to create the preference matrix
df_chi['delta'] = df_chi['delta'].replace(to_replace = 0.0, value = 1.0)

#setting up the user and item interaction with binary interaction
df_chi['dummy'] = 1

In [11]:
df_chi.head()

Unnamed: 0,member_id,city,joined,visited,group_id,delta,dummy
135,819,Chicago,2015-03-19 04:00:42,2015-03-19 04:00:42,514628,1.0,1
136,819,Chicago,2015-03-19 03:49:30,2015-03-19 03:49:30,1294245,1.0,1
137,819,Chicago,2015-03-19 04:02:45,2015-03-19 04:02:45,1455470,1.0,1
138,819,Chicago,2015-03-19 03:51:59,2015-03-19 03:51:59,1576866,1.0,1
139,819,Chicago,2015-03-19 03:58:14,2015-03-19 03:58:14,1681402,1.0,1


In [12]:
#selecting a random subset of 40k users from the pool
members = np.random.choice(df_chi['member_id'].unique(), size = 40000, replace = False)

In [13]:
dfs = df_chi[df_chi['member_id'].isin(members)]
dfs.shape

(202100, 7)

In [14]:
#turning the member_id and group_id to category and giving it an idx
dfs['m_code'] = dfs['member_id'].astype('category').cat.codes
dfs['g_code'] = dfs['group_id'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
dfs.head()

Unnamed: 0,member_id,city,joined,visited,group_id,delta,dummy,m_code,g_code
333,4368,Chicago,2006-05-11 04:12:27,2013-04-17 17:28:36,122646,83.0,1,0,43
334,4368,Chicago,2016-08-11 21:12:04,2016-08-11 21:12:04,1527867,1.0,1,0,568
335,4368,Chicago,2013-05-31 20:45:48,2014-04-25 15:43:51,1787911,10.0,1,0,836
336,4368,Chicago,2014-02-02 02:18:39,2016-08-30 14:50:07,2415292,30.0,1,0,951
337,4368,Chicago,2014-02-02 02:17:10,2017-07-18 19:13:46,2597122,41.0,1,0,971


In [16]:
#create the 2 separate sparse matrix in item x user format where 1 is a binary interaction the other by timedelta
item_user = csr_matrix((dfs['delta'], (dfs['g_code'],dfs['m_code']))) #binary representation of interaction
dummy_item_user = csr_matrix((dfs['dummy'], (dfs['g_code'], dfs['m_code']))) #timedelta interaction

In [74]:
def train_test_split(matrix, perc, n_interactions):
    """Take in a sparse matrix and a percentage of the data desired to train with. Only users that participated in 
        more than n_interactions groups will be masked. Returns two sets of sparse matrix for train, test and a dictionary
        where the key(user_id) value(list of item_ids) pairs reference interactions that have been masked.
       
        Arguments:
       
        matrix: A sparse matrix with the format of item by user format.
        
        perc: A float that determines the percentage of users that satisfies of being in n_interactions 
              groups or more to be masked.
        
        Returns:
        
        Training Set: user by item sparse matrix
        
        Test Set: The original matrix
        
        mask: a dictionary with key value pairs of iteractions that were masked. Key = user, Value = list of items 
        
        """
    #list to hold all the idx of users that are in more than 15 groups
    users = []
    #dictionary that contains the key value pairs to reference back what interactions were masked
    mask = {}
    
    #test set is the original matrix
    test = matrix.copy()
    test[test !=0] = 1 # set to the binary preference
    
    #turning the sparse matrix into user x item format
    temp = matrix.transpose()
    
    #finding users who are in more than n_interactions groups by storing the index of those users
    for idx in range(temp.shape[0]):
        if temp[idx].count_nonzero() > n_interactions:
            users.append(idx)
            
    random.seed(0)          
    #get random users to mask
    n_samples = math.ceil(len(users)*perc)
    users_idx = random.sample(users, n_samples)
    
    #iterate through the list of users_idx to mask iteractions in the matrix
    for i in users_idx:
        mask[i] = list(random.sample(temp[i].nonzero()[1], 5))
        temp[i,mask[i]] = 0
        
    return temp, test, mask

In [100]:
def precision(train_set, user_ids, ref_keys, N, alpha, reg, n_factors):
    """This function calculates the number of success predictions the recommendation system predicts. A successful
       prediction is determined if an item recommended to the user is an actual item the user has interacted with.
       The actual items that the user iteracted with were hidden from train_test_split.
       
       Arguments:
       
       train_set: a sparse matrix in the format of users by item
       
       user_ids: list of users that had 5 of their interactions hidden
       
       ref_keys: a dictionary from train_test_split that has key value pairs to reference what user item was hidden
       
       N: number of recommended items
       
       alpha: learning rate for confidence matrix
       
       reg: regularization on the lost function
       
       n_factors: number of latent factors desired
       
       Returns:
       
       number of successful predictions over all users iteractions that were hidden"""
    
    #initializing the ALS model
    model = implicit.als.AlternatingLeastSquares(factors = n_factors 
                                                 ,regularization = reg
                                                 ,iterations = 50) 

    #confidence c is defined by 1 + alpha*item_user where item_user is the interaction between an item and user
    #Note: the model takes care of the 1 so any negative preference will be set to a confidence of 1 when fitting
    c_item_user = train_set.transpose() * alpha

    #fitting the model using the item_user matrix after it has been multiplied by alpha
    model.fit(c_item_user)
    
    #counter for successful predictions
    success = 0
    for ids in user_ids:
        recommend = model.recommend(ids, train_set, N = N)
        
        #unpacking recommend (list of tuples) into their own list
        idx = []
        for item in recommend:
            item_id, score = item
            idx.append(item_id)
        
        s1 = set(idx)
        s2 = set(ref_keys[ids])
        
        success += len(s1.intersection(s2))
    
    return success

### Binary interactions

In [123]:
d_train, d_test, d_ref = train_test_split(dummy_item_user, .2, 20)

In [80]:
d_train.eliminate_zeros()
d_train

<40000x3790 sparse matrix of type '<class 'numpy.int64'>'
	with 199140 stored elements in Compressed Sparse Column format>

In [140]:
#tuning parameters with binary interactions
alphas = [1, 10, 50, 100, 500, 1000]
regs = [1e-3, 1e-1, 0, 1e1, 1e2]
factors = [10, 20, 40, 80, 120]

params = {}
most_pred = 0
temp = 0
for a in tqdm(alphas):
    for l in regs:
        for n in factors:
            temp = precision(d_train, list(d_ref.keys()), d_ref, 10, a, l, n)
            if temp > most_pred:
                most_pred = temp
                params['alpha'] = a
                params['reg'] = l
                params['factors'] = n





100%|██████████| 6/6 [20:19<00:00, 203.21s/it]


In [111]:
#n_interaction = 15
most_pred

492

In [112]:
#n_interaction = 15
params

{'alpha': 50, 'factors': 20, 'reg': 100.0}

In [113]:
#n_interaction = 15
492/(len(d_ref)*5)

0.1662162162162162

In [126]:
#n_interaction = 20
most_pred

309

In [129]:
#n_interaction = 20
309/(len(d_ref)*5)

0.186144578313253

In [142]:
#n_interaction = 20
params

{'alpha': 1, 'factors': 80, 'reg': 0.001}

### TimeDelta Interactions

In [131]:
train, test, ref = train_test_split(item_user, .2, 20)
train.eliminate_zeros()
train

<40000x3790 sparse matrix of type '<class 'numpy.float64'>'
	with 181678 stored elements in Compressed Sparse Column format>

In [115]:
test

<3790x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 184248 stored elements in Compressed Sparse Row format>

In [132]:
#tuning parameters with timedelta interactions
alphas = [1, 10, 50, 100, 500, 1000]
regs = [1e-3, 1e-1, 0, 1e1, 1e2]
factors = [10, 20, 40, 80, 120]

params = {}
most_pred = 0
temp = 0
for a in tqdm(alphas):
    for l in regs:
        for n in factors:
            temp = precision(train, list(ref.keys()), ref, 10, a, l, n)
            if temp > most_pred:
                most_pred = temp
                params['alpha'] = a
                params['reg'] = l
                params['factors'] = n





100%|██████████| 6/6 [15:34<00:00, 155.82s/it]


In [117]:
#n_interaction = 15
params

{'alpha': 1, 'factors': 10, 'reg': 10.0}

In [118]:
most_pred

193

In [121]:
193/(len(ref)*5)

0.1195046439628483

In [133]:
#n_interaction = 20
params

{'alpha': 1, 'factors': 40, 'reg': 0.001}

In [134]:
most_pred

107

In [139]:
107/(len(ref)*5)

0.11204188481675392

### SF users

In [143]:
df_sf = get_city('SF')

In [144]:
#dropping unecessary groups from members dataframe
df_sf = df_sf.drop(['bio','country','hometown','lat','link','lon','member_name','state','member_status'], axis = 1)

#changing these two columns to datetime
df_sf['joined'] = pd.to_datetime(df_sf['joined'])
df_sf['visited'] = pd.to_datetime(df_sf['visited'])

#taking the time delta between vistited and joined and setting that timedelta by Months
df_sf['delta'] = (df_sf['visited'] - df_sf['joined']).astype('timedelta64[M]')

#groups that had less than 1 month of interaction will be set to 1 as needed to create the preference matrix
df_sf['delta'] = df_sf['delta'].replace(to_replace = 0.0, value = 1.0)

#setting up the user and item interaction with binary interaction
df_sf['dummy'] = 1

#selecting a subset of users
members = np.random.choice(df_sf['member_id'].unique(), size = 40000, replace = False)
df1 = df_sf[df_sf['member_id'].isin(members)]

#turning the member_id and group_id to category and giving it an idx
df1['m_code'] = df1['member_id'].astype('category').cat.codes
df1['g_code'] = df1['group_id'].astype('category').cat.codes

In [145]:
df1.head()

Unnamed: 0,member_id,city,joined,visited,group_id,delta,dummy,m_code,g_code
79,65,San Francisco,2012-03-20 05:29:10,2017-06-03 06:22:28,2701562,62.0,1,0,1300
80,65,San Francisco,2014-08-06 22:16:15,2014-09-05 02:29:37,14177122,1.0,1,0,2576
81,65,San Francisco,2014-08-14 00:25:38,2017-04-16 17:00:01,14638342,32.0,1,0,2619
196,883,San Francisco,2014-09-18 21:43:44,2014-10-10 19:42:14,17009192,1.0,1,1,2832
238,3045,San Francisco,2011-03-30 00:39:36,2013-05-28 21:15:17,54659,25.0,1,2,16


In [146]:
#create the 2 separate sparse matrix in item x user format where 1 is a binary interaction the other by timedelta
item_user_sf = csr_matrix((df1['delta'], (df1['g_code'],df1['m_code']))) #binary representation of interaction
dummy_item_user_sf = csr_matrix((df1['dummy'], (df1['g_code'], df1['m_code']))) #timedelta interaction

### SF Users Binary Interaction

In [148]:
#n_interaction = 15
sf_train, sf_test, sf_ref = train_test_split(dummy_item_user_sf, .2, 15)

In [156]:
#tuning parameters with timedelta interactions
alphas = [1, 10, 50, 100, 500, 1000]
regs = [1e-3, 1e-1, 0, 1e1, 1e2]
factors = [10, 20, 40, 80, 120]

params = {}
most_pred = 0
temp = 0
for a in tqdm(alphas):
    for l in regs:
        for n in factors:
            temp = precision(sf_train, list(sf_ref.keys()), sf_ref, 10, a, l, n)
            if temp > most_pred:
                most_pred = temp
                params['alpha'] = a
                params['reg'] = l
                params['factors'] = n





100%|██████████| 6/6 [19:19<00:00, 193.24s/it]


In [150]:
params

{'alpha': 1, 'factors': 120, 'reg': 0}

In [151]:
most_pred

593

In [162]:
593/(len(sf_ref)*5)

0.18051750380517503