In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('./data/kaggle/kaggle_visible_evaluation_triplets.txt', delimiter='\t', names=['user_id', 'song_id', 'play_count'])

In [4]:
data = pd.read_csv('./data/test/user-graph-triplets.tsv', delimiter='\t', names=['user_id', 'song_id', 'play_count'])

In [3]:
groupby_users = data.groupby('user_id')

In [5]:
# # Run only for testing purposes

NUM_USERS = 1000

i = 0

test_data = pd.DataFrame(columns=('user_id', 'song_id', 'play_count'))

for user, songs in groupby_users:
    if i < NUM_USERS:
        test_data = pd.concat([test_data, songs], axis='index')
    else:
        break

    i += 1
    
users = test_data.groupby('user_id')

In [6]:
class Edge:
    def __init__(self, user, weight):
        self.user = user
        self.weight = weight
        
    def __eq__(self, other):
        type_ = type(other)
        
        if type_ is str:
            return self.user == other
        elif type_ is Edge:
            return self.user == other.user
        else:
            return False
        
    def __lt__(self, other):
        if type(other) is Edge:
            return self.weight < other.weight
        else:
            return False
        
    def __le__(self, other):
        if type(other) is Edge:
            return self.weight <= other.weight
        else:
            return False
        
    def __gt__(self, other):
        if type(other) is Edge:
            return self.weight > other.weight
        else:
            return False
        
    def __ge__(self, other):
        if type(other) is Edge:
            return self.weight >= other.weight
        else:
            return False
    
    def __str__(self):
        return str((self.user, self.weight))

In [7]:
# initialize nodes

graph = {user: [] for user, _ in groupby_users}

total_users = len(graph)
total_users

110000

In [8]:
import csv

def flatten(list_):
    return [item for sublist in list_ for item in sublist]

with open('./data/test/users.txt', 'r') as f:
    reader = csv.reader(f)
    users_list = flatten(list(reader))
    
len(users_list)

6

In [9]:
# initialize edges

from IPython.display import clear_output, display

cur_user = 0

for source_user, source_songs_df in users:
    
    songs = set(source_songs_df['song_id'])
    
    cur_target_user = 0
        
    for target_user, target_songs_df in users:
        if cur_target_user % 500 == 0:
            clear_output(wait=True)
            display('{}/{}     {}/{}'.format(cur_user, total_users, cur_target_user, total_users))
            
        if source_user != target_user:
            num_songs_in_common = target_songs_df['song_id'].isin(songs).sum()
            if num_songs_in_common != 0:
                graph[source_user].append(Edge(target_user, num_songs_in_common))
                
        cur_target_user += 1
    
    cur_user += 1

'999/110000     500/110000'

In [13]:
# initialize edges

from IPython.display import clear_output, display

NUM_USERS = 100

cur_user = 0

for user in users_list[:NUM_USERS]:
    source_songs_df = groupby_users.get_group(user)['song_id']
    
#     data[user_i]
    
#     songs = set(source_songs_df['song_id'])
    
#     cur_target_user = 0
        
#     for target_user, target_songs_df in users:
#         if cur_target_user % 500 == 0:
#             clear_output(wait=True)
#             display('{}/{}     {}/{}'.format(cur_user, total_users, cur_target_user, total_users))
            
#         if source_user != target_user:
#             num_songs_in_common = target_songs_df['song_id'].isin(songs).sum()
#             if num_songs_in_common != 0:
#                 graph[source_user].append(Edge(target_user, num_songs_in_common))
                
#         cur_target_user += 1
    
#     cur_user += 1

00007a02388c208ea7176479f6ae06f8224355b3
00014a76ed063e1a749171a253bca9d9a0ff1782
00015189668691680bb1a2e58afde1541ec92ced
0001ff7aa2667c8d8b945317b88adaed1c0b9dc2
00020fcd8b01986a6a85b896ccde6c49f35142ad
0003477fcf455dc4fcae3d7ca5e329cef811c868


In [None]:
# save graph and current user

import pickle

pickle.dump((graph, cur_user, cur_target_user), './save.pkl')

In [113]:
# Create song to index mappings

song_to_index = pd.read_csv('./data/kaggle/kaggle_songs.txt', index_col=0, names=('index',), delim_whitespace=True).to_dict()['index']

def songs_to_indicies(songs):
    indicies = []
    for song in songs:
        indicies.append(str(song_to_index[song]))
    return indicies

In [114]:
# Create user ordered mapping

import csv

def flatten(list_):
    return [item for sublist in list_ for item in sublist]

with open('./data/kaggle/kaggle_users.txt', 'r') as f:
    reader = csv.reader(f)
    users_list = flatten(list(reader))
    
len(users_list)

110000

In [106]:
# Find recommendations for each user

saved_progress = 'saved.txt'
out_file_name = 'submission.txt'

import pickle

try:
    idx = pickle.loads(open(saved_progess, 'rb'))
except:
    idx = 0

try:
    with open(out_file_name, 'w') as out_file:
        for i in range(idx, len(users_list)):
            user = users_list[i]
            source_songs = set(users.get_group(user)['song_id'])
            
            recommendations = []
            
            visited = {user: False for user in users_list}
            queue = []

            queue.append(user)
            visited[user] = True

            while queue: 
                s = queue.pop()
                
                if s != user:
                    target_songs = set(users.get_group(s)['song_id'])
                    recommendations.extend(songs_to_indicies(target_songs - source_songs))
                
                if len(recommendations) > 500:
                    recommendations = recommendations[:500]
                    break

                neighbors = sorted(graph[s], reverse=True)
                for neighbor in neighbors: 
                    if not visited[neighbor.user]: 
                        queue.append(neighbor.user) 
                        visited[neighbor.user] = True
                        
            out_file.write(' '.join(recommendations) + '\n')
            
            idx = i

except KeyboardInterrupt:
    pickle.dumps(idx, open(saved_progess, 'wb'))

In [10]:
# Format graph for d3.js

users = list(graph.keys())
connections = np.array([len(graph[user]) for user in users])

max_connections = np.max(connections)

nodes = pd.DataFrame({ 'id': users, 'label': users, 'level': np.ones(len(users), dtype=np.int8), 'numConnections': connections / max_connections })

In [11]:
links = {'source': [], 'target': [], 'strength': []}

max_weight = 0

for user in users:
    if graph[user]:
        max_ = np.max(graph[user])
        if max_.weight > max_weight:
            max_weight = max_.weight
            
for source in users:
    for target_edge in graph[source]:
        links['source'].append(source)
        links['target'].append(target_edge.user)
        links['strength'].append(target_edge.weight / max_weight)
    
links = pd.DataFrame(links)

In [12]:
nodes.to_json('../Muse/data/nodes-1000.json', orient='records')
links.to_json('../Muse/data/links-1000.json', orient='records')