In [14]:
import numpy as np
import json
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [15]:
def build_genre_sims_jac(num_artists, input_data):
    """Returns a matrix of size num_artists x num_artists where entry [i,j]
       is the Jaccard similarity between the genre sets of artists i and j,
       i.e. (number of shared genres) / (number of distinct genres in either).

    Notes:
        - Artists sometimes contain *duplicate* genres; each genre is counted once
          because the genres are converted to sets.
        - An artist has a Jaccard similarity of 1.0 with itself (the diagonal is
          always 1.0, even for an artist with no genres).
        - If an artist has no genres, its Jaccard similarity with every other
          artist is 0.

    Params: {num_artists: Integer (only the first num_artists entries of
                          input_data are compared),
             input_data: List of iterables of genre names, one per artist}
    Returns: np.ndarray of shape (num_artists, num_artists)
    """
    # Build each artist's genre set once rather than once per (i, j) pair.
    genre_sets = [set(input_data[i]) for i in range(num_artists)]

    jac_matrix = np.zeros((num_artists, num_artists))
    for i in range(num_artists):
        jac_matrix[i, i] = 1.0
        genres_i = genre_sets[i]
        if not genres_i:
            # No genres: every off-diagonal entry in this row/column stays 0.
            continue
        # Jaccard similarity is symmetric, so fill the upper triangle and mirror it.
        for j in range(i + 1, num_artists):
            genres_j = genre_sets[j]
            if genres_j:
                similarity = len(genres_i & genres_j) / len(genres_i | genres_j)
                jac_matrix[i, j] = similarity
                jac_matrix[j, i] = similarity
    return jac_matrix

In [16]:
# Read the artist CSV into `df`; a later cell reads its `genres` column.
df = pd.read_csv('../removed_dups_new.csv', delimiter=',')

In [17]:
# Parse each row's `genres` string into a list of genre names.
# The slicing/splitting assumes the cell text looks like "['pop', 'rock']":
# drop the surrounding brackets, split on the "', '" separator, then strip the
# leftover quotes and spaces from each piece.
# NOTE(review): an entry written with double quotes (e.g. a genre containing an
# apostrophe) is not split correctly by this scheme -- confirm whether the data
# contains any.
genres_list = []
for raw_genres in df['genres']:
    pieces = raw_genres[1:-1].split("', '")
    # "[]" leaves a single empty string after slicing; drop empty pieces so an
    # artist without genres gets [] (and therefore Jaccard 0 with others)
    # instead of [''], which would match every other genre-less artist.
    genres_list.append([name for name in (piece.strip(" ''") for piece in pieces) if name])

In [18]:
# Pairwise genre Jaccard matrix with one row and one column per entry of genres_list.
jaccard = build_genre_sims_jac(len(genres_list), genres_list)

In [19]:
# Save the matrix to disk. to_csv keeps its default index here, so the file
# gets a leading unnamed index column in addition to the similarity columns.
pd.DataFrame(jaccard).to_csv('jaccard.csv')

In [78]:
import zipfile

# Unpack every member of the archive into the current working directory.
with zipfile.ZipFile('lyrics_data/tfidf_svd.zip') as archive:
    archive.extractall()

In [79]:
# NOTE(review): `data` is not defined in any cell shown here. The recorded
# TypeError ("string indices must be integers") suggests its elements were
# strings rather than dicts when this cell last ran -- confirm how `data` is
# loaded; each element must provide 'artist_id' and 'artist_name'.

# Here, we will assign an index for each artist_id.
# This index will help us access data in numpy matrices.
artist_id_to_index = {d['artist_id']: index for index, d in enumerate(data)}

# We will also need a dictionary mapping artist names to artist ids
# (if a name occurs more than once, the last artist_id seen wins).
artist_name_to_id = {d['artist_name']: d['artist_id'] for d in data}
artist_id_to_name = {v: k for k, v in artist_name_to_id.items()}

# and because it might be useful...
artist_name_to_index = {name: artist_id_to_index[artist_id]
                        for name, artist_id in artist_name_to_id.items()}
artist_index_to_name = {v: k for k, v in artist_name_to_index.items()}

# to_numpy must be *called*; without the parentheses tfidf_mat is bound to the
# method object instead of the array.
tfidf_mat = pd.read_csv('tfidf_mat_compressed.csv').to_numpy()

TypeError: string indices must be integers

In [None]:
def rocchio_update(query, query_obj, input_doc_mat,
                   movie_name_to_index=None, a=.3, b=.3, c=.8):
    """Returns a vector representing the modified (Rocchio-updated) query vector.

    Computes  a * q  +  b * mean(relevant rows)  -  c * mean(irrelevant rows)
    and clips negative components to 0.

    Note:
        When the relevant (or irrelevant) list is empty, its term is omitted
        instead of dividing by zero.

    Params: {query: String (the name of the artist being queried for),
             query_obj: Dict (storing the names of relevant and irrelevant artists
                        for query under the keys 'relevant_artists' and
                        'irrelevant_artists'),
             input_doc_mat: Numpy Array (or list of rows) indexed by artist index,
             movie_name_to_index: Dict mapping artist name -> row index; the name is
                        kept for backward compatibility. When None, the module-level
                        `artist_name_to_index` is looked up at call time.
             a,b,c: floats (weighting of the original query, relevant artists,
                             and irrelevant artists, respectively)}
    Returns: np.ndarray
    """
    name_to_index = movie_name_to_index
    if name_to_index is None:
        # Resolved at call time so that defining this function does not require
        # any mapping to exist beforehand.
        name_to_index = artist_name_to_index

    q = np.asarray(input_doc_mat[name_to_index[query]], dtype=float)
    dimension = len(q)

    relevant = query_obj['relevant_artists']
    irrelevant = query_obj['irrelevant_artists']
    len_rel = len(relevant)
    len_irrel = len(irrelevant)

    # Sum the document vectors of the relevant / irrelevant artists.
    rel_d = np.zeros(dimension)
    for artist in relevant:
        rel_d += input_doc_mat[name_to_index[artist]]
    irrel_d = np.zeros(dimension)
    for artist in irrelevant:
        irrel_d += input_doc_mat[name_to_index[artist]]

    rocchio = a * q
    # Only add a feedback term when there is feedback; dividing by an empty
    # list's length would turn the whole vector into NaN.
    if len_rel:
        rocchio = rocchio + (b * rel_d / len_rel)
    if len_irrel:
        rocchio = rocchio - (c * irrel_d / len_irrel)

    # Negative weights are clipped to zero.
    return np.clip(rocchio, 0, None)

In [68]:
# Read the saved similarity matrix back from jaccard.csv into a DataFrame.
df2 = pd.read_csv('jaccard.csv')

In [69]:
# Remove the 'Unnamed: 0' column in place (presumably the row index that the
# earlier to_csv call wrote out -- confirm).
del df2['Unnamed: 0']

In [76]:
# Overwrite jaccard.csv, this time without writing an index column.
df2.to_csv('jaccard.csv', index=False)

In [80]:
# jaccard.csv was last written with index=False, so it has no index column.
# Passing index_col=[0] here would consume the first similarity column as the
# index and silently drop it from the matrix.
df2 = pd.read_csv('jaccard.csv')

In [82]:
# Display the reloaded table as a NumPy array (the result is not stored).
df2.to_numpy()

array([[1.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 1.  , 0.25, ..., 0.  , 0.  , 0.  ],
       [0.  , 0.25, 1.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 1.  , 1.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 1.  , 1.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 1.  ]])

In [46]:
# NOTE(review): `idx` and `query_obj` are not defined in any cell shown here --
# confirm they exist before this cell runs. rocchio_update looks `query` up in
# a name -> index mapping, so confirm `idx` holds an artist name, not a row index.
query_vec_genres = rocchio_update(idx,query_obj,input_doc_mat=jaccard)

array([[0.  , 0.25, 1.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]])

In [49]:
# Select row 0 of `jaccard` with a list index, which keeps the result 2-D
# (one row). Note this rebinds `df2`, previously the DataFrame read from jaccard.csv.
df2 = jaccard[[0]]

In [50]:
# Display the selected row.
df2

array([[1., 0., 0., ..., 0., 0., 0.]])

In [44]:
# Sum over axis 0, i.e. add the rows of df2 together, giving one value per column.
summed = np.sum(df2,axis=0)

In [None]:
summed = np