In [3]:
import ast
import json

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [60]:
def build_genre_sims_jac(num_artists, input_data):
    """Returns a matrix of size num_artists x num_artists where entry [i,j]
       is the Jaccard similarity between the genre sets of artists i and j,
       i.e. |G_i & G_j| / |G_i | G_j|.

    Notes:
        - Artists sometimes contain *duplicate* genres; each genre is counted
          once because every artist's genres are collapsed into a set.
        - An artist has a Jaccard similarity of 1.0 with itself, even when it
          has no genres.
        - If an artist has no genres, its Jaccard similarity with every other
          artist is 0.

    Params: {num_artists: Integer, number of leading entries of input_data to compare,
             input_data: indexable collection where input_data[i] is an iterable of
                         hashable genre names (a list of lists of strings in this
                         notebook); must hold at least num_artists entries}
    Returns: np.ndarray of floats with shape (num_artists, num_artists)
    """
    # Build every artist's genre set once instead of once per (i, j) pair.
    genre_sets = [set(input_data[i]) for i in range(num_artists)]

    jac_matrix = np.zeros((num_artists, num_artists))
    for i in range(num_artists):
        jac_matrix[i, i] = 1.0
        genres_i = genre_sets[i]
        if not genres_i:
            # No genres: every off-diagonal entry in this row/column stays 0.
            continue
        # Jaccard similarity is symmetric, so only visit j > i and mirror the value.
        for j in range(i + 1, num_artists):
            genres_j = genre_sets[j]
            if not genres_j:
                continue
            shared = len(genres_i & genres_j)
            if shared:
                # |A | B| = |A| + |B| - |A & B|, which avoids building the union set.
                similarity = shared / (len(genres_i) + len(genres_j) - shared)
                jac_matrix[i, j] = similarity
                jac_matrix[j, i] = similarity
    return jac_matrix

In [61]:
# Load the artist/song table; the file is comma-separated.
df = pd.read_csv('compiled-w-songs-filtered.csv', sep=',')

In [62]:
# Rows whose 'Songs' value repeats an earlier row (the first occurrence is not flagged).
is_repeat = df.duplicated(subset='Songs')
duplicate = df[is_repeat]

In [63]:
# Order the repeated rows by 'Songs' (ascending) so identical values sit together.
duplicate = duplicate.sort_values(by='Songs', ascending=True)

In [64]:
# Save the repeated rows for manual inspection; the original row index is written too.
duplicate.to_csv('duplicates.csv', index=True)

In [65]:
def _parse_genres(raw):
    """Turn one 'Genres' cell into a list of genre-name strings.

    The cell is expected to hold the text of a Python list, e.g.
    "['rock', 'pop']" (assumed from the slicing/splitting this cell has always
    done; confirm against the CSV). A cell that is not a string (e.g. NaN for
    a missing value) or that holds "[]" yields an empty list.
    """
    if not isinstance(raw, str):
        # Missing value: treat as "no genres" rather than failing on slicing.
        return []
    try:
        # literal_eval handles names quoted with double quotes (those that
        # contain an apostrophe), which a plain split on "', '" cannot.
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple, set)):
        names = [str(name).strip() for name in parsed]
    else:
        # Not a list literal: fall back to the manual bracket/quote stripping.
        names = [part.strip(" ''") for part in raw[1:-1].split("', '")]
    # Drop empty names so an artist without genres gets [] instead of [''];
    # otherwise two genre-less artists would look identical to each other.
    return [name for name in names if name]


# One list of genre names per row of df, in row order.
genres_list = [_parse_genres(raw) for raw in df['Genres']]

In [67]:
# Pairwise genre similarity between every pair of rows in genres_list.
jaccard = build_genre_sims_jac(num_artists=len(genres_list), input_data=genres_list)