In [1]:
import pandas as pd
import os

In [2]:
def drop_duplicated(df):
    """
    Strips repeated and incomplete rows from a dataframe.

    Three filters are applied in order: fully identical rows are collapsed,
    then only the first row for each index label is kept, and finally every
    row holding at least one NaN is discarded.

    Input:
        df: dataframe; the entire dataset
    Output:
        dataframe; a new frame without duplicates or missing values.
    """
    unique_rows = df.drop_duplicates()
    # Mask is True for the first occurrence of every index label.
    first_occurrence = ~unique_rows.index.duplicated(keep='first')
    unique_labels = unique_rows[first_occurrence]
    return unique_labels.dropna(how='any')

In [3]:
def refine_demographics(path):
    """
    Reads the tab-separated user demographics file and keeps only the
    fields needed downstream.

    Input
        path: string; the file path for the user_demographic.csv file.
    Output
        dataframe indexed by 'uuid' with the columns 'age' and 'country'.
    """
    required_columns = ['uuid', 'age', 'country']
    demographics = pd.read_csv(path, sep='\t')
    # Select the required stats, then key the frame by user id.
    return demographics[required_columns].set_index('uuid')


In [4]:
def join_files(path, refined_demographics, output_path="../cleaned_data/users.pkl"):
    """
    Loads every .csv file found in a directory, cleans it and attaches the
    refined demographics to it, then stacks the results into one dataframe
    which is cached as a pickle.

    Input
        path: string; directory that contains the per-user .csv files.
        refined_demographics: dataframe; demographics whose index uses the
            same labels as the first column of each .csv file.
        output_path: string; where the combined dataframe is pickled. The
            default matches the location load_users() checks for its cache
            (this function previously wrote to "../../cleaned_data/users.pkl",
            a different directory, so the cache was never found).
    Output
        users: dataframe; the concatenation of all joined files.
    Raises
        ValueError: if the directory holds no .csv files.
    """
    frames = []
    # Sorted so the row order of the result does not depend on the
    # arbitrary ordering returned by os.listdir.
    for file_name in sorted(os.listdir(path)):
        if not file_name.endswith('.csv'):
            continue
        user_df = pd.read_csv(os.path.join(path, file_name), index_col=[0])
        # Note: drop_duplicated also keeps only the first row per index label.
        user_df = drop_duplicated(user_df)
        # Inner join on the index so only rows with known demographics remain.
        user_df = pd.concat([user_df, refined_demographics], axis=1, join='inner')
        frames.append(user_df)

    if not frames:
        raise ValueError("No .csv files found in %r" % (path,))

    users = pd.concat(frames)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    users.to_pickle(output_path)
    return users

In [5]:
def load_users(demographics_path="../user_demographic.csv", users_dir="../dirty_data/users"):
    """
    Loads the combined users dataframe, using the cached pickle when it is
    available and regenerating it from the original dataset otherwise.

    Input
        demographics_path: string; path of the tab-separated demographics
            file handed to refine_demographics.
            NOTE(review): the original passed the directory ".." here, which
            cannot be read as a csv; the file name is taken from the
            refine_demographics docstring - confirm the real location.
        users_dir: string; directory of per-user .csv files handed to
            join_files.
            NOTE(review): the original passed the pickle path
            "../cleaned_data/users.pkl", which os.listdir cannot list; the
            default here is a placeholder - confirm the real directory.
    Output
        dataframe; the users data.
    """
    cache_path = "../cleaned_data/users.pkl"
    # isfile also copes with a missing cleaned_data directory, where
    # os.listdir would raise before the file could be generated.
    if os.path.isfile(cache_path):
        return pd.read_pickle(cache_path)

    print("Cleaned File not found ....")
    print("Generating File")
    refined = refine_demographics(demographics_path)
    refined = drop_duplicated(refined)
    return join_files(users_dir, refined)
#users = load_users()

In [6]:
def add_artist_name(artist_lookup, artists):
    """
    Attaches the artist metadata to each row of the lookup by matching on
    the 'MBID' column.

    Input
        artist_lookup: the left-hand table of the merge; every one of its
            rows is kept. It must expose an 'MBID' column (it is passed
            straight to pd.merge, so a plain dict will not work).
        artists: the dataframe of artists, also holding an 'MBID' column.
    Return
        Dataframe, the left join of artist_lookup with artists.
    """
    return pd.merge(left=artist_lookup, right=artists, on='MBID', how='left')

In [7]:
def read_artists():
    """
    Reads the artists dataframe. If the artists.pkl cache is not found the
    data is loaded from the mbdump and the cache is written.

    Output
        artists: Dataframe with the columns 'pkID', 'MBID' and 'Artist'.
    """
    cache_path = '../cleaned_data/artists.pkl'
    if os.path.isfile(cache_path):
        print("Cleaned data found")
        # read_pickle takes no index_col argument; passing one raised a
        # TypeError on every cache hit.
        return pd.read_pickle(cache_path)

    print("Cleaned data not found: Loading file.....")
    print("Loading File....")
    # Only the first three fields are needed, so read just those; this
    # avoids depending on how many further columns the dump contains.
    artists = pd.read_csv(
        '../dirty_data/mbdump/mbdump/artist',
        sep='\t',
        header=None,
        usecols=[0, 1, 2],
    )
    artists.columns = ['pkID', 'MBID', 'Artist']
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    artists.to_pickle(cache_path)
    return artists
    
#artists = read_artists()

In [8]:
def read_recording():
    """
    Reads the recordings file. If the recordings.pkl cache is not found it
    is generated from the mbdump and written to disk.

    Output:
        recordings: pandas Dataframe with the columns 'pkID', 'MBID' and
        'Recording'.
    """
    cache_path = '../cleaned_data/recordings.pkl'
    if os.path.isfile(cache_path):
        print("Cleaned data found")
        # pandas exposes read_pickle (there is no read_pkl) and it takes no
        # index_col argument.
        return pd.read_pickle(cache_path)

    print("Cleaned data not found: Loading file.....")
    print("Loading File....")
    # Reduce the dataset to just the required fields while reading, so the
    # result does not depend on how many further columns the dump contains.
    recordings = pd.read_csv(
        '../dirty_data/mbdump/mbdump/recording',
        sep='\t',
        header=None,
        usecols=[0, 1, 2],
    )
    # The third field is named 'Recording' (it was previously labelled
    # 'Artist', which made the later column selection fail).
    recordings.columns = ['pkID', 'MBID', 'Recording']
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    recordings.to_pickle(cache_path)
    return recordings
    
#artists = read_artists()

In [9]:
def user_id_convert(df, item="item"):
    """
    Replaces the string based ids used for identifying users and items with
    integer based ids.

    The original string columns are kept under new names: 'userID' becomes
    'userMBID' and the item column is renamed by swapping a trailing 'ID'
    for 'MBID' (e.g. 'artistID' -> 'artistMBID', 'item' -> 'itemMBID').
    Integer ids are assigned in order of first appearance, starting at 0.

    Inputs
        df: Dataframe, the sample dataset used within this file. It must
            contain a 'userID' column and the column named by `item`.
        item: string, name of the item id column to convert.

    Output
        df: Dataframe, a modified copy with integer 'userID' and `item`
            columns plus the renamed string columns. The input is not
            modified.
    """
    # Name under which the original string ids of the item column are kept.
    item_stem = item[:-2] if item.endswith('ID') else item
    mbid_column = item_stem + 'MBID'

    # Lookup series mapping each unique string id to its integer id.
    user_values = df['userID'].unique()
    item_values = df[item].unique()
    user_lookup = pd.Series(range(len(user_values)), index=user_values)
    item_lookup = pd.Series(range(len(item_values)), index=item_values)

    # Renames the columns to be more understandable.
    df = df.rename(columns={'userID': 'userMBID', item: mbid_column})

    # Maps the lookup values to the correlating columns.
    df['userID'] = df['userMBID'].map(user_lookup)
    df[item] = df[mbid_column].map(item_lookup)

    return df

In [10]:
def add_lookups_tracks(df, tracks):
    """
    Maps an integer based id to each user uuid and each recording-MBID,
    and adds the track name to the dataset.

    Integer ids are assigned in order of first appearance, starting at 0,
    before the track names are joined, so ids stay stable even when rows
    without a known track are removed by the join.

    Inputs
        df: Dataframe, the sample dataset used within this file. It must
            contain the columns 'uuid' and 'recording-MBID'.
        tracks: Dataframe with the columns 'recordingMBID' and 'trackName'.

    Output
        df: Dataframe, a modified copy with 'uuid' renamed to 'userMBID',
            'recording-MBID' renamed to 'recordingMBID', and the columns
            'userID', 'songID' and 'trackName' added. Rows whose recording
            has no entry in `tracks` are dropped (inner join).
    """
    user_values = df['uuid'].unique()
    recording_values = df['recording-MBID'].unique()

    # Lookup series mapping each unique string id to its integer id.
    user_lookup = pd.Series(range(len(user_values)), index=user_values)
    song_lookup = pd.Series(range(len(recording_values)), index=recording_values)

    df = df.rename(columns={'uuid': 'userMBID', 'recording-MBID': 'recordingMBID'})

    df['userID'] = df['userMBID'].map(user_lookup)
    df['songID'] = df['recordingMBID'].map(song_lookup)

    # Adds track name to the dataset. The merge result replaces the frame
    # (it cannot be assigned to a single column). One name per recording is
    # kept so repeated entries in `tracks` do not multiply rows.
    track_names = tracks[['recordingMBID', 'trackName']].drop_duplicates(subset='recordingMBID')
    df = pd.merge(df, track_names, on='recordingMBID', how='inner')

    return df