# Data preparation: convert [idomaar](https://github.com/crowdrec/idomaar/wiki/DATA-FORMAT) to CSV

Convert [30Music dataset](http://crowdrec.eu/2015/11/30music-dataset-release/) from idomaar format to CSV format.

In [None]:
import sys, csv, json
import numpy as np
import pandas as pd

In [None]:
# Column names assigned to the header-less, tab-separated idomaar tables read below, in file order.
COLUMNS = ['Type', 'ID', 'Timestamp', 'Properties', 'LinkedEntities']

## Convert albums data to CSV

In [None]:
# Path of the albums file in idomaar format.
falbums = 'albums.idomaar'

In [None]:
# Disabled debugging aid: print the raw rows whose fifth field (LinkedEntities
# in COLUMNS) is longer than two characters, e.g. anything other than '{}'.
#with open(falbums) as tsvin:
#    tsvin = csv.reader(tsvin, delimiter='\t')
#    for row in tsvin:
#        if len(row[4]) > 2:
#            print(row)

In [None]:
# The file is tab-separated and is read without a header row.
albums = pd.read_csv(falbums, delimiter='\t', header=None)

In [None]:
albums.columns = COLUMNS

In [None]:
albums.head()

In [None]:
# Show the rows whose Timestamp differs from -1.
albums[albums['Timestamp'] != -1]

In [None]:
# Show the rows whose LinkedEntities is not the string '{}'.
albums[albums['LinkedEntities'] != '{}']

In [None]:
# From here on the albums are indexed by their ID.
albums.set_index('ID', inplace=True)

In [None]:
albums.head()

Deal with illegally formatted JSON.

In [None]:
# Count the album rows whose 'Properties' cannot be used as-is: either the
# string is not valid JSON or the 'MBID'/'title' entries are missing.
cnt = 0
for raw_props in albums['Properties']:
    try:
        prop = json.loads(raw_props)
        _ = (prop['MBID'], prop['title'])  # only checks that both keys exist
    except (ValueError, KeyError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError covers cells that
        # are not strings and JSON values that are not objects.
        cnt += 1
print(cnt)

In [None]:
def parse_properties(props, debug=False):
    """Extract the MBID and title from an album ``Properties`` JSON string.

    Strings that are not valid JSON (e.g. a title containing unescaped
    double quotes) are repaired before a second parse: every double quote
    and backslash is stripped and the two known fields are re-quoted. The
    title of a repaired record therefore contains no double quotes or
    backslashes.

    Args:
        props: JSON text holding an ``MBID`` and a ``title`` entry.
        debug: If truthy, print the repaired string before parsing it.

    Returns:
        pandas.Series with the entries ``MBID`` and ``Title``.

    Raises:
        ValueError: If the string is still not valid JSON after the repair.
        KeyError: If ``MBID`` or ``title`` is missing from the parsed object.
    """
    try:
        prop = json.loads(props)
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        # Protect the real title key under a temporary name, drop all quotes
        # and backslashes, then put the quotes back around both fields.
        props = props.replace('"title":', '"TITLE":').replace('"', '').replace('\\', '')\
                     .replace('MBID:', '"MBID":"').replace(', TITLE:', '", "title":"').replace('}', '"}')
        if debug:
            print(props)
        prop = json.loads(props)
        # The re-quoting above turns a JSON null MBID into the string 'null';
        # map it back so both parse paths report a missing MBID the same way.
        if prop.get('MBID') == 'null':
            prop['MBID'] = None
    return pd.Series({'MBID': prop['MBID'], 'Title': prop['title']})

In [None]:
# Take the Properties string of the album with ID 708 as a sample input.
props = albums.loc[708, 'Properties']
props

In [None]:
# Kept disabled: parsing this sample directly with json.loads fails.
#json.loads(props)  # causes exception

In [None]:
# The same sample goes through parse_properties instead.
parse_properties(props)

In [None]:
# Parse every Properties string; parse_properties returns a Series per row,
# so the result is a frame with the columns MBID and Title.
prop_df = albums['Properties'].apply(lambda s: parse_properties(s, debug=False))

In [None]:
prop_df.head()

In [None]:
# Attach the parsed columns to the albums by matching index values.
albums = albums.merge(prop_df, left_index=True, right_index=True)

In [None]:
albums.head()

In [None]:
# Drop the raw columns so that they are not written to the CSV.
albums.drop(['Timestamp', 'Properties', 'LinkedEntities'], axis=1, inplace=True)

In [None]:
albums.head()

In [None]:
# Output path of the converted albums table.
fcsv_falbums = 'albums.csv'
albums.to_csv(fcsv_falbums)

In [None]:
# Read the file back as a quick check of what was written.
pd.read_csv(fcsv_falbums, index_col=0).head()

## Convert artists data to CSV

In [None]:
# Path of the artists (persons) file in idomaar format.
fartists = 'persons.idomaar'

In [None]:
# The file is tab-separated and is read without a header row.
artists = pd.read_csv(fartists, delimiter='\t', header=None)

In [None]:
artists.columns = COLUMNS
artists.head()

In [None]:
# From here on the artists are indexed by their ID.
artists.set_index('ID', inplace=True)

In [None]:
# Show the rows whose Timestamp differs from -1.
artists[artists['Timestamp'] != -1]

In [None]:
# Show the rows whose LinkedEntities is not the string '{}'.
artists[artists['LinkedEntities'] != '{}']

In [None]:
# Only Type and Properties remain after this drop.
artists.drop(['Timestamp', 'LinkedEntities'], axis=1, inplace=True)

In [None]:
print(artists.shape)
artists.head()

In [None]:
# Decode each Properties string exactly once, then keep only the MBID and
# name entries; the Series returned per row become the columns MBID and Name.
artists_prop = artists['Properties'].apply(json.loads).apply(
    lambda prop: pd.Series({'MBID': prop['MBID'], 'Name': prop['name']}))

In [None]:
# IDs present in `artists` but absent from `artists_prop`; an index can be
# passed to set() directly, so no intermediate list copies are needed.
set(artists.index) - set(artists_prop.index)

In [None]:
# Size and first rows of the parsed artist table.
print(artists_prop.shape)
artists_prop.head()

In [None]:
# Every parsed row gets the constant Type 'person'.
artists_prop['Type'] = 'person'
artists_prop.head()

In [None]:
#artists.sort_index(inplace=True)

In [None]:
# Disabled: merging the parsed columns back into `artists`.
#artists = artists.merge(artists_prop, left_index=True, right_index=True)
#print(artists.shape)

In [None]:
# Look at the parsed row of the artist with ID 297899.
artists_prop.loc[297899]

In [None]:
# The parsed frame (MBID, Name, Type) is what gets written, not `artists`.
fcsv_artists = 'persons.csv'
artists_prop.to_csv(fcsv_artists)

In [None]:
# Read the file back as a quick check of what was written.
pd.read_csv(fcsv_artists, index_col=0).head()

## Convert users data to CSV

In [None]:
# Path of the users file in idomaar format.
fusers = 'users.idomaar'

In [None]:
# The file is tab-separated and is read without a header row.
users = pd.read_csv(fusers, header=None, delimiter='\t')

In [None]:
# Only the first four names of COLUMNS are assigned (no 'LinkedEntities');
# presumably the users file has four fields per row -- confirm against the data.
users.columns = COLUMNS[:-1]
users.set_index('ID', inplace=True)
users.head()

In [None]:
def parse_user_properties(props):
    """Turn a user ``Properties`` JSON string into a Series of attributes.

    If the string is not valid JSON, empty values are repaired first:
    ``""`` is replaced by ``null`` and a value missing between ``:`` and
    ``,`` is filled with ``null``. The repaired text is then parsed again.

    Args:
        props: JSON text with the keys ``lastfm_username``, ``gender``,
            ``age``, ``country``, ``playcount``, ``playlists`` and
            ``subscribertype``.

    Returns:
        pandas.Series with the entries ``Username``, ``Gender`` (upper-cased,
        or None when the value is null), ``Age``, ``Country``, ``Playcount``,
        ``#Playlists`` and ``Subscribertype``.

    Raises:
        ValueError: If the string cannot be parsed even after the repair;
            the message carries the repaired string for inspection.
        KeyError: If one of the expected keys is missing.
    """
    try:
        prop = json.loads(props)
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        repaired = props.replace('""', 'null').replace(':,', ':null,')
        try:
            prop = json.loads(repaired)
        except ValueError as err:
            # Report the bad record through an exception rather than ending
            # the interpreter with a success status.
            raise ValueError(
                'cannot parse user properties: {!r}'.format(repaired)) from err
    gender = prop['gender']
    return pd.Series({'Username': prop['lastfm_username'],
                      'Gender': gender.upper() if gender is not None else None,
                      'Age': prop['age'],
                      'Country': prop['country'],
                      'Playcount': prop['playcount'],
                      '#Playlists': prop['playlists'],
                      'Subscribertype': prop['subscribertype']})

In [None]:
# Expand every Properties string into the user attribute columns returned by
# parse_user_properties.
user_prop = users['Properties'].apply(lambda s: parse_user_properties(s))

In [None]:
user_prop.shape

In [None]:
# The raw JSON column is dropped now that it has been parsed.
users.drop('Properties', axis=1, inplace=True)

In [None]:
# Attach the parsed attributes to the users by matching index values.
users = users.merge(user_prop, left_index=True, right_index=True)
print(users.shape)

In [None]:
users.head()

In [None]:
# Output path of the converted users table.
fcsv_users = 'users.csv'
users.to_csv(fcsv_users)

In [None]:
# Read the file back as a quick check of what was written.
pd.read_csv(fcsv_users, index_col=0).head()