In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras import Input, layers
tf.keras.backend.clear_session() # for easy reset of notebook state

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from IPython import display

In [2]:
tf.__version__

'2.2.0'

Workflow:

- [x] establish DF with columns for: list of genres, length of list, gender
- [x] encode genres as integers (ordinal encoding) in pandas (don't use 1-num_genre_labels)
- [x] create arrays of fixed length (max length of genre list) with integer encodings of genre labels and padded with zeros
- [x] load dataframe into TF Dataset with columns for codes of genre labels (num_cols = max_num genre labels), length of list, gender
    - https://www.tensorflow.org/tutorials/load_data/pandas_dataframe
- [ ] transform gender to 0,1 (just to practice this in TF)
- [ ] split into train and validation
- [ ] define model with embedding layer of the genrelist column
- [ ] train
- [ ] tune



May be helpful:

https://www.tensorflow.org/tutorials/structured_data/feature_columns


Later try using sparse tensors:

https://www.tensorflow.org/api_docs/python/tf/sparse/SparseTensor


First try to go directly from the dataframe with the ordinal encoding of the genre labels to a sparse matrix by extracting the index, value info. If that doesn't work:

Use a scipy sparse matrix and convert it to a tf sparse tensor:

https://stackoverflow.com/questions/40896157/scipy-sparse-csr-matrix-to-tensorflow-sparsetensor-mini-batch-gradient-descent

Also read:

https://www.tensorflow.org/guide/data

Import the DF of genre lists and gender:

In [3]:
%store -r now
now
X_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now), index_col = ['artist'])
y_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now), index_col = ['artist'])

### Genre Labels -- as a list

Each value of the genre column is a _string_ of comma separated genre labels using the spotify abbreviations. We want to convert it to a _list_ of strings.

In [4]:
def genrelist(string):
    """Parse the string form of a genre list into a list of genre labels.

    The input has the form appearing in the ``genrelist`` column of the
    dataframe, e.g. ``"['pop', 'rock', 'emo pop']"``. Square brackets and
    single quotes are stripped, the remainder is split on commas, spaces
    inside a label become underscores, and leading/trailing underscores
    are trimmed.

    Parameters
    ----------
    string : str
        String representation of a list of genre labels.

    Returns
    -------
    list of str
        One entry per non-empty genre label, in the original order.
    """
    unwrapped = string.strip("[").strip("]").replace("'", "")
    labels = (piece.replace(" ", "_").strip("_") for piece in unwrapped.split(","))
    # Drop empty entries, e.g. from an empty list "[]" or a trailing comma.
    return [label for label in labels if label]

Now we apply it to the whole column and put the lists in a new column:

In [5]:
X_train['genrelist']= X_train['genrelist'].apply(genrelist)

In [6]:
data_train = X_train.join(y_train, how = 'inner', on = 'artist')

In [7]:
# Move the artist names out of the index into an ordinary column and
# name the resulting integer index 'artist_id'.
data_train = data_train.reset_index().rename_axis('artist_id')

In [8]:
data_train.head()

Unnamed: 0_level_0,artist,genrelist,genrelist_length,gender
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Pablo_Holman,"[pop, rock, emo_pop]",3,male
1,Bobby_Edwards,[country],1,male
2,La_Palabra,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4,male
3,Sherrick,"[r_and_b, soul]",2,male
4,Allen_Collins,[southern_rock],1,male


Full genre_list (not just that for the training set)

In [9]:
# Read the complete list of genre labels and discard the unnamed column
# that the CSV carries.
genre_list = pd.read_csv(
    '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now)
).drop(columns=['Unnamed: 0'])
# Number the labels consecutively starting at 1; 0 is the fill value used
# for the label_* columns created further down.
genre_list['genre_id'] = range(1, len(genre_list) + 1)

In [10]:
genre_list.head(10)

Unnamed: 0,genre_list,genre_id
0,chilean,1
1,zamba,2
2,afro_punk_blues,3
3,crunk,4
4,spanish_guitar,5
5,chanson,6
6,cybergrind,7
7,outsider,8
8,folk_alternative,9
9,alternative_music,10


Create a dictionary {genre_label: genre_id}

In [11]:
genre_list.set_index(['genre_list'], inplace = True)

In [12]:
genre_list.head()

Unnamed: 0_level_0,genre_id
genre_list,Unnamed: 1_level_1
chilean,1
zamba,2
afro_punk_blues,3
crunk,4
spanish_guitar,5


In [13]:
label_id_dict = genre_list['genre_id'].to_dict()

In [14]:
label_id_dict['pop']

1007

Find max length of genre lists:

In [15]:
data_train.genrelist_length.max()

73

In [16]:
def encode_list(row):
    """Translate the genre labels of one row into their integer ids.

    Each label in ``row.genrelist`` is looked up in the module-level
    ``label_id_dict``; a label missing from it raises ``KeyError``.
    Returns the ids as a list in the same order as the labels.
    """
    encoded = []
    for label in row.genrelist:
        encoded.append(label_id_dict[label])
    return encoded

In [17]:
data_train['genres_encoded_as_list'] = data_train.apply(encode_list, axis = 1)

In [18]:
data_train.head()

Unnamed: 0_level_0,artist,genrelist,genrelist_length,gender,genres_encoded_as_list
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Pablo_Holman,"[pop, rock, emo_pop]",3,male,"[1007, 1431, 794]"
1,Bobby_Edwards,[country],1,male,[465]
2,La_Palabra,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4,male,"[1442, 1357, 1004, 809]"
3,Sherrick,"[r_and_b, soul]",2,male,"[1426, 359]"
4,Allen_Collins,[southern_rock],1,male,[1186]


Check that the encoding is consistent:

In [19]:
n = np.random.randint(data_train.shape[0])
[label_id_dict[item] for item in data_train.genrelist.iloc[n]], data_train.genres_encoded_as_list.iloc[n]

([561, 1007, 896], [561, 1007, 896])

Not used: create a column holding a fixed-length array padded with zeros:

In [20]:
# def encode_array(row):
#     arr = np.zeros((73), dtype = 'int64')
#     for i, id in enumerate(row.genres_encoded_as_list):
#         arr[i] = id
#     return arr

Encode into new columns

In [21]:
# One integer column per genre position, pre-filled with 0.
# The number of columns is derived from the longest encoded genre list
# instead of being hard-coded, so it always matches what encode_columns
# writes, even if the underlying data changes.
max_genres = max(map(len, data_train['genres_encoded_as_list']), default=0)
for i in range(max_genres):
    data_train['label_{}'.format(i)] = np.zeros((data_train.shape[0]), dtype = 'int64')

In [22]:
def encode_columns(row):
    """Spread a row's encoded genre ids across its ``label_<k>`` entries.

    The k-th id in ``row.genres_encoded_as_list`` is written to the entry
    named ``label_k``. The row is modified in place and returned.
    """
    for position, genre_id in enumerate(row.genres_encoded_as_list):
        column = 'label_{}'.format(position)
        row.loc[column] = genre_id
    return row

In [23]:
# data_train['genres_encoded'] = data_train.apply(encode_array, axis = 1)

In [24]:
data_train = data_train.apply(encode_columns, axis = 1)

In [25]:
data_train.head()

Unnamed: 0_level_0,artist,genrelist,genrelist_length,gender,genres_encoded_as_list,label_0,label_1,label_2,label_3,label_4,...,label_63,label_64,label_65,label_66,label_67,label_68,label_69,label_70,label_71,label_72
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Pablo_Holman,"[pop, rock, emo_pop]",3,male,"[1007, 1431, 794]",1007,1431,794,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Bobby_Edwards,[country],1,male,[465],465,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,La_Palabra,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4,male,"[1442, 1357, 1004, 809]",1442,1357,1004,809,0,...,0,0,0,0,0,0,0,0,0,0
3,Sherrick,"[r_and_b, soul]",2,male,"[1426, 359]",1426,359,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Allen_Collins,[southern_rock],1,male,[1186],1186,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Encode targets. The categories still appear as strings. To see the encoding use df.column.cat.codes.

In [31]:
data_train['gender'] = pd.Categorical(data_train['gender'], categories = ['male','female'])

In [35]:
data_train_tf = data_train.drop(['artist','genrelist', 'genres_encoded_as_list'], axis = 1)

In [36]:
data_train_tf.head()

Unnamed: 0_level_0,genrelist_length,gender,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,...,label_63,label_64,label_65,label_66,label_67,label_68,label_69,label_70,label_71,label_72
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,male,1007,1431,794,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,male,465,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,male,1442,1357,1004,809,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,male,1426,359,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,male,1186,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
data_train_tf.shape, data_train_tf.dtypes

((12376, 75),
 genrelist_length       int64
 gender              category
 label_0                int64
 label_1                int64
 label_2                int64
                       ...   
 label_68               int64
 label_69               int64
 label_70               int64
 label_71               int64
 label_72               int64
 Length: 75, dtype: object)

Now convert to a tf.data.Dataset

In [38]:
target = data_train_tf.pop('gender')

In [39]:
target.head()

artist_id
0    male
1    male
2    male
3    male
4    male
Name: gender, dtype: category
Categories (2, object): [male, female]

In [40]:
data_train_tf.head()

Unnamed: 0_level_0,genrelist_length,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,...,label_63,label_64,label_65,label_66,label_67,label_68,label_69,label_70,label_71,label_72
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,1007,1431,794,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,465,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,1442,1357,1004,809,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,1426,359,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1186,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
data_set = tf.data.Dataset.from_tensor_slices((data_train_tf.values, target.values)) 

In [43]:
# Peek at the first two (features, label) pairs of the dataset.
# The loop variables are deliberately not called `target`: that name holds
# the label Series used to build `data_set`, and overwriting it here would
# break a re-run of the cell that constructs the dataset.
for example_features, example_label in data_set.take(2):
    print(example_features, example_label)

tf.Tensor(
[   3 1007 1431  794    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0], shape=(74,), dtype=int64) tf.Tensor(b'male', shape=(), dtype=string)
tf.Tensor(
[  1 465   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0], shape=(74,), dtype=int64) tf.Tensor(b'male', shape=(), dtype=string)


Shuffle the dataset and batch it (batch size 1):

In [44]:
train_dataset = data_set.shuffle(len(data_train_tf)).batch(1) 

## Build Model

### Train Model

In [None]:
# NOTE(review): `model` is not defined in the visible part of this notebook
# (the "Build Model" section is empty) — define and compile it before running.
# Trains for a single epoch on the shuffled, batched `train_dataset`.
model.fit(train_dataset, 
          epochs = 1
)

Ignore for now: Another possibility: Create TF Dataset using the corpus from gensim as a generator:

In [None]:
def corpus_gen():
    """Yield ``(index, document)`` pairs for every entry of the global ``corpus``.

    Written as a zero-argument generator function so it can be handed to
    ``tf.data.Dataset.from_generator``.
    """
    # Iterate over positions: `len(corpus)` on its own is an int and cannot
    # be looped over directly.
    for i in range(len(corpus)):
        yield (i, corpus[i])

In [None]:
# Build a dataset whose elements are produced by `corpus_gen`; the tuple
# passed as the second argument describes the element types.
# NOTE(review): `--tensor_one_hot--` is a placeholder, not valid Python —
# replace it with the real type of the second tuple element before running.
dataset = tf.data.Dataset.from_generator(
    corpus_gen,
    (tf.int64, --tensor_one_hot--)
)