# Structured data classification from scratch
Reference: https://www.tensorflow.org/tutorials/structured_data/feature_columns

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from scipy.io import arff
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold

import tensorflow as tf
from tensorflow import keras

# Show which TensorFlow version this notebook is running against.
print(tf.__version__)

2023-08-17 10:17:13.944861: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.13.0


In [3]:
# loadarff returns a pair; its first element holds the records that become
# the DataFrame rows (this is what the original indexed with [0]).
raw_data = arff.loadarff('./assets/genre.arff')
arff_records, _arff_meta = raw_data
dataframe = pd.DataFrame(arff_records)

# Column/dtype summary first, then a preview of the first rows.
dataframe.info()
dataframe.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Columns: 241 entries, highlevel_danceability_value to genre
dtypes: float64(237), object(4)
memory usage: 1.5+ MB


Unnamed: 0,highlevel_danceability_value,highlevel_equalization_profile_value,highlevel_excitement_value,highlevel_intensity_value,highlevel_speech_music_value,highlevel_voice_instrumental_value,loudness_dynamic_complexity_dvar,loudness_dynamic_complexity_mean,loudness_dynamic_complexity_var,loudness_larm_dvar,...,tonal_chords_strength_dvar,tonal_chords_strength_mean,tonal_chords_strength_var,tonal_dissonance_dvar,tonal_dissonance_mean,tonal_dissonance_var,tonal_key_mode_value,tonal_key_strength_value,tonal_tuning_equal_tempered_deviation_value,genre
0,0.206472,0.372766,0.029087,0.093526,b'music',b'voice',-0.411385,0.024215,-0.201421,-0.532833,...,-0.255337,1.140774,-1.490395,0.494393,0.149725,-0.013634,b'minor',0.526779,-0.205793,b'blu'
1,-0.368479,0.372766,1.341977,0.093526,b'music',b'voice',-0.349921,-0.015672,-0.173045,-0.452769,...,-0.121114,0.497749,-0.901662,0.347729,-0.51764,0.184709,b'major',-0.129683,-0.667979,b'blu'
2,-0.631747,0.372766,-1.283803,-1.324366,b'speech',b'instrumental',4.235612,5.779127,3.973258,1.713613,...,-0.190639,1.408974,-0.502029,1.174496,-1.45416,2.144489,b'major',-1.022781,0.937795,b'blu'
3,-0.190116,0.372766,1.341977,0.093526,b'music',b'voice',-0.307576,0.068818,-0.216986,-0.519222,...,0.257633,0.622749,1.399033,0.791524,-0.601227,0.873673,b'major',0.074454,-0.406373,b'blu'
4,0.588278,0.372766,1.341977,0.802473,b'music',b'instrumental',-0.332032,-1.078238,0.35068,-0.765884,...,-0.309481,1.301291,-0.708195,-0.093155,0.287901,-0.644858,b'major',0.919307,-0.917709,b'blu'


In [4]:
# Hold out a fixed-seed 20% sample of the rows for validation; every row that
# was not sampled stays in the training frame.
val_dataframe = dataframe.sample(frac=0.2, random_state=1337)
train_dataframe = dataframe.drop(index=val_dataframe.index)

split_sizes = (len(train_dataframe), len(val_dataframe))
print("Using %d samples for training and %d for validation" % split_sizes)

Using 640 samples for training and 160 for validation


In [5]:
def dataframe_to_dataset(dataframe):
    """Turn a DataFrame into a shuffled dataset of ``(features, label)`` pairs.

    The "genre" column is split off as the label; the remaining columns are
    passed as a dict keyed by column name. The work is done on a copy, so the
    caller's DataFrame is left untouched.

    Args:
        dataframe: pandas DataFrame that contains a "genre" column.

    Returns:
        A ``tf.data.Dataset`` whose shuffle buffer spans every row.
    """
    features = dataframe.copy()
    labels = features.pop("genre")
    row_count = len(features)
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    return dataset.shuffle(buffer_size=row_count)


# Build one shuffled (features, label) dataset per split.
train_ds = dataframe_to_dataset(train_dataframe)
val_ds = dataframe_to_dataset(val_dataframe)

In [6]:
# Sanity check: print the first element yielded by the training dataset
# (x is the dict of features, y is the "genre" label).
for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

Input: {'highlevel_danceability_value': <tf.Tensor: shape=(), dtype=float64, numpy=0.200762>, 'highlevel_equalization_profile_value': <tf.Tensor: shape=(), dtype=float64, numpy=0.372766>, 'highlevel_excitement_value': <tf.Tensor: shape=(), dtype=float64, numpy=0.029087>, 'highlevel_intensity_value': <tf.Tensor: shape=(), dtype=float64, numpy=1.511419>, 'highlevel_speech_music_value': <tf.Tensor: shape=(), dtype=string, numpy=b'music'>, 'highlevel_voice_instrumental_value': <tf.Tensor: shape=(), dtype=string, numpy=b'voice'>, 'loudness_dynamic_complexity_dvar': <tf.Tensor: shape=(), dtype=float64, numpy=-0.238809>, 'loudness_dynamic_complexity_mean': <tf.Tensor: shape=(), dtype=float64, numpy=0.048441>, 'loudness_dynamic_complexity_var': <tf.Tensor: shape=(), dtype=float64, numpy=-0.182564>, 'loudness_larm_dvar': <tf.Tensor: shape=(), dtype=float64, numpy=-0.130532>, 'loudness_larm_mean': <tf.Tensor: shape=(), dtype=float64, numpy=1.593277>, 'loudness_larm_var': <tf.Tensor: shape=(), dt

In [7]:
# Number of samples per batch.
BATCH_SIZE = 32

# Rebuild each dataset from its source frame before batching. The previous
# `train_ds = train_ds.batch(32)` form was self-referential: running the cell
# a second time batched the already-batched dataset. Built this way, the cell
# yields the same structure no matter how often it is executed.
train_ds = dataframe_to_dataset(train_dataframe).batch(BATCH_SIZE)
val_ds = dataframe_to_dataset(val_dataframe).batch(BATCH_SIZE)

In [8]:
from keras.layers import IntegerLookup
from keras.layers import Normalization
from keras.layers import StringLookup


def encode_numerical_feature(feature, name, dataset):
    """Normalize one numerical feature with statistics learned from `dataset`.

    Args:
        feature: Input to normalize; it is passed straight to the adapted
            ``Normalization`` layer (presumably a Keras tensor - confirm
            against the caller).
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset of ``(features_dict, label)`` elements used to adapt
            the layer.

    Returns:
        The output of the adapted ``Normalization`` layer applied to `feature`.
    """
    # Keep only the requested feature and give it a trailing axis.
    feature_values = dataset.map(
        lambda features, label: tf.expand_dims(features[name], -1)
    )

    # Learn the feature's statistics, then apply them to the input.
    scaler = Normalization()
    scaler.adapt(feature_values)
    return scaler(feature)


def encode_categorical_feature(feature, name, dataset, is_string):
    """Encode one categorical feature with a lookup layer adapted on `dataset`.

    Args:
        feature: Input to encode; it is passed straight to the adapted lookup
            layer (presumably a Keras tensor - confirm against the caller).
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset of ``(features_dict, label)`` elements used to learn
            the vocabulary.
        is_string: True selects ``StringLookup``, False selects
            ``IntegerLookup``.

    Returns:
        The output of the adapted lookup layer applied to `feature`.
    """
    # Keep only the requested feature and give it a trailing axis.
    feature_values = dataset.map(
        lambda features, label: tf.expand_dims(features[name], -1)
    )

    # NOTE(review): "binary" appears to be the legacy spelling of the
    # "multi_hot" output mode - confirm against the installed Keras version.
    if is_string:
        encoder = StringLookup(output_mode="binary")
    else:
        encoder = IntegerLookup(output_mode="binary")

    # Learn the set of possible values, then encode the input with it.
    encoder.adapt(feature_values)
    return encoder(feature)

## Helper functions: feature-type split and model training

In [9]:
def divide_numeric_categorical(dataframe=None, label="genre", verbose=True):
    """Split the feature column names into numeric and categorical lists.

    Columns whose dtype is ``object`` are treated as categorical; every other
    column is treated as numeric. The label column is left out of both lists
    because it is a target, not a feature.

    Args:
        dataframe: pandas DataFrame to inspect. Defaults to the notebook-level
            ``train_dataframe``.
        label: Name of the label column to exclude. Ignored if the column is
            not present.
        verbose: When True, print every column name (numeric first, then
            categorical), as the original implementation did.

    Returns:
        A ``(num, cat)`` tuple of lists of column names.
    """
    if dataframe is None:
        # The dtype information lives on the pandas frame; a tf.data.Dataset
        # (train_ds) has no select_dtypes method.
        dataframe = train_dataframe

    features = dataframe.drop(columns=[label], errors="ignore")
    num = list(features.select_dtypes(exclude='object').columns)
    cat = list(features.select_dtypes(include='object').columns)

    if verbose:
        for column in num + cat:
            print(column)

    return num, cat

In [10]:
def keras_sequential_model(input_shape=(32, 32), epochs=10):
    """Build, compile and train a small dense network, then return it.

    The model is trained on the notebook-level ``train_ds`` and validated on
    ``val_ds``.

    Args:
        input_shape: Per-sample input shape handed to ``keras.Input``.
        epochs: Number of training epochs.

    Returns:
        The trained ``tf.keras.Sequential`` model. The original implementation
        returned nothing, so the trained model could not be used afterwards.
    """
    # NOTE(review): train_ds yields (dict of named features, "genre" label),
    # while this model expects a single tensor of `input_shape` and a binary
    # target (one logit + BinaryCrossentropy). These look incompatible -
    # confirm the intended inputs/labels before relying on this function.

    # Model structure
    model = tf.keras.Sequential([
        keras.Input(shape=input_shape),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(.1),
        keras.layers.Dense(1)
    ])

    # Compile the model; the final layer emits a raw logit, hence from_logits.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    # Train the model
    model.fit(train_ds,
              validation_data=val_ds,
              epochs=epochs)

    return model

In [None]:
# Derive the feature-name lists from the training frame. The "genre" column
# is the label (dataframe_to_dataset pops it), so it is not a feature.
feature_frame = train_dataframe.drop(columns=["genre"])
NUMERIC_FEATURE_NAMES = list(feature_frame.select_dtypes(exclude="object").columns)
CATEGORICAL_FEATURE_NAMES = list(feature_frame.select_dtypes(include="object").columns)

# Symbolic model inputs and their preprocessed counterparts, keyed by feature
# name, so a later cell can assemble a model from them.
feature_inputs = {}
encoded_features = {}

numerical_features = []
for feature in NUMERIC_FEATURE_NAMES:
    # The encoders expect a Keras tensor as `feature`, not a pandas column.
    feature_inputs[feature] = keras.Input(shape=(1,), name=feature)
    encoded_features[feature] = encode_numerical_feature(
        feature_inputs[feature], feature, train_ds
    )
    numerical_features.append(feature)

categorical_features = []
for feature in CATEGORICAL_FEATURE_NAMES:
    # The object columns hold byte strings, hence a string-typed input.
    feature_inputs[feature] = keras.Input(shape=(1,), name=feature, dtype="string")
    encoded_features[feature] = encode_categorical_feature(
        feature_inputs[feature], feature, train_ds, is_string=True
    )
    categorical_features.append(feature)