In [101]:
# Predict preferred music genre depending on age and gender using a decision tree
# Based on a Python ML tutorial (https://www.youtube.com/watch?v=7eh4d6sabA0)

In [41]:
import os

# Directory the notebook is executed from; the data and output paths used
# further down are built by appending to this string.
base_path: str = os.getcwd()

In [42]:
# Prepare data
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the example dataset from the data folder below base_path
data = pd.read_csv(base_path + '/data/music.csv')

# Cleaning of data not necessary here, data was constructed manually for this example

# Split data into input and output sets (output is to be predicted by input)
X = data.drop(columns=['genre'])  # Input: every column except the label
y = data['genre']  # Target: the label column

# Split data into training and test sets
# Use 20% of the data for testing (samples are picked randomly over the whole set)
# A 80/20 or 70/30 split is common
# random_state pins the shuffle so a Restart & Run All reproduces the same partition
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Train model (decision tree in this case)
from sklearn.tree import DecisionTreeClassifier

# scikit-learn randomly permutes the features at every split, so an unseeded
# tree can differ between runs; a fixed random_state makes fitting deterministic.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Persist model (demonstrates how to save and load a model)
import os

import joblib

model_filename = base_path + '/output/music-recommender.joblib'

# The output folder may be missing (e.g. on a fresh checkout); create it first,
# otherwise writing the file fails with FileNotFoundError.
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

# Last expression of the cell: joblib.dump returns the list of files it wrote
joblib.dump(model, model_filename)

In [45]:
# Load model
# Read the estimator back from the file written by the persist step.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model = joblib.load(filename=model_filename)

In [49]:
# Measure accuracy of model (calculate accuracy scores for test data)
from sklearn.metrics import accuracy_score

# Predict the held-out inputs with the reloaded model ...
predictions = loaded_model.predict(X_test)
# ... and compare those predictions against the held-out labels
scores = accuracy_score(y_true=y_test, y_pred=predictions)
scores

1.0

In [47]:
# Visualize decision tree
from sklearn import tree

# Write the fitted tree as a Graphviz .dot file into the output folder.
# Feature names are taken from the input frame instead of being hard-coded.
# Class names come from the model's own classes_ (the labels it was fitted on,
# in the order the tree indexes them) rather than from the full label column,
# so the rendered labels stay aligned even if the training split lacks a genre.
tree.export_graphviz(
    loaded_model,
    out_file=base_path + '/output/music-recommender.dot',
    feature_names=list(X.columns),
    class_names=[str(label) for label in loaded_model.classes_],
    label='all',
    rounded=True,
    filled=True
)