In [55]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [56]:
# Cleaning data involves removing duplicates, null values, etc.
# According to the original author this data set needs none of that, so the only preparation
# left is splitting it into two sets.
# When training a model you give it an 'input' set and an 'output' set: the 'input' holds the
# columns used to make the prediction, and the 'output' holds the column(s) being predicted.
# Load the raw data from 'music.csv' into a DataFrame.
music_data = pd.read_csv('music.csv')

In [57]:
# Tip: hover over a function and press Shift + Tab to see its description --
# try it on the drop() call below.

# Input set: every column of music_data except the 'genre' column.
X = music_data.drop(labels=['genre'], axis='columns')
X

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [58]:
# Output set: the 'genre' column on its own, i.e. the values the model is trained to predict.
y = music_data.loc[:, 'genre']
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

In [59]:
# After building a model its accuracy must be tested, so the data set is split into a
# 'training' set and a 'testing' set.
# A common rule of thumb is to train on 70-80% of the data and test on the rest; here 20% of
# the rows are held out for testing (test_size=0.2).
# random_state pins the shuffle so the same rows land in each split on every
# Restart & Run All, which keeps the accuracy reported below comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
# Create the decision tree classifier and train it on the training split in one step
# (fit() returns the fitted estimator, so the result can be assigned directly).
model = DecisionTreeClassifier().fit(X_train, y_train)
# Display the full data set for reference.
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [61]:
# Ask the model to predict what kind of music a 21-year-old male and a 22-year-old female would like (the two rows in the call below), since neither example is in the data set.

# predictions = model.predict([[21, 1], [22, 0]])

In [62]:
from sklearn.metrics import accuracy_score

# Test the accuracy of the model: predict a genre for every held-out test row,
# then compare those predictions with the true test labels.
predictions = model.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

1.0

In [63]:
# 'from sklearn.externals import joblib' does not work; use 'import joblib' instead
import joblib

# Persist the trained model with joblib.dump. This call must actually run: the next cell
# loads 'music-recommender.joblib', and with the dump commented out a fresh
# Restart & Run All either fails because the file is missing or silently loads a stale
# model from an earlier session.
# The trailing ';' suppresses the list of written file names that dump returns.
joblib.dump(model, 'music-recommender.joblib');

In [64]:
# Load the previously saved (and, in theory, already tested) model from its joblib file,
# so it can be reused whenever it is needed without training it again.
model = joblib.load('music-recommender.joblib')

# Predict the genre for one new sample given as [age, gender] = [21, 1].
new_listener = [[21, 1]]
predictions = model.predict(new_listener)
predictions



array(['HipHop'], dtype=object)

In [66]:
# Visualize the model: export the fitted decision tree to a Graphviz .dot file

from sklearn import tree

tree.export_graphviz(
    model,
    out_file='music-recommender.dot',
    # set feature names to 'age' and 'gender' so we can see the rules in our boxes/nodes
    feature_names=['age', 'gender'],
    # Take the class names from the model itself. The tree numbers its classes in the
    # order of model.classes_ (the labels it was trained on), so names built from the
    # full 'genre' column would be mislabeled whenever a genre is missing from the
    # training data of the fitted/loaded model.
    class_names=[str(label) for label in model.classes_],
    # puts labels on all the boxes/nodes
    label='all',
    # set to True rounds the corners of the boxes/nodes
    rounded=True,
    # set to True fills each box/node with color
    filled=True,
)