In [1]:
# Machine-learning walkthrough written in Python inside a Jupyter notebook.
# Goal: predict the music genre a new user is likely to enjoy from two inputs,
# their age and their gender.

# Step 1: Import the sample data.
# The data is read from a local mock-up CSV file.
# The gender column is encoded as 1 = Male, 0 = Female.

import pandas as pd

music_data = pd.read_csv(filepath_or_buffer='music.csv')
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [8]:
# Step 2: Clean the data.
# Split the CSV into two parts: the input features (X) and the output labels (y).
# The model will later be asked to predict y from X.

import pandas as pd

music_data = pd.read_csv('music.csv')

# Input data set X: drop the label column so only the features remain
# (age and gender).
X = music_data.drop(columns=['genre'])

# Output data set y: the genre recorded for each row, i.e. what we want to predict.
y = music_data['genre']

# A notebook cell only renders its LAST bare expression, so a lone `X` in the
# middle of the cell is evaluated and silently thrown away. display() (provided
# by the IPython kernel) renders X explicitly; y is still shown as the cell output.
display(X)
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

In [11]:
# Step 3: Create the model using a decision-tree algorithm.

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

music_data = pd.read_csv('music.csv')
X = music_data.drop(columns=['genre'])
y = music_data['genre']

# Create an instance of the algorithm model
model = DecisionTreeClassifier()

# Train the model on the two data sets.
# NOTE: X holds age AND gender; y holds the genre label for each row.
model.fit(X, y)

# Ask the model to make a prediction for a 21 year old male (male = 1)
# and a 22 year old female (0 = female).
# The model was fitted on a DataFrame, so the query is built as a DataFrame with
# the same columns. Passing a bare nested list makes recent scikit-learn versions
# warn about missing feature names and relies on getting the column order right.
new_users = pd.DataFrame([[21, 1], [22, 0]], columns=X.columns)
predictions = model.predict(new_users)
predictions

# Expected result: array(['HipHop', 'Dance'], dtype=object) - consistent with the imported data

array(['HipHop', 'Dance'], dtype=object)

In [29]:
# Step 4: Measure the accuracy of the model.
# The data set is split further into a training part and a testing part.

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fixed seed so the split, the fitted tree and therefore the score are identical
# on every run. Without it the score changes each time the cell is executed.
RANDOM_STATE = 42

music_data = pd.read_csv('music.csv')
X = music_data.drop(columns=['genre'])
y = music_data['genre']

# Call the train/test split function with three arguments: the inputs, the
# labels, and the share of rows to hold back (20% for testing).
# It returns a list of four pieces, which is unpacked straight into variables.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

model = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Train on the training split only, then predict the held-back test rows
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Compare the true test labels with the predicted ones (two arguments).
# The result is the fraction of test rows that were predicted correctly.
score = accuracy_score(y_test, predictions)
score

1.0

In [33]:
# Step 5: Train once (persistence), then load the model for predictions.

import os

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib

# File the trained model is stored in between runs
MODEL_FILE = 'music-recommender.joblib'

if os.path.exists(MODEL_FILE):
    # No need to re-train every time the cell runs: reuse the stored model.
    # NOTE: joblib files are pickle-based, so only load files you created or trust.
    model = joblib.load(MODEL_FILE)
else:
    # First run (or the file was removed): train the model and store it, so the
    # notebook also works from a fresh checkout instead of failing on a missing file.
    music_data = pd.read_csv('music.csv')
    X = music_data.drop(columns=['genre'])
    y = music_data['genre']

    model = DecisionTreeClassifier()
    model.fit(X, y)

    # Two arguments: the object to store and the file to store it in
    joblib.dump(model, MODEL_FILE)

# Predict for a 21 year old male (male = 1). The query uses named columns to
# match the DataFrame the model is trained on above.
new_user = pd.DataFrame([[21, 1]], columns=['age', 'gender'])
predictions = model.predict(new_user)
predictions

array(['HipHop'], dtype=object)

In [34]:
# Step 6: Visualise the trained model by exporting the decision tree.

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

music_data = pd.read_csv('music.csv')
y = music_data['genre']
X = music_data.drop(columns=['genre'])

# fit() returns the fitted estimator, so creation and training can be chained
model = DecisionTreeClassifier().fit(X, y)

# Settings for the Graphviz export, gathered in one place:
#   out_file      - the .dot file the tree description is written to
#   feature_names - labels used for the input columns
#   class_names   - the distinct genres, passed in sorted order
#   label / rounded / filled - presentation options for the drawn nodes
export_options = dict(
    out_file='music-recommender.dot',
    feature_names=['age', 'gender'],
    class_names=sorted(y.unique()),
    label='all',
    rounded=True,
    filled=True,
)
tree.export_graphviz(model, **export_options)