In [12]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the music-preference dataset from 'music.csv' in the working directory.
music_data = pd.read_csv(filepath_or_buffer='music.csv')
# Leave the frame as the cell's last expression so the notebook renders it.
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [5]:
# Prepare the input (feature) set: every column except the 'genre' target.
# Note: this cell only separates the features from the target; it does not
# remove duplicates or perform any other cleaning.
X = music_data.drop('genre', axis='columns')
X

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [6]:
# Output (target) set: the 'genre' column, i.e. the answers the model
# should learn to predict.
y = music_data.loc[:, 'genre']
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

In [7]:
# Build a model with a very simple machine-learning algorithm: a decision tree.
# fit() returns the estimator itself, so creation and training can be chained.
model = DecisionTreeClassifier().fit(X, y)

# The data has males aged 20, 23 and 25 but no 21-year-old male (and likewise
# for the female sample), so these two inputs check what the model predicts
# for ages it has not seen.
# predict() expects a 2-D array: the outer list holds the samples and each
# inner list holds one sample's input values.
predictions = model.predict([[21, 1], [22, 0]])
predictions



array(['HipHop', 'Dance'], dtype=object)

In [8]:
# After building a model we need to measure its accuracy. If it is not
# accurate enough we should either fine-tune it or build a model using a
# different algorithm.
#
# Calculating the accuracy of a model:
# So far the entire dataset was used for training and only two hand-written
# samples were used for predictions, which is not enough to measure accuracy.
# Instead, split the dataset into a training set and a testing set. A general
# rule of thumb is to allocate 70-80% of the data to training and the
# remaining 20-30% to testing.
#
# random_state pins the shuffle so the split (and therefore the reported
# score) is the same on every Restart & Run All; without it the score changes
# from run to run. The decision tree itself may still break ties between
# equally good splits randomly unless it is also given a random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,    # 20% of the rows are held out for testing
    random_state=42,
)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# accuracy_score(y_true, y_pred): expected labels first, predicted labels second.
score = accuracy_score(y_test, predictions)
score  # 1.0 => 100%, 0.26 => 26%

1.0

In [9]:
# Model persistence:
# once training is done, the fitted model can be exported to a file and reused
# later for predictions without reading the dataset and retraining every time.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib

# Import the data and separate the inputs from the target column.
music_data = pd.read_csv('music.csv')
X = music_data.drop('genre', axis='columns')
y = music_data.loc[:, 'genre']

# Train on the full dataset; fit() returns the fitted estimator.
model = DecisionTreeClassifier().fit(X, y)

# Write the trained model to disk. dump() returns the list of file names it
# wrote, which the notebook shows as this cell's output.
joblib.dump(model, filename='music-recommender.joblib')

['music-recommender.joblib']

In [10]:
# Load the trained model back from disk and predict with it.
# joblib.load() returns the deserialized estimator; it must be bound to a name,
# otherwise the predictions below would silently come from whatever `model`
# is still in kernel memory and the persisted file would never be exercised.
# NOTE: joblib files are pickle-based -- only load files from a trusted source.
model = joblib.load('music-recommender.joblib')
predictions = model.predict([[21, 1], [22, 0]])
predictions



array(['HipHop', 'Dance'], dtype=object)

In [13]:
# Visualizing the decision tree: export the trained model to Graphviz DOT
# format so it can be rendered as a diagram. Decision trees are easy to
# understand when drawn this way.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Import the data and separate the inputs from the target column.
music_data = pd.read_csv('music.csv')
X = music_data.drop('genre', axis='columns')
y = music_data.loc[:, 'genre']

# Train on the full dataset; fit() returns the fitted estimator.
model = DecisionTreeClassifier().fit(X, y)

# Write the tree to 'music-recommender.dot'.
tree.export_graphviz(
    model,
    out_file='music-recommender.dot',
    feature_names=['age', 'gender'],  # labels used for the input columns
    label='all',
    filled=True,
    rounded=True,
)

<module 'sklearn.tree' from '/home/deepak/.local/lib/python3.9/site-packages/sklearn/tree/__init__.py'>