# **Training a maximum entropy classifier**

This code bit predicts music preference based on age and gender and returns dataset entropy and average prediction accuracy over 30 runs.<br>
Run the code by clicking <b>Run All</b>.

In [54]:
#import pandas, scipy and sklearn packages

import pandas as pd
import scipy.stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from itertools import repeat
import numpy as np

**1. Read in the dataset**

In [55]:
df = pd.read_csv('cleanedfile.csv')
df.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q30,Q31,Q32,affiliative,selfenhancing,agressive,selfdefeating,age,gender,accuracy
0,2,2,3,1,4,5,4,3,4,3,...,4,2,2,4.0,3.5,3.0,2.3,25,2,100
1,2,3,2,2,4,4,4,3,4,3,...,4,3,1,3.3,3.5,3.3,2.4,44,2,90
2,3,4,3,3,4,4,3,1,2,4,...,5,4,2,3.9,3.9,3.1,2.3,50,1,75
3,3,3,3,4,3,5,4,3,-1,4,...,5,3,3,3.6,4.0,2.9,3.3,30,2,85
4,1,4,2,2,3,5,4,1,4,4,...,5,4,2,4.1,4.1,2.9,2.0,52,1,80


**2. Split the dataset**

In [56]:
# Run this section to inspect X
X = df.drop(columns = ['accuracy'])
X

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q29,Q30,Q31,Q32,affiliative,selfenhancing,agressive,selfdefeating,age,gender
0,2,2,3,1,4,5,4,3,4,3,...,2,4,2,2,4.0,3.5,3.0,2.3,25,2
1,2,3,2,2,4,4,4,3,4,3,...,4,4,3,1,3.3,3.5,3.3,2.4,44,2
2,3,4,3,3,4,4,3,1,2,4,...,2,5,4,2,3.9,3.9,3.1,2.3,50,1
3,3,3,3,4,3,5,4,3,-1,4,...,4,5,3,3,3.6,4.0,2.9,3.3,30,2
4,1,4,2,2,3,5,4,1,4,4,...,2,5,4,2,4.1,4.1,2.9,2.0,52,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,3,2,3,3,2,5,3,2,3,4,...,5,4,4,4,2.5,3.3,2.9,3.0,18,2
1067,1,4,5,2,4,4,1,2,2,5,...,1,4,1,2,4.8,3.9,2.5,2.4,31,1
1068,1,4,4,5,4,4,3,5,4,3,...,2,4,1,5,4.4,3.9,3.0,4.3,15,1
1069,3,4,4,3,3,4,3,2,4,3,...,3,4,3,3,3.1,3.6,2.9,2.8,21,2


In [57]:
# Uncomment this section to inpect y
y = df['accuracy']
y

0       100
1        90
2        75
3        85
4        80
       ... 
1066     95
1067     95
1068     95
1069     87
1070     75
Name: accuracy, Length: 1071, dtype: int64

**3. Compute entropy of data set**

In [58]:
# Compute the maximum entropy value
k = y.unique().size
maxE = np.log2(k)
p_data = y.value_counts(normalize=True)           # counts occurrence of each value
entropy = scipy.stats.entropy(p_data)  # get entropy from counts

# normalize the value to be between 0 and 1.
normalizedE = entropy/maxE

**4. Testing: entropy-based decision tree classifier averaged over 30 runs**

In [59]:
avg_score = 0.0
ntimes = 30

for i in repeat(None, ntimes):

    # train model with 80% of the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # prediction using entropy
    # Note: You can replace 'entropy' by 'gini' to get the classifier to use the gini index criterion.
    model = DecisionTreeClassifier(criterion='entropy')
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)

    # compute model accuracy
    avg_score += accuracy_score(y_test, predictions)

avg_score /= ntimes

**5. print outputs**

In [60]:
print('normalized entropy value: %.3f'% normalizedE)
print('average accuracy score: %.3f' % avg_score)

# output visual (can be visualized with visual code)
tree.export_graphviz(model, out_file='SupervisedModel.dot',
                    feature_names=['Q1','Q2','Q3','Q4','Q5','Q6','Q7','Q8','Q9','Q10','Q11','Q12','Q13','Q14','Q15','Q16','Q17','Q18','Q19','Q20','Q21','Q22','Q23','Q24','Q25','Q26','Q27','Q28','Q29','Q30','Q31','Q32','affiliative', 'selfenhancing','agressive','selfdefeating','age','gender'],
                    class_names=sorted(str(y.unique())),
                    label='all',
                    rounded=True,
                    filled=True)

normalized entropy value: 0.453
average accuracy score: 0.136
