In [None]:
'''
	Package: cs771
	Module: assn2Utils
	Author: Puru
	Institution: CSE, IIT Kanpur
	License: GNU GPL v3.0
	
	Various utilities for multi-label learning problems
'''

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.datasets import dump_svmlight_file
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from scipy import sparse as sps
from sklearn.model_selection import train_test_split
import time as tm
import pandas as pd

def loadData( filename, dictSize = 225 ):
	"""Load a single-label dataset stored in svmlight/libsvm text format.

	Args:
		filename: path of the svmlight file to read.
		dictSize: number of feature columns to allocate (passed as n_features).

	Returns:
		Tuple (X, y) exactly as produced by load_svmlight_file: the feature
		matrix and the per-row label vector (multilabel is disabled).

	Side effects:
		Prints the label vector to stdout.
	"""
	# offset = 1 makes the loader seek past the first byte and discard everything
	# up to the next newline, i.e. the first line of the file is not parsed.
	# NOTE(review): presumably that line is a header -- confirm against the data file.
	features, labels = load_svmlight_file(
		filename,
		n_features = dictSize,
		multilabel = False,
		offset = 1,
	)
	print(labels)
	return (features, labels)


In [None]:


# This file is intended to demonstrate how we would evaluate your code
# The data loader needs to know how many feature dimensions are there
dictSize = 225
(X, y) = loadData("train", dictSize = dictSize )

# Hold out 30% of the rows for testing. The shuffle is seeded so that the split,
# and therefore every score computed from it, is identical on each
# Restart & Run All instead of changing from run to run.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)

# The split above was taken from X as returned by the loader; only this
# display copy is densified, Xtrain/Xtest are left as they were.
X=X.todense()
print(X)


[ 1.  2.  4. ...  2. 32.  3.]
[[0. 0. 2. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Fit a depth-limited CART classifier on the training split.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split even with splitter='best'; without a seed, ties between equally
# good splits are broken differently on each run and the scores below drift.
decision_tree = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=20, random_state=42)
decision_tree = decision_tree.fit(Xtrain, ytrain)

# Hard predictions on the held-out split and the fraction that match the truth
predict = decision_tree.predict(Xtest)
print(np.average(ytest == predict))


# Same accuracy via sklearn helpers, plus the training score to expose overfitting
print('Model accuracy score with criterion gini index: {0:0.4f}'. format(accuracy_score(ytest, predict)))
print('Training set score: {:.4f}'.format(decision_tree.score(Xtrain, ytrain)))

print('Test set score: {:.4f}'.format(decision_tree.score(Xtest, ytest)))

# Per-class probabilities for every test row (column j <-> decision_tree.classes_[j])
decision_tree.predict_proba(Xtest)

0.752
Model accuracy score with criterion gini index: 0.7520
Training set score: 0.9184
Test set score: 0.7520


array([[0.02697095, 0.96680498, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.5       , 0.5       , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
def precision_k(k, x, y):
  """Top-k hit indicator for a single sample.

  Args:
    k: how many leading entries of the ranking to inspect.
    x: ranked sequence of predicted labels, best first.
    y: the true label.

  Returns:
    1 when y occurs among the first k entries of x, otherwise 0.
  """
  leading = x[:k]
  hit = y in leading
  return int(hit)

# Largest k for which precision@k is reported
precsion_value_k = 5

# The class probabilities do not depend on k, so score the whole test split once
# instead of calling predict_proba for every row on every pass of the k loop.
test_proba = decision_tree.predict_proba(Xtest)

# Rank the columns of each row from most to least probable and keep the best
# precsion_value_k. A stable sort makes the ordering of tied probabilities
# deterministic (among ties the higher column index is ranked first).
ranked_cols = np.argsort(test_proba, axis=1, kind='stable')[:, ::-1][:, :precsion_value_k]

# Column j of predict_proba belongs to label decision_tree.classes_[j]; mapping
# through classes_ stays correct even when some label is absent from the
# training split, where a fixed "column index + 1" shift would be wrong.
ranked_labels = decision_tree.classes_[ranked_cols]

for k in range(1, precsion_value_k+1):
  # 1 for every test row whose true label is among its k best-ranked labels
  precision_array = [
    precision_k(k, ranked_labels[i], ytest[i])
    for i in range(Xtest.shape[0])
  ]
  print(np.average(precision_array))


0.7386666666666667
0.7856666666666666
0.7956666666666666
0.8026666666666666
0.8076666666666666


In [None]:
# Number of label classes assumed by this cell: the per-class count arrays
# below are allocated with c+1 slots and indexed directly by int(label).
# NOTE(review): presumably labels run 1..c -- confirm against the dataset.
c = 50

def macro_precision_k(k, x, y):
  """Pick the label credited to a sample when scoring at cutoff k.

  Args:
    k: how many leading entries of the ranking to inspect.
    x: ranked sequence of predicted labels, best first.
    y: the true label.

  Returns:
    y itself when it occurs among the first k entries of x; otherwise the
    top-ranked prediction x[0].
  """
  if y not in x[:k]:
    return x[0]
  return y

# Largest k for which the per-class score is reported
precsion_value_k = 5

# The class probabilities do not depend on k, so score the whole test split once
# instead of calling predict_proba for every row on every pass of the k loop.
test_proba = decision_tree.predict_proba(Xtest)

# Best precsion_value_k columns per row, most probable first (this cutoff was a
# hard-coded 5 before). A stable sort keeps the ordering of tied probabilities
# deterministic.
ranked_cols = np.argsort(test_proba, axis=1, kind='stable')[:, ::-1][:, :precsion_value_k]

# Column j of predict_proba belongs to label decision_tree.classes_[j]; mapping
# through classes_ stays correct even when some label is absent from the
# training split, where a fixed "column index + 1" shift would be wrong.
ranked_labels = decision_tree.classes_[ranked_cols]

for k in range(1, precsion_value_k+1):
  # Per-label tallies, indexed directly by the integer label value
  predict_class_predict = np.zeros(c+1, dtype=int)
  predict_class_actual = np.zeros(c+1, dtype=int)
  for i in range(Xtest.shape[0]):
    # Credit the true label on a top-k hit, otherwise the top-ranked prediction
    m = macro_precision_k(k, ranked_labels[i], ytest[i])
    predict_class_actual[int(ytest[i])] += 1
    predict_class_predict[int(m)] += 1

  # Per-label agreement = smaller count / larger count; a label with a zero
  # count on either side scores 0 (the smaller count is 0 exactly in that case).
  smaller = np.minimum(predict_class_predict, predict_class_actual)
  larger = np.maximum(predict_class_predict, predict_class_actual)
  fractions = np.divide(smaller, larger, out=np.zeros(c+1), where=smaller > 0)

  # NOTE(review): the mean runs over all c+1 slots; if labels start at 1, slot 0
  # is always 0 and lowers the average -- confirm whether that is intended.
  print(np.average(fractions))

0.608605256836401
0.6509264904198881
0.6667030984355073
0.6719975148178279
0.672500199206449
