In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import sys
import math
# Configure NumPy array printing in one place: never summarise arrays with
# "...", show three digits after the decimal point, and print small values in
# fixed-point rather than scientific notation.
np.set_printoptions(
    threshold=sys.maxsize,
    precision=3,
    suppress=True,
)

In [2]:
# Load the training split of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
# TF-IDF features with the vocabulary capped at 2000 terms; the vocabulary and
# IDF weights are fitted on the training documents only.
vectorizer = TfidfVectorizer(max_features=2000)
vect_train = vectorizer.fit_transform(newsgroups_train.data)
# Dense copy of the feature matrix, one row per document.
# NOTE(review): for a SciPy sparse matrix .todense() presumably returns an
# np.matrix (not an ndarray), which makes `*` a matrix product downstream - confirm.
vect_train_dense = vect_train.todense()
print(vect_train_dense.shape)

(11314, 2000)


In [3]:
# Load the held-out test split of the 20 Newsgroups corpus.
newsgroups_test = fetch_20newsgroups(subset='test')
# Reuse the already-fitted vectorizer (transform only, no refit) so the test
# documents are mapped onto the same feature columns as the training data.
vect_test = vectorizer.transform(newsgroups_test.data)
# Dense copy of the feature matrix, one row per document.
vect_test_dense = vect_test.todense()
print(vect_test_dense.shape)

(7532, 2000)


In [4]:
class Perceptron():
    """One-vs-rest batch perceptron: one sign unit per class.

    Features are expected column-wise, i.e. ``input_features`` has shape
    ``(input_num, n_samples)``; both ``np.ndarray`` and ``np.matrix`` inputs
    are accepted.

    Attributes:
        w: weight array of shape ``(num_of_bit, input_num)``.
        b: bias array of shape ``(num_of_bit, 1)``.
            NOTE(review): the bias is allocated but never enters the
            activation or the update rule; this is kept as-is so existing
            results do not change.
    """

    def __init__(self, input_num, lr, steps, num_of_bit, seed=None):
        """Create a model with weights drawn uniformly from [0, 1).

        Args:
            input_num: number of input features.
            lr: learning rate applied to every weight update.
            steps: number of full-batch update steps run by ``train``.
            num_of_bit: number of output units (one per class).
            seed: optional seed for a private random generator. When ``None``
                (the default) NumPy's global random state is used, exactly as
                before, so ``np.random.seed`` still controls initialisation.
        """
        self.input_num = input_num
        self.steps = steps
        self.lr = lr
        self.num_of_bit = num_of_bit
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.w = rng.random_sample((self.num_of_bit, self.input_num))
        self.b = rng.random_sample((self.num_of_bit, 1))

    def change_label_to_1_and_m1s(self, labels, num_of_cls=20):
        """Encode integer class labels as rows of -1 with a single +1.

        Args:
            labels: sequence of integer class indices.
            num_of_cls: number of classes (row width).

        Returns:
            Integer array of shape ``(len(labels), num_of_cls)``.
        """
        idx = np.asarray(labels, dtype=int).ravel()
        encoded = np.full((idx.size, num_of_cls), -1, dtype=int)
        encoded[np.arange(idx.size), idx] = 1
        return encoded

    def precision(self, a, b, num_of_bin, num_of_cls):
        """Return the fraction of samples whose arg-max output equals the label.

        Despite the name this is plain accuracy. Ties are resolved in favour
        of the lowest class index.

        Args:
            a: per-sample unit outputs, shape ``(n_samples, n_units)``.
            b: true integer labels, at least ``n_samples`` long.
            num_of_bin: unused; kept for call compatibility.
            num_of_cls: unused; kept for call compatibility.

        Returns:
            Accuracy as a float in [0, 1]; ``0.0`` when ``a`` has no rows.

        Raises:
            ValueError: if fewer labels than predictions are supplied.
        """
        outputs = np.asarray(a)
        total = len(outputs)
        if total == 0:
            # Nothing to score; avoid dividing by zero.
            return 0.0
        truth = np.asarray(b).astype(int).ravel()
        if truth.size < total:
            raise ValueError('fewer labels than predictions')
        predicted = np.argmax(outputs, axis=1)
        correct = np.count_nonzero(predicted == truth[:total])
        return float(correct) / total

    def compute_unit(self, input_feature, w, b, threshold):
        """Return ``sign(w . input_feature)`` for every unit and sample.

        Args:
            input_feature: features of shape ``(input_num, n_samples)``.
            w: weights of shape ``(n_units, input_num)``.
            b: unused; kept for call compatibility.
            threshold: unused; kept for call compatibility.

        Returns:
            Array of -1/0/+1 values with shape ``(n_units, n_samples)``.
        """
        return np.sign(np.dot(w, input_feature))

    def train(self, input_features, labels, num_of_bit, threshold, num_of_cls):
        """Run ``self.steps`` full-batch perceptron updates on ``self.w``.

        Prints the training accuracy every 20 steps (measured before that
        step's update is applied).

        Args:
            input_features: features of shape ``(input_num, n_samples)``.
            labels: integer class label per sample.
            num_of_bit: only forwarded to ``precision``, which ignores it.
            threshold: only forwarded to ``compute_unit``, which ignores it.
            num_of_cls: number of classes used to encode the labels.

        Returns:
            ``self.w`` (the same array object, updated in place).
        """
        # Work on a plain ndarray so the update below does not depend on
        # np.matrix overloading ``*`` as a matrix product.
        features = np.asarray(input_features)
        targets = self.change_label_to_1_and_m1s(labels, num_of_cls).T
        for step in range(self.steps):
            y = self.compute_unit(features, self.w, self.b, threshold)
            # Sum over all samples of sign(error) * x, scaled by the learning rate.
            self.w += np.dot(self.lr * np.sign(targets - y), features.T)
            if step % 20 == 0:
                print('precision: ', self.precision(y.T, labels, num_of_bit, num_of_cls))
        return self.w

    def test_all(self, input_features, labels, num_of_bit, threshold, num_of_cls, stored_w):
        """Score ``stored_w`` on a data set, print the accuracy and return it.

        Args:
            input_features: features of shape ``(input_num, n_samples)``.
            labels: integer class label per sample.
            num_of_bit: only forwarded to ``precision``, which ignores it.
            threshold: only forwarded to ``compute_unit``, which ignores it.
            num_of_cls: only forwarded to ``precision``, which ignores it.
            stored_w: weight array to evaluate, shape ``(n_units, input_num)``.

        Returns:
            The accuracy as a float.
        """
        features = np.asarray(input_features)
        y = self.compute_unit(features, stored_w, self.b, threshold)
        accuracy = self.precision(y.T, labels, num_of_bit, num_of_cls)
        print('precision: ', accuracy)
        return accuracy
    

In [5]:
# Perceptron draws its initial weights from NumPy's global random state, so
# seed it here to make Restart & Run All give the same training trace.
np.random.seed(0)
num_of_cls = len(newsgroups_train.target_names)
model = Perceptron(int(vect_train.shape[1]), lr=0.2, steps=200, num_of_bit=num_of_cls)
# The trainer expects features column-wise, hence the transpose.
# num_of_bit matches the value the model was constructed with (one unit per class).
weight_list = model.train(
    vect_train_dense.T,
    newsgroups_train.target,
    num_of_bit=num_of_cls,
    threshold=0.5,
    num_of_cls=num_of_cls,
)

precision:  0.04242531377054976
precision:  0.5979317659536857
precision:  0.7782393494785221
precision:  0.8497436803959696
precision:  0.8901361145483472
precision:  0.9216015555948383
precision:  0.9346826940074244
precision:  0.9749867420894467
precision:  0.983295032702846
precision:  0.9871840197984798


In [6]:
# Evaluate the trained weights on the held-out split; test_all prints the score.
num_of_cls = len(newsgroups_test.target_names)
# Keyword arguments replace the bare positional 5 / 0.5, and num_of_bit now
# matches the one-unit-per-class model instead of an unrelated constant.
test_acc = model.test_all(
    vect_test_dense.T,
    newsgroups_test.target,
    num_of_bit=num_of_cls,
    threshold=0.5,
    num_of_cls=num_of_cls,
    stored_w=weight_list,
)

precision:  0.6028943175783325
