In [43]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import sys
import math
np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(precision=3)
np.set_printoptions(suppress=True)

In [44]:
newsgroups_train = fetch_20newsgroups(subset='train')
vectorizer = TfidfVectorizer(max_features=1000)
vect_train = vectorizer.fit_transform(newsgroups_train.data)
vect_train_dense = vect_train.todense()
print(vect_train_dense.shape)

(11314, 1000)


In [45]:
newsgroups_test = fetch_20newsgroups(subset='test')
vectorizer_test = TfidfVectorizer(max_features=1000)
vect_test = vectorizer_test.fit_transform(newsgroups_test.data)
vect_test_dense = vect_test.todense()
print(vect_test_dense.shape)

(7532, 1000)


In [51]:
class Perceptron():

    def __init__(self, input_num, lr, steps, num_of_bit):
        self.input_num = input_num
        self.steps = steps
        self.lr = lr
        self.num_of_bit = num_of_bit
        self.w = np.random.random_sample((self.num_of_bit, self.input_num))
        self.b = np.random.random_sample((self.num_of_bit, 1))
    
    def sigmoid(self, x):
        return 1.0 / (1 + np.exp(-x))
    
    def change_label_to_1_and_m1s(self, labels, num_of_cls=20):
        res = []
        for label in labels:
            tmp = [-1] * num_of_cls
            tmp[label] = 1
            res.append(tmp)
        return np.array(res)
    
    def precision(self, a, b, num_of_bin, num_of_cls):
        correct = 0
        pred = a.tolist()
        for i in range(len(pred)):
            answer = pred[i].index(max(pred[i]))
            if answer==int(b[i]):
                correct+=1
                    
        return float(correct) / len(a)
        
           
    def compute_unit(self, input_feature, w, b, threshold):
        wx_plus_b = np.dot(w, input_feature)
        y = np.sign(wx_plus_b)
        return y
    
    def train(self, input_features, labels, num_of_bit, threshold, num_of_cls):
        multi_cls_label = self.change_label_to_1_and_m1s(labels, num_of_cls)
        for step in range(self.steps):
            y = self.compute_unit(input_features, self.w, self.b, threshold)
            self.w += self.lr * np.sign(multi_cls_label.T-y) * input_features.T
#             if step % 20 == 0:
            print('precision: ', self.precision(y.T, labels, num_of_bit, num_of_cls))
        return self.w
    
    def test_all(self, input_features, labels, num_of_bit, threshold, num_of_cls, stored_w):
        y = self.compute_unit(input_features, stored_w, self.b, threshold)
        print('precision: ', self.precision(y.T, labels, num_of_bit, num_of_cls))
    

In [52]:
num_of_cls = len(newsgroups_train.target_names)
model = Perceptron(int(vect_train.shape[1]), lr=0.1, steps=200, num_of_bit=num_of_cls)
weight_list = model.train(vect_train_dense.T, newsgroups_train.target, num_of_bit=5, threshold=0.5, num_of_cls=num_of_cls)

precision:  0.04242531377054976
precision:  0.04330917447410288
precision:  0.04330917447410288
precision:  0.04330917447410288
precision:  0.04330917447410288
precision:  0.04330917447410288
precision:  0.0434859466148135
precision:  0.04410464910730069
precision:  0.04543044016263037
precision:  0.04772847799186848
precision:  0.0522361675799894
precision:  0.062400565670850276
precision:  0.08511578575216545
precision:  0.13284426374403394
precision:  0.1950680572741736
precision:  0.26763302103588477
precision:  0.36255966059748984
precision:  0.42602085911260384
precision:  0.47003712214954924
precision:  0.511048258794414
precision:  0.5273112957397914
precision:  0.5466678451476047
precision:  0.5586883507159272
precision:  0.5735372105356196
precision:  0.5820222732897296
precision:  0.593777620646986
precision:  0.6013788226975428
precision:  0.6067703729892169
precision:  0.6133993282658653
precision:  0.6169347710800778
precision:  0.6138412586176418
precision:  0.6471628071

In [40]:
num_of_cls = len(newsgroups_test.target_names)
test_acc = model.test_all(vect_test_dense.T, newsgroups_test.target, 5, 0.5, num_of_cls, weight_list)

precision:  0.5683224323846562
