In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import numpy as np
import json

In [2]:
# Seed NumPy's legacy global RNG so anything drawing from np.random is repeatable.
np.random.seed(0)

In [3]:
def create_data(categories, num_features):
    """Build the train/test/CV-fold files and the vocabulary map for a newsgroup task.

    Fetches the requested 20-newsgroups categories (headers, footers and
    quotes removed), holds out 20% of the documents as a test set, TF-IDF
    vectorizes the text, keeps `num_features` columns chosen by SelectKBest
    (fitted on the training split only) and writes to the working directory:

    * ``data_train`` / ``data_test`` -- svmlight files of the reduced matrices
    * ``CVfolds/fold1`` .. ``CVfolds/fold5`` -- the training rows split into
      five contiguous folds
    * ``vocab_idx.json`` -- maps each reduced column index (as a string key)
      to its vocabulary term

    Target value 0 is rewritten to -1 in both splits before anything is saved.

    Parameters
    ----------
    categories : list of str
        Newsgroup names forwarded to ``fetch_20newsgroups``.
    num_features : int
        Number of TF-IDF columns kept by ``SelectKBest``.

    Returns
    -------
    None
        Results are only written to disk.
    """
    import os  # local import: only needed to create the fold directory

    newsgroups = fetch_20newsgroups(subset='all',
                                    remove=('headers', 'footers', 'quotes'),
                                    categories=categories)

    X_train, X_test, y_train, y_test = train_test_split(newsgroups.data,
                                                        newsgroups.target,
                                                        test_size=0.2,
                                                        random_state=0)
    vectorizer = TfidfVectorizer()
    kbest = SelectKBest(k=num_features)

    # Vocabulary and feature selection are learned from the training split only;
    # the test split is merely transformed with the fitted objects.
    X_train = vectorizer.fit_transform(X_train)
    y_train[y_train == 0] = -1
    X_train_reduced = kbest.fit_transform(X_train, y_train)

    X_test = vectorizer.transform(X_test)
    y_test[y_test == 0] = -1
    X_test_reduced = kbest.transform(X_test)

    dump_svmlight_file(X_train_reduced, y_train, 'data_train')
    dump_svmlight_file(X_test_reduced, y_test, 'data_test')

    # The fold files live in a sub-directory; create it so a fresh checkout
    # does not fail with FileNotFoundError.
    os.makedirs('CVfolds', exist_ok=True)
    idx = np.arange(X_train_reduced.shape[0])
    folds = np.array_split(idx, 5)
    for i, f_idx in enumerate(folds):
        dump_svmlight_file(X_train_reduced[f_idx], y_train[f_idx], f'CVfolds/fold{i+1}')

    # get_feature_names() is gone from newer scikit-learn releases; use the
    # replacement when it exists and fall back for old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        all_features = np.asarray(vectorizer.get_feature_names_out())
    else:
        all_features = np.array(vectorizer.get_feature_names())
    selected_features = all_features[kbest.get_support()]

    # Keys are the column positions in the reduced matrices, stored as strings
    # because JSON object keys must be strings.
    selected_dict = {str(i): str(term) for i, term in enumerate(selected_features)}

    # Context manager guarantees the file is closed even if serialization fails.
    with open("vocab_idx.json", "w") as f:
        json.dump(selected_dict, f)

In [4]:
# Two-category task (medicine vs. space), reduced to 250 selected features.
cats = ['sci.med', 'sci.space']
create_data(categories=cats, num_features=250)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
# Reload the column-index -> term mapping stored in vocab_idx.json.
with open('vocab_idx.json', 'r') as vocab_file:
    selected_features = json.load(vocab_file)

In [6]:
def read_libsvm(fname, num_features=0):
    """Parse a libsvm/svmlight text file into a sparse matrix and a label vector.

    Every non-blank line is expected to look like ``<label> <col>:<value> ...``.
    Column numbers are used exactly as written (no index shifting). Blank
    lines are skipped and do not produce a row.

    Parameters
    ----------
    fname : str
        Path of the file to read.
    num_features : int, default 0
        Number of columns of the returned matrix. ``0`` means "infer": the
        largest column number seen plus one, or 0 when the file contains no
        feature entries at all.

    Returns
    -------
    X : scipy.sparse.csr_matrix
        Matrix of shape ``(n_rows, num_features)``.
    y : numpy.ndarray
        Integer labels, one per row.
    num_features : int
        The column count actually used (useful for reading a second file
        with the same width).

    Raises
    ------
    ValueError
        If a label is not an integer literal, a feature entry is not of the
        form ``col:value``, or a column number does not fit in
        ``num_features``.
    """
    data = []
    y = []
    row_ind = []
    col_ind = []
    with open(fname) as f:
        # Stream the file instead of materializing every line first.
        for line in f:
            elements = line.split()
            if not elements:
                # Blank line: nothing to parse, and it must not consume a row index.
                continue
            row = len(y)  # row number counts parsed rows, not physical lines
            y.append(int(elements[0]))
            for el in elements[1:]:
                c, v = el.split(":")
                row_ind.append(row)
                col_ind.append(int(c))
                data.append(float(v))
    if num_features == 0:
        # max() of an empty list raises, so handle files without any features.
        num_features = max(col_ind) + 1 if col_ind else 0
    # Explicit dtypes keep the empty-input case well-typed.
    X = csr_matrix(
        (np.asarray(data, dtype=float),
         (np.asarray(row_ind, dtype=np.intp), np.asarray(col_ind, dtype=np.intp))),
        shape=(len(y), num_features),
    )
    return X, np.array(y, dtype=int), num_features

In [7]:
# The training file determines the column count; the test file is then read
# with that same width so both matrices share one feature space.
X_train, y_train, num_features = read_libsvm('data_train')
X_test, y_test, _ = read_libsvm('data_test', num_features=num_features)

In [8]:
# NOTE(review): `clf_mine` is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel. Presumably it refers to a custom
# classifier exposing a weight attribute `w` -- TODO: define/import it above or drop this cell.
w = np.squeeze(np.asarray(clf_mine.w))
sorted_w = np.argsort(w)

NameError: name 'clf_mine' is not defined

In [11]:
# sorted_w comes from np.argsort, which orders ascending: the first 20 indices
# carry the smallest weights, the last 20 the largest.
# NOTE(review): `pos20` therefore holds the lowest-weight features despite its name -- confirm intent.
pos20 = sorted_w[:20]
neg20 = sorted_w[-20:]

In [12]:
# Translate column indices into vocabulary terms; the JSON map is keyed by
# the index rendered as a string.
top_pos = [selected_features[str(col)] for col in pos20]
top_negs = [selected_features[str(col)] for col in neg20]

In [16]:
# Baseline model: scikit-learn's Perceptron with its default hyperparameters.
clf = Perceptron()



In [17]:
# Train on the matrix and labels read back from the data_train file.
clf.fit(X_train, y_train)

Perceptron(alpha=0.0001, class_weight=None, eta0=0.1, fit_intercept=True,
      max_iter=5, n_iter=None, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)

In [18]:
# Predict labels for the held-out test matrix.
y_pred = clf.predict(X_test)

In [19]:
# classification_report returns preformatted text, so print() is what renders the table.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

         -1       0.82      0.94      0.88       191
          1       0.94      0.81      0.87       205

avg / total       0.88      0.87      0.87       396



In [20]:
# Fraction of test predictions that match y_test.
accuracy_score(y_test, y_pred)

0.8737373737373737

In [23]:
# Drop the singleton dimension of the fitted coefficients, then rank the
# feature indices from lowest to highest weight.
w = clf.coef_.squeeze()
sorted_w = w.argsort()

In [25]:
# sorted_w is an ascending argsort: its head holds the 20 lowest-weight
# columns and its tail the 20 highest-weight ones.
# NOTE(review): the pos/neg names look inverted relative to the weight sign -- confirm intent.
pos20 = sorted_w[:20]
neg20 = sorted_w[-20:]
# Map each column index to its term (JSON keys are the indices as strings).
top_pos = [selected_features[str(col)] for col in pos20]
top_negs = [selected_features[str(col)] for col in neg20]

In [26]:
# Display the terms collected in top_pos.
top_pos

['med',
 'health',
 'experience',
 'medical',
 'msg',
 'her',
 'edu',
 'person',
 'normal',
 'blood',
 'information',
 'skin',
 'common',
 'effect',
 'surgery',
 'prevent',
 'oral',
 'candida',
 'dangerous',
 'results']

In [27]:
# Display the terms collected in top_negs.
top_negs

['inflatable',
 'launched',
 'pluto',
 'russian',
 'international',
 'moon',
 'software',
 'sky',
 'hst',
 'solar',
 'tom',
 'galaxy',
 'station',
 'spacecraft',
 'orbit',
 'nasa',
 'earth',
 'rocket',
 'astro',
 'space']