In [3]:
import numpy as np
import sklearn.svm
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, GridSearchCV

In [87]:
# Record which scikit-learn release produced the outputs in this notebook.
print(sklearn.__version__)

0.18


In [4]:
# Load every post (subset='all') from the two newsgroups being told apart.
newsgroups = datasets.fetch_20newsgroups(
    categories=['alt.atheism', 'sci.space'],
    subset='all',
)

In [5]:
# Show the label array that is later used as the classification target.
newsgroups.target

array([0, 0, 1, ..., 1, 1, 0])

In [7]:
# Learn the TF-IDF vocabulary from the posts and vectorise them in a single pass.
# `tvect` is kept around because its vocabulary is needed later to name features.
tvect = TfidfVectorizer()
data = tvect.fit_transform(raw_documents=newsgroups.data)

In [10]:
# Conventional names for the feature matrix and the target labels.
X = data
y = newsgroups.target

# Classifier

In [16]:
# Linear-kernel SVM with a fixed seed; C is left at its default here and tuned separately.
clf = sklearn.svm.SVC(random_state=241, kernel='linear')

In [17]:
# Candidate regularisation strengths: 10^-5, 10^-4, ..., 10^5.
grid = {'C': 10.0 ** np.arange(-5, 6)}
# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, random_state=241, shuffle=True)

In [18]:
# Grid search over `grid`, scoring each candidate by accuracy on the folds from `cv`.
gs = GridSearchCV(
    estimator=clf,
    param_grid=grid,
    cv=cv,
    scoring='accuracy',
)

In [None]:
# Run the grid search on the full data set; the cells below read their results from `gs`.
gs.fit(X, y)

In [68]:
# Print the mean cross-validated accuracy next to the parameter set that produced it.
# `cv_results_` is used instead of `grid_scores_`: the latter is deprecated as of
# scikit-learn 0.18 and is not available in later releases.
#   mean_test_score - cross-validation quality estimate for a candidate
#   params          - the parameter values of that candidate
for mean_score, params in zip(gs.cv_results_['mean_test_score'],
                              gs.cv_results_['params']):
    print(mean_score, params)

0.552631578947 {'C': 1.0000000000000001e-05}
0.552631578947 {'C': 0.0001}
0.552631578947 {'C': 0.001}
0.552631578947 {'C': 0.01}
0.950167973124 {'C': 0.10000000000000001}
0.993281075028 {'C': 1.0}
0.993281075028 {'C': 10.0}
0.993281075028 {'C': 100.0}
0.993281075028 {'C': 1000.0}
0.993281075028 {'C': 10000.0}
0.993281075028 {'C': 100000.0}




# Final model and most informative words

In [117]:
# Take the winning C from the fitted grid search rather than copying it by hand
# from the printed scores, so this value cannot drift from the search results.
# In the recorded run the best accuracy is first reached at C = 1.0.
# NOTE(review): several candidates tie for the best score; confirm that
# best_params_ still resolves to the smallest of them after a re-run.
C = gs.best_params_['C']

In [118]:
# Same linear SVM configuration as used in the search, now with the chosen C.
clf_final = sklearn.svm.SVC(C=C, kernel='linear', random_state=241)

In [120]:
# Train the final model on all of X, y; its coefficients are inspected below.
clf_final.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [121]:
# Per-feature weights of the fitted linear SVM (sparse here: it is densified with .toarray() below).
coefs = clf_final.coef_

In [122]:
# Vocabulary terms indexed by feature column, so feature_mapping[i] names column i.
# get_feature_names() is what scikit-learn 0.18 (the version recorded in this
# notebook) provides, but recent releases only offer get_feature_names_out();
# use whichever accessor the installed version has. .tolist() keeps the result a
# plain list of Python strings, matching what get_feature_names() returns.
if hasattr(tvect, 'get_feature_names_out'):
    feature_mapping = tvect.get_feature_names_out().tolist()
else:
    feature_mapping = tvect.get_feature_names()

In [123]:
# Magnitude of each feature's weight: densify the coefficients, take row 0, drop the sign.
weights = np.absolute(coefs.toarray()[0])

In [124]:
# Column indices of the 10 largest-magnitude weights, biggest first
# (sorting the negated values ascending gives descending order of the originals).
top_indices = np.argsort(np.negative(weights))[:10]

In [125]:
# Signed (not absolute) dense coefficient row; not referenced again in the visible cells.
coefs_lin = coefs.toarray()[0, :]

In [126]:
# Display the selected feature column indices.
top_indices

array([24019, 12871,  5088,  5093, 17802, 23673, 21850,  5776, 15606, 22936])

In [127]:
# One line per selected feature: its absolute weight followed by the term it stands for.
top_terms = (feature_mapping[idx] for idx in top_indices)
for weight, term in zip(weights[top_indices], top_terms):
    print(weight, term)

2.66316478848 space
1.92037940023 god
1.25468995124 atheism
1.24918000738 atheists
1.20161118175 moon
1.18013159514 sky
1.1390808379 religion
1.13061234466 bible
1.09709364664 keith
1.02930692719 sci


In [128]:
# Terms behind the selected features, put into alphabetical order.
words = [feature_mapping[idx] for idx in top_indices]
words.sort()

In [129]:
# Show the alphabetically sorted list of top terms.
print(words)

['atheism', 'atheists', 'bible', 'god', 'keith', 'moon', 'religion', 'sci', 'sky', 'space']


In [130]:
# Build the space-separated answer once, then echo it and save it to a2.txt.
answer = ' '.join(words)
with open('a2.txt', 'w') as fd:
    print(answer)
    fd.write(answer)

atheism atheists bible god keith moon religion sci sky space
