In [41]:
import math

from collections import Counter
from itertools import chain

from requests import Session
import requests

In [2]:
access_token = '' # SPECIFY ACCESS TOKEN HERE

In [4]:
import urllib
import retrying
from urllib.parse import urljoin
from requests import Session
from traceback import format_exc


class TooManyRequests(Exception):
    code = 6
    message = 'Too many requests per second'

class AccessDenied(Exception):
    code = 15
    message = 'Access denied: this profile is private'

class UserDeleted(Exception):
    code = 18
    message = 'User was deleted or banned'

class UnknownException(Exception):
    pass


class ApiMethod():
    URL = 'https://api.vk.com'

    def __init__(self, session, request_args):
        self._session = session
        self._request_args = request_args

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=5)
    def _request(self, method, **params):
#         url = '{}?{}'.format(
#             urljoin(self.URL, '.'.join((self.method, 'get'))),
#             urllib.urlencode(params)
#         )
        url = urljoin(self.URL, '.'.join((self.method, method)))
        try:
            response = self._session.get(url, params=dict(params, **self._request_args))
            _json = response.json()
            if 'response' in _json:
                return _json['response']
            elif 'error' in _json:
                if _json['error']['error_code'] == TooManyRequests.code:
                    raise TooManyRequests()
                elif _json['error']['error_code'] in (AccessDenied.code, UserDeleted.code):
                    return {'items': []}  # quite unsafe 
                else:
                    raise UnknownException # Todo                    
            else:
                raise UnknownException # Todo
        except TooManyRequests:
            raise
        except (Exception, UnknownException):
            # debug
            print(format_exc())
            print(response.content)
            raise


class Users(ApiMethod):
    method = '/method/users'

    def get(self, user_id, **params):
        return self._request('get', user_id=user_id, **params)


class Groups(ApiMethod):
    method = '/method/groups'

    def get(self, user_id, **params):
        return self._request('get', user_id=user_id, **params)

    def get_members(self, group_id, **params):
        return self._request('getMembers', group_id=group_id, **params)


class Friends(ApiMethod):
    method = '/method/friends'

    def get(self, user_id, **params):
        return self._request('get', user_id=user_id, **params)
    
class VkAPI():
    VERSION = '5.87'

    def __init__(self, token):
        self._session = Session()
        self.token = token
        request_args = {
            'v': self.VERSION,
            'access_token': token
        }
        self.users = Users(self._session, request_args)
        self.groups = Groups(self._session, request_args)
        self.friends = Friends(self._session, request_args)



In [8]:
from itertools import count
from functools import partial


def poll(api_method_call, max_count=1000):
    current_count = 0
    for i in count(1):
        response = api_method_call(count=max_count, offset=current_count)
#         response = api.groups.get_members(group_id=SIRIUS_GROUP_ID, )
        yield response

        current_count += max_count
        if current_count > response['count']:
            break

In [5]:
api = VkAPI(access_token)

In [6]:
api.users.get(user_id=4642878)

[{'id': 4642878, 'first_name': 'Сергей', 'last_name': 'Анчутин'}]

In [92]:
sirius_url = 'https://vk.com/siriusdeti'
cdmo_urlf = 'https://vk.com/cdmo45'

serj_users = (
    ('https://vk.com/lenin1112', 150292958),
    ('https://vk.com/eg_75', 651277),
    ('https://vk.com/id16011', 16011), # Oleg Yuzhakov
    ('https://vk.com/andreymen', 37601956), # Andrey Menschikov 
)
serj_math_recomended_groups = {
    41447: 'https://vk.com/smskirov',
    87264276: 'https://vk.com/kirovurtur',    
}
SIRIUS_GROUP_ID = 71991592

In [93]:
# users = set(api.friends.get(user_ids=','.join([i[1] for i in serj_users])))
math_users = set()
for _, user_id in serj_users:
    math_users.update(api.friends.get(user_id=user_id)['items'])
    
for group_id in serj_math_recomended_groups:
    for r in poll(partial(api.groups.get_members, group_id=group_id)):
        math_users.update(r['items'])
print('Math users:', len(math_users))

Math users: 4004


In [16]:
responses = poll(partial(api.groups.get_members, group_id=SIRIUS_GROUP_ID))

sirius_users = set()
for r in responses:
    sirius_users.update(r['items'])

print('Sirius users:', len(sirius_users))

Sirius users: 39377


In [119]:
math_in_sirius_users = math_users & sirius_users
len(math_in_sirius_users)

642

In [158]:
math_in_sirius_user_groups = {}
for u in math_in_sirius_users:
    math_in_sirius_user_groups[u] = api.groups.get(user_id=u)['items']

In [24]:
print([len(i) for i in math_in_sirius_user_groups.values()])

[139, 59, 196, 0, 41, 159, 89, 0, 43, 32, 42, 86, 1626, 56, 14, 68, 964, 17, 11, 648, 247, 3, 118, 74, 197, 35, 28, 16, 201, 7, 211, 513, 162, 63, 0, 45, 17, 27, 8, 106, 21, 5, 27, 104, 94, 232, 32, 48, 30, 17, 222, 0, 26, 187, 52, 163, 77, 32, 154, 126, 182, 257, 7, 25, 17, 83, 146, 3, 90, 8, 14, 44, 170, 24, 95, 66, 60, 50, 30, 93, 55, 35, 5, 0, 25, 559, 53, 11, 42, 6, 73, 179, 331, 110, 39, 47, 83, 41, 78, 0, 0, 23, 147, 37, 33, 24, 136, 56, 0, 0, 217, 99, 346, 69, 46, 187, 34, 30, 114, 18, 18, 6, 15, 43, 105, 68, 52, 0, 14, 0, 182, 42, 76, 346]


In [29]:
c = Counter(chain.from_iterable(math_in_sirius_user_groups.values()))
c.most_common(10)

[(132950129, 52),
 (71991592, 42),
 (47597391, 40),
 (48976701, 32),
 (65937233, 31),
 (37887819, 25),
 (31480508, 24),
 (17413682, 23),
 (62655504, 22),
 (109232148, 20)]

In [31]:
for group_id, _count in c.most_common():
    print(_count, 'https://vk.com/public{}'.format(group_id))
    if _count < 5:
        break

52 https://vk.com/public132950129
42 https://vk.com/public71991592
40 https://vk.com/public47597391
32 https://vk.com/public48976701
31 https://vk.com/public65937233
25 https://vk.com/public37887819
24 https://vk.com/public31480508
23 https://vk.com/public17413682
22 https://vk.com/public62655504
20 https://vk.com/public109232148
19 https://vk.com/public29559271
19 https://vk.com/public29534144
19 https://vk.com/public33682639
17 https://vk.com/public1008672
17 https://vk.com/public1441
17 https://vk.com/public65652356
17 https://vk.com/public61516226
16 https://vk.com/public30365645
16 https://vk.com/public41437811
16 https://vk.com/public56106344
15 https://vk.com/public35904602
15 https://vk.com/public78273068
14 https://vk.com/public38683579
14 https://vk.com/public54295855
14 https://vk.com/public24086381
14 https://vk.com/public109196188
13 https://vk.com/public89780700
13 https://vk.com/public39321576
13 https://vk.com/public7826394
13 https://vk.com/public76552532
13 https://vk

In [32]:
serj_progers = (
    ('https://vk.com/sicamp', 53187592),
    ('https://vk.com/sis',     222235)
)

In [33]:
proger_users = set()
for _, group_id in serj_progers:
    for r in poll(partial(api.groups.get_members, group_id=group_id)):
        proger_users.update(r['items'])
print(len(proger_users))

3952


In [34]:
proger_users_intersection = proger_users & sirius_users
print(len(proger_users_intersection))

580


In [35]:
proger_in_sirius_user_groups = {}
for u in proger_users_intersection:
    proger_in_sirius_user_groups[u] = api.groups.get(user_id=u)['items']

In [37]:
cp = Counter(chain.from_iterable(proger_in_sirius_user_groups.values()))
for group_id, _count in cp.most_common():
    print(_count, 'https://vk.com/public{}'.format(group_id))
    if _count < 25:
        break

232 https://vk.com/public73843185
153 https://vk.com/public71991592
148 https://vk.com/public137565779
145 https://vk.com/public222235
134 https://vk.com/public37887819
131 https://vk.com/public30666517
122 https://vk.com/public76552532
121 https://vk.com/public132950129
113 https://vk.com/public72495085
97 https://vk.com/public20629724
96 https://vk.com/public63282215
92 https://vk.com/public54530371
87 https://vk.com/public29534144
83 https://vk.com/public47597391
82 https://vk.com/public48976701
80 https://vk.com/public109196188
79 https://vk.com/public103581427
78 https://vk.com/public932
75 https://vk.com/public22822305
75 https://vk.com/public11283947
75 https://vk.com/public30365645
72 https://vk.com/public31976785
71 https://vk.com/public30610911
65 https://vk.com/public63731512
64 https://vk.com/public41208167
64 https://vk.com/public24086381
63 https://vk.com/public29559271
61 https://vk.com/public30558759
60 https://vk.com/public36507793
58 https://vk.com/public41437811
58 h

In [38]:
mentions = {}

for user_id, groups in proger_in_sirius_user_groups.items():
    for group_id in groups:
        mentions.setdefault(user_id, dict())[group_id] = 1



In [43]:
def distCosine (vecA, vecB):
    def dotProduct (vecA, vecB):
        d = 0.0
        for dim in vecA:
            if dim in vecB:
                d += vecA[dim]*vecB[dim]
        return d
    return dotProduct (vecA,vecB) / math.sqrt(dotProduct(vecA,vecA)) / math.sqrt(dotProduct(vecB,vecB))


def makeRecommendation(userID, userRates, nBestUsers, nBestProducts):
    matches = [(u, distCosine(userRates[userID], userRates[u])) for u in userRates if u != userID]
    bestMatches = sorted(matches, key=lambda x: (x[1], x[0]), reverse=True)[:nBestUsers]
    print("Most correlated with '%s' users:" % userID)
    for line in bestMatches:
        print("  UserID: %6s  Coeff: %6.4f" % (line[0], line[1]))
    sim = dict()
    sim_all = sum([x[1] for x in bestMatches])
    bestMatches = dict([x for x in bestMatches if x[1] > 0.0])        
    for relatedUser in bestMatches:
        for product in userRates[relatedUser]:
            if not product in userRates[userID]:
                if not product in sim:
                    sim[product] = 0.0
                sim[product] += userRates[relatedUser][product] * bestMatches[relatedUser]
    for product in sim:
        sim[product] /= sim_all
    bestProducts = sorted(sim.items(), key=lambda x: (x[1], x[0]), reverse=True)[:nBestProducts]
    print("Most correlated products:")
    for prodInfo in bestProducts:    
        print("  ProductID: %6s  CorrelationCoeff: %6.4f" % (prodInfo[0], prodInfo[1]))
    return [(x[0], x[1]) for x in bestProducts]

In [44]:
makeRecommendation(sorted(mentions.keys())[100], mentions, 5, 5)

Most correlated with '11848507' users:
  UserID: 122686669  Coeff: 0.2359
  UserID: 193933060  Coeff: 0.2000
  UserID: 267072827  Coeff: 0.1925
  UserID: 168842373  Coeff: 0.1661
  UserID: 164277806  Coeff: 0.1581
Most correlated products:
  ProductID: 79746948  CorrelationCoeff: 0.6320
  ProductID: 109232148  CorrelationCoeff: 0.6241
  ProductID: 107068423  CorrelationCoeff: 0.4576
  ProductID: 122114391  CorrelationCoeff: 0.4221
  ProductID: 65937233  CorrelationCoeff: 0.4221


[(79746948, 0.6320004031443106),
 (109232148, 0.624074991088986),
 (107068423, 0.4575873749875527),
 (122114391, 0.4220532177668723),
 (65937233, 0.4220532177668723)]

In [85]:
DIMA_USER_ID = 38370578
dima_groups = api.groups.get(user_id=DIMA_USER_ID)['items']



In [46]:
SERJ_USER_ID = 4642878
serj_groups = api.groups.get(user_id=SERJ_USER_ID)['items']
for group_id in serj_groups:
    mentions.setdefault(SERJ_USER_ID, dict())[group_id] = 1

In [47]:
makeRecommendation(SERJ_USER_ID, mentions, 5, 5)

Most correlated with '4642878' users:
  UserID: 274243  Coeff: 0.0721
  UserID: 297582804  Coeff: 0.0624
  UserID: 32779086  Coeff: 0.0457
  UserID: 5150346  Coeff: 0.0399
  UserID: 93664533  Coeff: 0.0398
Most correlated products:
  ProductID: 91635769  CorrelationCoeff: 1.0000
  ProductID: 20629724  CorrelationCoeff: 0.8465
  ProductID: 73843185  CorrelationCoeff: 0.8241
  ProductID: 91933860  CorrelationCoeff: 0.7227
  ProductID: 41208167  CorrelationCoeff: 0.7227


[(91635769, 1.0),
 (20629724, 0.8464509734081572),
 (73843185, 0.8240553222104327),
 (91933860, 0.7226867909041017),
 (41208167, 0.7226867909041017)]

In [48]:
historical_groups = (
    ('https://vk.com/history_0z', 53782768),
    ('https://vk.com/history_moments', 74938476),
    ('https://vk.com/souffrantmittelalter', 51254145),
    ('https://vk.com/europa_vetus', 80200319),
    ('https://vk.com/worldamongthechaosss', 121052281),
    ('https://vk.com/pamiattt', 33938434),
)


In [50]:
from itertools import islice


history_users = set()
for _, group_id in historical_groups:
    # подписчиков слишком много, сделаем несколько запросов
    for r in islice(poll(partial(api.groups.get_members, group_id=group_id)), 0, 50):
        history_users.update(r['items'])
print(len(history_users))

233203


In [51]:
history_users_intersection = history_users & sirius_users
print(len(history_users_intersection))

312


In [53]:
history_in_sirius_user_groups = {}
for u in history_users_intersection:
    history_in_sirius_user_groups[u] = api.groups.get(user_id=u)['items']

In [None]:
# history_in_sirius_user_groups

In [55]:
mentions = {}

for user_id, groups in history_in_sirius_user_groups.items():
    for group_id in groups:
        mentions.setdefault(user_id, dict())[group_id] = 1

for group_id in serj_groups:
    mentions.setdefault(SERJ_USER_ID, dict())[group_id] = 1


In [58]:
serj_history_recomendations = makeRecommendation(SERJ_USER_ID, mentions, 5, 5)
print(serj_history_recomendations)
for group_id, _ in serj_history_recomendations:
    print('https://vk.com/public{}'.format(group_id))

Most correlated with '4642878' users:
  UserID: 5807440  Coeff: 0.0332
  UserID: 857630  Coeff: 0.0255
  UserID: 748485  Coeff: 0.0248
  UserID: 2401542  Coeff: 0.0241
  UserID: 6100365  Coeff: 0.0233
Most correlated products:
  ProductID: 28261334  CorrelationCoeff: 1.0000
  ProductID: 16466790  CorrelationCoeff: 0.8055
  ProductID: 23303030  CorrelationCoeff: 0.7464
  ProductID: 131101936  CorrelationCoeff: 0.6378
  ProductID: 76982440  CorrelationCoeff: 0.6378
[(28261334, 1.0), (16466790, 0.8054809335734195), (23303030, 0.7464153523384786), (131101936, 0.6377657821294167), (76982440, 0.6377657821294167)]
https://vk.com/public28261334
https://vk.com/public16466790
https://vk.com/public23303030
https://vk.com/public131101936
https://vk.com/public76982440


In [86]:
mentions = {}

for user_id, groups in history_in_sirius_user_groups.items():
    for group_id in groups:
        mentions.setdefault(user_id, dict())[group_id] = 1

for group_id in dima_groups:
    mentions.setdefault(DIMA_USER_ID, dict())[group_id] = 1


dima_history_recomendations = makeRecommendation(DIMA_USER_ID, mentions, 5, 5)
print(dima_history_recomendations)
for group_id, _ in dima_history_recomendations:
    print('https://vk.com/public{}'.format(group_id))

Most correlated with '38370578' users:
  UserID: 2084647  Coeff: 0.0577
  UserID: 1686574  Coeff: 0.0570
  UserID:    976  Coeff: 0.0567
  UserID: 96621841  Coeff: 0.0536
  UserID: 158814953  Coeff: 0.0503
Most correlated products:
  ProductID: 32804027  CorrelationCoeff: 0.6227
  ProductID: 24565142  CorrelationCoeff: 0.6227
  ProductID: 74938476  CorrelationCoeff: 0.6078
  ProductID: 26284064  CorrelationCoeff: 0.5981
  ProductID: 44759043  CorrelationCoeff: 0.5959
[(32804027, 0.622697737655578), (24565142, 0.622697737655578), (74938476, 0.6078028177895378), (26284064, 0.5981231499698045), (44759043, 0.5959196566319548)]
https://vk.com/public32804027
https://vk.com/public24565142
https://vk.com/public74938476
https://vk.com/public26284064
https://vk.com/public44759043


In [62]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [None]:
from itertools import chain
# math_in_sirius_user_groups, proger_in_sirius_user_groups, history_in_sirius_user_groups
groups = {}

In [122]:
from itertools import chain
import numpy as np

vectorizer = DictVectorizer(sparse=False)
D = [
    dict.fromkeys(groups, 1)
    for user_id, groups in chain(
        sorted(math_in_sirius_user_groups.items()),
        sorted(proger_in_sirius_user_groups.items()),
        sorted(history_in_sirius_user_groups.items()),
    )
]
X = vectorizer.fit_transform(D)
y = np.array(['math'] * len(math_in_sirius_user_groups) + ['proger'] * len(proger_in_sirius_user_groups) + ['history'] * len(history_in_sirius_user_groups))
len(math_in_sirius_user_groups), len(proger_in_sirius_user_groups), len(history_in_sirius_user_groups)

(642, 580, 312)

In [123]:
X.shape, y.shape

((1534, 136586), (1534,))

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [125]:
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [126]:
clf.predict_proba(X_test)

array([[0.11163063, 0.47730211, 0.41106726],
       [0.10802659, 0.52350681, 0.3684666 ],
       [0.14292074, 0.46407409, 0.39300517],
       ...,
       [0.0985738 , 0.41899572, 0.48243048],
       [0.10384559, 0.41098674, 0.48516767],
       [0.06347139, 0.57784122, 0.35868739]])

In [127]:
clf.predict(X_test)

array(['math', 'math', 'math', 'math', 'math', 'math', 'math', 'math',
       'math', 'history', 'history', 'math', 'math', 'history', 'proger',
       'math', 'proger', 'math', 'proger', 'math', 'math', 'math',
       'history', 'math', 'history', 'math', 'proger', 'proger', 'math',
       'proger', 'math', 'math', 'math', 'math', 'history', 'history',
       'math', 'proger', 'proger', 'history', 'math', 'history', 'math',
       'math', 'proger', 'math', 'math', 'math', 'math', 'proger', 'math',
       'math', 'proger', 'history', 'math', 'proger', 'math', 'history',
       'math', 'history', 'math', 'math', 'math', 'math', 'proger',
       'math', 'math', 'proger', 'math', 'math', 'math', 'math', 'proger',
       'math', 'math', 'math', 'math', 'math', 'math', 'math', 'history',
       'math', 'math', 'math', 'math', 'math', 'math', 'math', 'math',
       'math', 'math', 'math', 'history', 'proger', 'proger', 'math',
       'history', 'proger', 'math', 'proger', 'math', 'math', 'ma

In [128]:
y_test

array(['history', 'math', 'math', 'proger', 'history', 'proger', 'proger',
       'history', 'proger', 'proger', 'history', 'proger', 'math',
       'history', 'proger', 'proger', 'proger', 'math', 'math', 'math',
       'math', 'math', 'math', 'math', 'history', 'math', 'proger',
       'proger', 'proger', 'proger', 'math', 'history', 'math', 'math',
       'history', 'history', 'math', 'proger', 'proger', 'history',
       'history', 'history', 'math', 'math', 'math', 'math', 'math',
       'math', 'proger', 'proger', 'math', 'proger', 'proger', 'history',
       'proger', 'proger', 'math', 'history', 'math', 'proger', 'math',
       'proger', 'math', 'math', 'proger', 'math', 'proger', 'proger',
       'math', 'math', 'proger', 'proger', 'proger', 'math', 'math',
       'proger', 'proger', 'math', 'proger', 'math', 'history', 'proger',
       'math', 'math', 'proger', 'math', 'math', 'proger', 'proger',
       'proger', 'history', 'math', 'math', 'proger', 'history', 'math',
       

In [129]:
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

     history       0.84      0.52      0.64       102
        math       0.55      0.89      0.68       214
      proger       0.74      0.39      0.51       191

   micro avg       0.63      0.63      0.63       507
   macro avg       0.71      0.60      0.61       507
weighted avg       0.68      0.63      0.61       507



In [130]:
dima_x = dict.fromkeys(dima_groups, 1)
serj_x = dict.fromkeys(serj_groups, 1)
_X_dima = vectorizer.transform(dima_x)
_X_serj = vectorizer.transform(serj_x)

In [131]:
print(clf.predict(_X_dima))
print(clf.predict(_X_serj))

['proger']
['math']


In [132]:
print(clf.predict_proba(_X_dima))
print(clf.predict_proba(_X_serj))

[[0.19106614 0.38331523 0.42561863]]
[[0.12406684 0.43825296 0.4376802 ]]


In [134]:
from sklearn.neural_network import MLPClassifier

neural_net_clf = MLPClassifier()
neural_net_clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [135]:
print(classification_report(y_test, neural_net_clf.predict(X_test)))

              precision    recall  f1-score   support

     history       0.89      0.48      0.62       102
        math       0.65      0.75      0.70       214
      proger       0.58      0.62      0.60       191

   micro avg       0.65      0.65      0.65       507
   macro avg       0.71      0.62      0.64       507
weighted avg       0.67      0.65      0.65       507



In [143]:
# Можно убрать пользователей, которые попали сразу в 2 группы
from itertools import chain
import numpy as np

m = {
    user_id: groups for user_id, groups in sorted(math_in_sirius_user_groups.items())
    if user_id not in proger_in_sirius_user_groups and user_id not in history_in_sirius_user_groups
}
p = {
    user_id: groups for user_id, groups in sorted(proger_in_sirius_user_groups.items())
    if user_id not in math_in_sirius_user_groups and user_id not in history_in_sirius_user_groups
}
h = {
    user_id: groups for user_id, groups in sorted(history_in_sirius_user_groups.items())
    if user_id not in math_in_sirius_user_groups and user_id not in proger_in_sirius_user_groups
}

vectorizer = DictVectorizer(sparse=False)
D = [
    dict.fromkeys(groups, 1)
    for user_id, groups in chain(
        sorted(m.items()),
        sorted(p.items()),
        sorted(h.items()),
    )
]
X = vectorizer.fit_transform(D)
y = np.array(['math'] * len(m) + ['proger'] * len(p) + ['history'] * len(h))
len(m), len(p), len(h)


(545, 474, 297)

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [145]:
clf2 = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)

In [146]:
clf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [147]:
print(classification_report(y_test, clf2.predict(X_test)))

              precision    recall  f1-score   support

     history       0.84      0.58      0.69        96
        math       0.61      0.96      0.75       182
      proger       0.96      0.52      0.67       157

   micro avg       0.71      0.71      0.71       435
   macro avg       0.80      0.69      0.70       435
weighted avg       0.79      0.71      0.71       435



In [148]:
neural_net_clf2 = MLPClassifier()
neural_net_clf2.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [149]:
print(classification_report(y_test, neural_net_clf2.predict(X_test)))

              precision    recall  f1-score   support

     history       0.91      0.50      0.64        96
        math       0.82      0.81      0.82       182
      proger       0.68      0.87      0.76       157

   micro avg       0.77      0.77      0.77       435
   macro avg       0.80      0.73      0.74       435
weighted avg       0.79      0.77      0.76       435



In [150]:
import pickle

In [156]:
with open('interest_classifier.pkl', 'wb') as f:
    pickle.dump(neural_net_clf2, f, protocol=pickle.HIGHEST_PROTOCOL)

In [157]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f, protocol=pickle.HIGHEST_PROTOCOL)

In [159]:
neural_net_clf2.classes_

array(['history', 'math', 'proger'], dtype='<U7')