# Data analysis project

_In this project I am going to write a program that predicts the number of likes a post is going to get on VK._

__Выполнил__: Булгаков Дмитрий (ИАД16)

__Дедлайн__: 23:59 10.04.16

# 1. Loading Data

Quietly installing the VK library and importing it into the notebook.

In [4]:
# !pip install vk # makes it quiet
import vk

Starting new vk session in order to parse data

In [5]:
# Open a VK session and wrap it in an API object; `vk_api` is what the later cells call.
vk_session = vk.Session() # starting new session
vk_api = vk.API(vk_session)

Getting number of posts in selected vk group.

In [6]:
selected_group = 'hse_overheard' # domain (short name) of the VK group to analyse
# The wall.get response is indexed like a list here; its first element is read as the post counter.
posts_number = vk_api.wall.get(domain=selected_group)[0] # number of posts is stored in first element
# NOTE(review): the value printed is posts_number - 1; why one is subtracted is not evident from this cell -- confirm
print('Number of posts in selected group: ', posts_number - 1)

Number of posts in selected group:  13964


Writing a function to load more than 100 posts from the group.

In [7]:
def load_all_posts(page, n_posts, api):
    all_posts = api.wall.get(domain=page, count=n_posts)
    n_loaded = len(all_posts)
    while n_loaded < n_posts: # loop to load more, than 100 posts
        s = api.wall.get(domain=page, offset=n_loaded, count=(n_posts - n_loaded)) # update offset
        all_posts += s[1:] # no need for first element
        n_loaded += len(s) - 1 # update n_loaded
    return all_posts

Loading all posts from group for future analysis

In [45]:
try:
    # n_posts=1501: the first element of the result is the counter, which is
    # sliced off right away, leaving 1500 posts.
    loaded_posts = load_all_posts(page=selected_group, n_posts=1501, api=vk_api)[1:]
    # only 1500 posts this time, because little RAM is available
    print('Number of loaded posts: ', len(loaded_posts))
except Exception as error: # timeout errors occur often; unlike a bare except, Ctrl-C still interrupts
    print('Error occured! Try again.')
    print('Reason: ', repr(error)) # show what actually failed instead of hiding it

Number of loaded posts:  1500


# 2. Data preprocessing

Loading required libs to preprocess data.

In [46]:
# !pip install pymorphy2 -q # silent install again
# !pip install stop_words -q # needed to remove stop words
from stop_words import get_stop_words
import pymorphy2 # need this one to convert words to normal time
import datetime # needed to convert response date 
import string # needed to work with strings
from nltk.tokenize import TweetTokenizer # needed to split text
import pandas as pd # required to work with dataframes
from ipywidgets import IntProgress # progressbar
from IPython.display import display # progressbar

Writing functions to process text data. Converting words to normal form and removing punctuation here.

In [47]:
def split_text(text):
    """Split ``text`` into a list of tokens with NLTK's TweetTokenizer."""
    tokens = TweetTokenizer().tokenize(text)
    return tokens

_morph_analyzer = None # shared pymorphy2 analyzer, created on first use

def _get_morph_analyzer():
    """Return the shared MorphAnalyzer, creating it only once."""
    global _morph_analyzer
    if _morph_analyzer is None:
        _morph_analyzer = pymorphy2.MorphAnalyzer()
    return _morph_analyzer

def convert_to_normal_form(words_list):
    """Return the normal (dictionary) forms of the given tokens.

    Tokens that occur as a substring of ``string.punctuation`` and tokens
    starting with "<" (presumably markup such as <br> -- confirm) are skipped.

    Args:
        words_list: iterable of non-empty string tokens.

    Returns:
        List with the normal form of every kept token, in the original order.
    """
    # Building a MorphAnalyzer is expensive, so it is reused across calls
    # instead of being rebuilt for every post.
    morph = _get_morph_analyzer()
    normal_forms_list = []
    for word in words_list:
        if word not in string.punctuation and word[0] != "<":
            # the first parse is used as the normal form of the word
            normal_forms_list.append(morph.parse(word)[0].normal_form)
    return normal_forms_list

def convert_text(text):
    """Normalise a post's text: tokenise, lemmatise and drop Russian stop words.

    Args:
        text: raw text of a post.

    Returns:
        The remaining normal forms joined by single spaces.
    """
    words_list = split_text(text) # splitting text into words
    norm_words_list = convert_to_normal_form(words_list) # words into normal form
    # Fetch the stop-word list once per call (it used to be fetched again for
    # every single word) and keep it in a set for fast membership tests.
    stop_words = set(get_stop_words('russian'))
    filtered_words = [w for w in norm_words_list if w not in stop_words]
    return " ".join(filtered_words) # joining words into a sentence again

Writing a function that converts the received list of posts into a new list holding only the features we need.

In [48]:
def convert_posts(posts_list):
    """Turn raw post dictionaries into flat feature dictionaries.

    A progress bar widget is displayed and advanced by one for every post.

    Args:
        posts_list: list of post dictionaries; each is read for the keys
            'likes' -> 'count', 'text', 'date', 'from_id' and 'post_type',
            and checked for the presence of 'attachment' and 'is_pinned'.

    Returns:
        List with one dictionary per post holding the extracted features.
    """
    bar = IntProgress()
    bar.max = len(posts_list) # one step per post
    bar.description = 'Processing data convertion'
    display(bar)

    converted = []
    for raw in posts_list:
        # 'date' is treated as a Unix timestamp and read in local time
        posted_at = datetime.datetime.fromtimestamp(raw['date'])
        features = {
            'likes_number': int(raw['likes']['count']),
            'text': convert_text(raw['text']), # normalised text
            'text_length': len(raw['text']), # length of the raw text
            'post_hour': posted_at.hour,
            'post_month': posted_at.month,
            # 1 when the author id differs from the hard-coded id -57354358
            'signed': int(raw['from_id'] != -57354358),
            'with_attachment': int('attachment' in raw.keys()),
            'pinned': int('is_pinned' in raw.keys()),
            'repost': int(raw['post_type'] == 'copy'),
        }
        converted.append(features)
        bar.value += 1
    bar.description = 'Done convertion!'
    return converted

Converting list of posts into new more convenient one.

In [49]:
# Build the list of per-post feature dictionaries from the downloaded posts.
converted_posts = convert_posts(loaded_posts)

# 3. Creating object-feature matrix

Loading pandas

In [50]:
import pandas as pd

Creating dataframe from parsed data

In [51]:
# One row per post, one column per extracted feature.
posts_frame = pd.DataFrame(converted_posts)
posts_frame.head() # preview of the first rows

Unnamed: 0,likes_number,pinned,post_hour,post_month,repost,signed,text,text_length,with_attachment
0,0,0,18,6,0,0,выпуск заниматься студент прикладной математик...,67,0
1,2,0,23,6,0,0,поступать юрфак набигай регион общага родитель...,426,0
2,4,0,14,6,0,0,рассказать чувствовать узнать поступить хороши...,110,0
3,29,0,9,6,0,0,удивительно 30-40 оставаться чайлдфри убедить ...,407,0
4,8,0,22,6,0,0,научный интерес 30 существовать феминистка арб...,412,0


And describing posts data

In [52]:
# Summary statistics of the post features.
posts_frame.describe()

Unnamed: 0,likes_number,pinned,post_hour,post_month,repost,signed,text_length,with_attachment
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,18.751333,0.0,15.18,3.732,0.029333,0.0,239.930667,0.223333
std,31.156197,0.0,6.067191,1.664339,0.168795,0.0,357.6688,0.416619
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,12.0,2.0,0.0,0.0,67.0,0.0
50%,7.0,0.0,16.0,4.0,0.0,0.0,134.0,0.0
75%,23.0,0.0,20.0,5.0,0.0,0.0,275.25,0.0
max,474.0,0.0,23.0,6.0,1.0,0.0,5035.0,1.0


Creating object-feature matrix

In [247]:
from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF text vectorizer

# Turn every post's text into at most 1000 L1-normalised word features.
cv = TfidfVectorizer(norm='l1', max_features = 1000, analyzer = 'word', strip_accents='unicode', binary=True)
# NOTE(review): the vectorizer is fitted on all posts, before the train/test split done in a later cell -- confirm this is intended
train_features = cv.fit_transform(posts_frame['text']).toarray() # vectorizing texts
train_frame = posts_frame.join(pd.DataFrame(train_features, columns=cv.get_feature_names())) # word features become extra columns

train_frame.drop(['likes_number','text'],inplace=True,axis=1,errors='ignore') # target and raw text are not features
value_frame = posts_frame['likes_number'] # target: number of likes

In [248]:
# Summary statistics of the object-feature matrix.
train_frame.describe()

Unnamed: 0,pinned,post_hour,post_month,repost,signed,text_length,with_attachment,000,10,100,...,экономист,экономическии,электричка,юридическии,юрист,юрфак,являться,язык,якобы,ясин
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,0.0,15.18,3.732,0.029333,0.0,239.930667,0.223333,0.000252,0.001951,0.000489,...,0.001051,0.001328,0.000691,0.000534,0.000542,0.000291,0.000997,0.001482,0.000265,0.000243
std,0.0,6.067191,1.664339,0.168795,0.0,357.6688,0.416619,0.004742,0.01947,0.005204,...,0.012615,0.018117,0.011144,0.009194,0.007822,0.004757,0.015235,0.013865,0.00527,0.004234
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,12.0,2.0,0.0,0.0,67.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,16.0,4.0,0.0,0.0,134.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,20.0,5.0,0.0,0.0,275.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,23.0,6.0,1.0,0.0,5035.0,1.0,0.12934,0.441839,0.091979,...,0.334288,0.504354,0.238118,0.21387,0.201969,0.103269,0.462557,0.236528,0.174195,0.097465


Saving train frame to file

In [249]:
# train_frame.to_csv('traindata.csv')

# 4. Comparing different methods

Splitting into train and test samples

In [250]:
# NOTE(review): sklearn.cross_validation is the old home of train_test_split; newer
# scikit-learn releases expose it from sklearn.model_selection -- confirm the target version.
from sklearn.cross_validation import train_test_split

# Splitting into train and test samples: 30% of the posts are held out; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(train_frame, value_frame, test_size=0.3, random_state=42)

Importing libs

In [291]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import numpy as np


In [269]:
def compare(est, param, est_name):
    """Grid-search ``est`` over ``param`` and print its scores.

    The search is fitted on the notebook-level X_train / y_train. Its
    predictions for X_test are then scored against y_test with MSE and R^2.

    Args:
        est: estimator instance to tune.
        param: parameter grid handed to GridSearchCV.
        est_name: label used in the printed lines.

    Returns:
        None; the three scores are only printed.
    """
    search = GridSearchCV(est, param, n_jobs = -1)
    search.fit(X_train, y_train)
    print('CV best score for', est_name, ': ', search.best_score_)

    y_pred = search.predict(X_test)
    print('MSE for', est_name, ':' , mean_squared_error(y_test, y_pred))
    print('R^2 for', est_name, ':' , r2_score(y_test, y_pred))

## 4.1 Linear regression

### 4.1.1 Simple linear regression

In [273]:
# Try plain least squares with and without an intercept and input normalisation.
parameters = {'fit_intercept':[True, False],'normalize':[True, False]}
compare(LinearRegression(), parameters, 'simple linear regression')

CV best score for simple linear regression :  -92.6627023853
MSE for simple linear regression : 389888.390702
R^2 for simple linear regression : -420.516274215


### 4.1.2 Linear regression with L1 regularization

In [274]:
# Lasso is not imported by any earlier cell, so a fresh top-to-bottom run
# would stop here with a NameError; import it where it is used.
from sklearn.linear_model import Lasso

# Grid over the L1 penalty strength (alpha = 1, 6, ..., 96) and the solver options.
parameters = {'alpha':np.arange(1, 100, 5), 'positive':[True, False],'normalize':[True, False],
              'selection':['cyclic', 'random']}
compare(Lasso(), parameters, 'linear regression with L1 regularization')

CV best score for linear regression with L1 regularization :  0.224873824274
MSE for linear regression with L1 regularization : 679.171584125
R^2 for linear regression with L1 regularization : 0.26573377787


### 4.1.3 Linear regression with L2 regularization

In [279]:
# Ridge is not imported by any earlier cell, so a fresh top-to-bottom run
# would stop here with a NameError; import it where it is used.
from sklearn.linear_model import Ridge

# 101 penalty strengths spaced logarithmically between 1e1 and 1e10.
# The number of samples must be an integer: the former 101.00 is a float.
parameters = {'alpha':np.logspace(1.0, 10.0, 101), 'fit_intercept':[True, False],'normalize':[True, False],
              'solver':['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'auto']}
compare(Ridge(), parameters, 'linear regression with L2 regularization')

CV best score for linear regression with L2 regularization :  0.27902482471
MSE for linear regression with L2 regularization : 666.251136282
R^2 for linear regression with L2 regularization : 0.279702336991


## 4.2 Decision trees and random forests

### 4.2.1 DecisionTreeClassifier

In [281]:
# NOTE(review): the target is a like count and compare() reports MSE / R^2, yet a
# classifier is tuned here, so every distinct like count is treated as a class --
# confirm whether a regressor was intended.
parameters = {'presort':[True, False],'max_depth': np.arange(1, 20), 'class_weight':['balanced', None], 'splitter':['random', 'best'],
             'max_features':['auto', 'sqrt', 'log2', None]}
compare(tree.DecisionTreeClassifier(), parameters, 'decision tree classifier')



CV best score for decision tree classifier :  0.174285714286
MSE for decision tree classifier : 1215.97333333
R^2 for decision tree classifier : -0.314613518214


### 4.2.2 RandomForestClassifier

In [294]:
# NOTE(review): the target is a like count and compare() reports MSE / R^2, yet a
# classifier is tuned here -- confirm whether a regressor was intended. The recorded
# output of this cell is a JoblibValueError, so no scores were produced.
parameters = {'n_estimators':[10, 20, 30], 'class_weight':['balanced', 'balanced_subsample', None],
             'max_features':['auto', 'sqrt', 'log2', None]}
compare(RandomForestClassifier(), parameters, 'random forest classifier')
# left out of the grid: 'max_depth': np.arange(2, 20)



JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
C:\Users\bulga\Anaconda3\lib\runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    165         sys.exit(msg)
    166     main_globals = sys.modules["__main__"].__dict__
    167     if alter_argv:
    168         sys.argv[0] = mod_spec.origin
    169     return _run_code(code, main_globals, None,
--> 170                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel.__main__', loader=<_f...da3\\lib\\site-packages\\ipykernel\\__main__.py')
    171 
    172 def run_module(mod_name, init_globals=None,
    173                run_name=None, alter_sys=False):
    174     """Execute a module's code without importing it

...........................................................................
C:\Users\bulga\Anaconda3\lib\runpy.py in _run_code(code=<code object <module> at 0x0000022788557780, fil...lib\site-packages\ipykernel\__main__.py", line 1>, run_globals={'__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\Users\bulga\Anaconda3\lib\site-packages\ipykernel\__pycache__\__main__.cpython-35.pyc', '__doc__': None, '__file__': r'C:\Users\bulga\Anaconda3\lib\site-packages\ipykernel\__main__.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...da3\\lib\\site-packages\\ipykernel\\__main__.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\Users\\b...a3\\lib\\site-packages\\ipykernel\\kernelapp.py'>}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel.__main__', loader=<_f...da3\\lib\\site-packages\\ipykernel\\__main__.py'), pkg_name='ipykernel', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x0000022788557780, fil...lib\site-packages\ipykernel\__main__.py", line 1>
        run_globals = {'__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\Users\bulga\Anaconda3\lib\site-packages\ipykernel\__pycache__\__main__.cpython-35.pyc', '__doc__': None, '__file__': r'C:\Users\bulga\Anaconda3\lib\site-packages\ipykernel\__main__.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...da3\\lib\\site-packages\\ipykernel\\__main__.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\Users\\b...a3\\lib\\site-packages\\ipykernel\\kernelapp.py'>}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\ipykernel\__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\traitlets\config\application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    591         
    592         If a global instance already exists, this reinitializes and starts it
    593         """
    594         app = cls.instance(**kwargs)
    595         app.initialize(argv)
--> 596         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    597 
    598 #-----------------------------------------------------------------------------
    599 # utility functions, for convenience
    600 #-----------------------------------------------------------------------------

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\ipykernel\kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    437         
    438         if self.poller is not None:
    439             self.poller.start()
    440         self.kernel.start()
    441         try:
--> 442             ioloop.IOLoop.instance().start()
    443         except KeyboardInterrupt:
    444             pass
    445 
    446 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\tornado\ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    878                 self._events.update(event_pairs)
    879                 while self._events:
    880                     fd, events = self._events.popitem()
    881                     try:
    882                         fd_obj, handler_func = self._handlers[fd]
--> 883                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    884                     except (OSError, IOError) as e:
    885                         if errno_from_exception(e) == errno.EPIPE:
    886                             # Happens when the client closes the connection
    887                             pass

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    271         if self.control_stream:
    272             self.control_stream.on_recv(self.dispatch_control, copy=False)
    273 
    274         def make_dispatcher(stream):
    275             def dispatcher(msg):
--> 276                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    277             return dispatcher
    278 
    279         for s in self.shell_streams:
    280             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': "parameters = {'n_estimators':[10, 20, 30], 'clas...sifier(), parameters, 'random forest classifier')", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-06-12T21:21:09.624392', 'msg_id': 'B856D63C29C847D79CAD08338080828B', 'msg_type': 'execute_request', 'session': 'A53E01A4C04342AA84B2F304A859EF18', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'B856D63C29C847D79CAD08338080828B', 'msg_type': 'execute_request', 'parent_header': {}})
    223             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    224         else:
    225             self.log.debug("%s: %s", msg_type, msg)
    226             self.pre_handler_hook()
    227             try:
--> 228                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'A53E01A4C04342AA84B2F304A859EF18']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': "parameters = {'n_estimators':[10, 20, 30], 'clas...sifier(), parameters, 'random forest classifier')", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-06-12T21:21:09.624392', 'msg_id': 'B856D63C29C847D79CAD08338080828B', 'msg_type': 'execute_request', 'session': 'A53E01A4C04342AA84B2F304A859EF18', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'B856D63C29C847D79CAD08338080828B', 'msg_type': 'execute_request', 'parent_header': {}}
    229             except Exception:
    230                 self.log.error("Exception in message handler:", exc_info=True)
    231             finally:
    232                 self.post_handler_hook()

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'A53E01A4C04342AA84B2F304A859EF18'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': "parameters = {'n_estimators':[10, 20, 30], 'clas...sifier(), parameters, 'random forest classifier')", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-06-12T21:21:09.624392', 'msg_id': 'B856D63C29C847D79CAD08338080828B', 'msg_type': 'execute_request', 'session': 'A53E01A4C04342AA84B2F304A859EF18', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'B856D63C29C847D79CAD08338080828B', 'msg_type': 'execute_request', 'parent_header': {}})
    386         if not silent:
    387             self.execution_count += 1
    388             self._publish_execute_input(code, parent, self.execution_count)
    389 
    390         reply_content = self.do_execute(code, silent, store_history,
--> 391                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    392 
    393         # Flush output before sending the reply.
    394         sys.stdout.flush()
    395         sys.stderr.flush()

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\ipykernel\ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code="parameters = {'n_estimators':[10, 20, 30], 'clas...sifier(), parameters, 'random forest classifier')", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    194 
    195         reply_content = {}
    196         # FIXME: the shell calls the exception handler itself.
    197         shell._reply_content = None
    198         try:
--> 199             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method InteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = "parameters = {'n_estimators':[10, 20, 30], 'clas...sifier(), parameters, 'random forest classifier')"
        store_history = True
        silent = False
    200         except:
    201             status = u'error'
    202             # FIXME: this code right now isn't being used yet by default,
    203             # because the run_cell() call above directly fires off exception

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="parameters = {'n_estimators':[10, 20, 30], 'clas...sifier(), parameters, 'random forest classifier')", store_history=True, silent=False, shell_futures=True)
   2718                 self.displayhook.exec_result = result
   2719 
   2720                 # Execute the user code
   2721                 interactivity = "none" if silent else self.ast_node_interactivity
   2722                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2723                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2724 
   2725                 # Reset this so later displayed values do not modify the
   2726                 # ExecutionResult
   2727                 self.displayhook.exec_result = None

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-294-26b1cb07d3bb>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2826                     return True
   2827 
   2828             for i, node in enumerate(to_run_interactive):
   2829                 mod = ast.Interactive([node])
   2830                 code = compiler(mod, cell_name, "single")
-> 2831                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x0000022796E7EAE0, file "<ipython-input-294-26b1cb07d3bb>", line 3>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   2832                     return True
   2833 
   2834             # Flush softspace
   2835             if softspace(sys.stdout, 0):

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x0000022796E7EAE0, file "<ipython-input-294-26b1cb07d3bb>", line 3>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2880         outflag = 1  # happens in more places, so it's easier as default
   2881         try:
   2882             try:
   2883                 self.hooks.pre_run_code_hook()
   2884                 #rprint('Running code', repr(code_obj)) # dbg
-> 2885                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x0000022796E7EAE0, file "<ipython-input-294-26b1cb07d3bb>", line 3>
        self.user_global_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'In': ['', "try:\n    loaded_posts = load_all_posts(page=sele...n to occur\n    print('Error occured! Try again.')", "try:\n    loaded_posts = load_all_posts(page=sele...n to occur\n    print('Error occured! Try again.')", "try:\n    loaded_posts = load_all_posts(page=sele...n to occur\n    print('Error occured! Try again.')", '# !pip install vk # makes it quiet\nimport vk', 'vk_session = vk.Session() # starting new session\nvk_api = vk.API(vk_session)', "selected_group = 'hse_overheard' # no other idea... of posts in selected group: ', posts_number - 1)", 'def load_all_posts(page, n_posts, api):\n    all_...len(s) - 1 # update n_loaded\n    return all_posts', "try:\n    loaded_posts = load_all_posts(page=sele...n to occur\n    print('Error occured! Try again.')", '# !pip install pymorphy2 -q # silent install aga...from IPython.display import display # progressbar', 'def split_text(text):\n    tokenizer = TweetToken...ltered_words) # joining words to a sentence again', "def convert_posts(posts_list):\n    progress = In...ion = 'Done convertion!'\n    return updated_posts", 'converted_posts = convert_posts(loaded_posts)', 'import pandas as pd', 'posts_frame = pd.DataFrame(converted_posts)\nposts_frame.head()', 'posts_frame.describe()', "from sklearn.feature_extraction.text import Coun...columns\nvalue_frame = posts_frame['likes_number']", 'train_frame.describe()', "# train_frame.to_csv('traindata.csv')", 'from sklearn.cross_validation import train_test_...ame, value_frame, test_size=0.3, random_state=42)', ...], 'IntProgress': <class 'ipywidgets.widgets.widget_int.IntProgress'>, 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 
'sklearn.linear_model.base.LinearRegression'>, 'Out': {14:    likes_number  pinned  post_hour  post_month  ...    0  
3                0  
4                0  , 15:        likes_number  pinned   post_hour  post_mo...  0.00000  
max     2270.00000          1.00000  , 17:        pinned   post_hour  post_month     repost...  1.000000    1.000000  

[8 rows x 4031 columns], 25: 0.0068924852394986564, 27: 0.018093065218199256, 28: {'alpha': 1, 'normalize': False, 'positive': True, 'selection': 'cyclic'}, 30: -9962768471560722.0, 34: -9962768471560722.0, 39: dict_keys(['max_leaf_nodes', 'class_weight', 'pr...min_samples_split', 'random_state', 'criterion']), 40: 0.2742857142857143, ...}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'Ridge': <class 'sklearn.linear_model.ridge.Ridge'>, ...}
        self.user_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'In': ['', "try:\n    loaded_posts = load_all_posts(page=sele...n to occur\n    print('Error occured! Try again.')", "try:\n    loaded_posts = load_all_posts(page=sele...n to occur\n    print('Error occured! Try again.')", "try:\n    loaded_posts = load_all_posts(page=sele...n to occur\n    print('Error occured! Try again.')", '# !pip install vk # makes it quiet\nimport vk', 'vk_session = vk.Session() # starting new session\nvk_api = vk.API(vk_session)', "selected_group = 'hse_overheard' # no other idea... of posts in selected group: ', posts_number - 1)", 'def load_all_posts(page, n_posts, api):\n    all_...len(s) - 1 # update n_loaded\n    return all_posts', "try:\n    loaded_posts = load_all_posts(page=sele...n to occur\n    print('Error occured! Try again.')", '# !pip install pymorphy2 -q # silent install aga...from IPython.display import display # progressbar', 'def split_text(text):\n    tokenizer = TweetToken...ltered_words) # joining words to a sentence again', "def convert_posts(posts_list):\n    progress = In...ion = 'Done convertion!'\n    return updated_posts", 'converted_posts = convert_posts(loaded_posts)', 'import pandas as pd', 'posts_frame = pd.DataFrame(converted_posts)\nposts_frame.head()', 'posts_frame.describe()', "from sklearn.feature_extraction.text import Coun...columns\nvalue_frame = posts_frame['likes_number']", 'train_frame.describe()', "# train_frame.to_csv('traindata.csv')", 'from sklearn.cross_validation import train_test_...ame, value_frame, test_size=0.3, random_state=42)', ...], 'IntProgress': <class 'ipywidgets.widgets.widget_int.IntProgress'>, 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, 
'Out': {14:    likes_number  pinned  post_hour  post_month  ...    0  
3                0  
4                0  , 15:        likes_number  pinned   post_hour  post_mo...  0.00000  
max     2270.00000          1.00000  , 17:        pinned   post_hour  post_month     repost...  1.000000    1.000000  

[8 rows x 4031 columns], 25: 0.0068924852394986564, 27: 0.018093065218199256, 28: {'alpha': 1, 'normalize': False, 'positive': True, 'selection': 'cyclic'}, 30: -9962768471560722.0, 34: -9962768471560722.0, 39: dict_keys(['max_leaf_nodes', 'class_weight', 'pr...min_samples_split', 'random_state', 'criterion']), 40: 0.2742857142857143, ...}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'Ridge': <class 'sklearn.linear_model.ridge.Ridge'>, ...}
   2886             finally:
   2887                 # Reset our crash handler in place
   2888                 sys.excepthook = old_excepthook
   2889         except SystemExit as e:

...........................................................................
C:\Users\bulga\Dropbox\Учеба\ВШЭ\Майнор\HSE_DataMiningHW\DataAnalysis\Homeworks\Solutions\Project\Part 2\<ipython-input-294-26b1cb07d3bb> in <module>()
      1 
      2 
----> 3 
      4 parameters = {'n_estimators':[10, 20, 30], 'class_weight':['balanced', 'balanced_subsample', None],
      5              'max_features':['auto', 'sqrt', 'log2', None]}
      6 compare(RandomForestClassifier(), parameters, 'random forest classifier')
      7 
      8 
      9 
     10 

...........................................................................
C:\Users\bulga\Dropbox\Учеба\ВШЭ\Майнор\HSE_DataMiningHW\DataAnalysis\Homeworks\Solutions\Project\Part 2\<ipython-input-269-b41b5f451bc3> in compare(est=RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), param={'class_weight': ['balanced', 'balanced_subsample', None], 'max_features': ['auto', 'sqrt', 'log2', None], 'n_estimators': [10, 20, 30]}, est_name='random forest classifier')
      1 
      2 
----> 3 
      4 def compare(est, param, est_name):
      5     cv = GridSearchCV(est, param, n_jobs = -1)
      6     cv.fit(X_train, y_train);
      7     print('CV best score for', est_name, ': ', cv.best_score_)
      8     
      9     predicted = cv.predict(X_test)
     10     mse = mean_squared_error(y_test, predicted)
     11     print('MSE for', est_name, ':' , mse)
     12     r2 = r2_score(y_test, predicted)
     13     print('R^2 for', est_name, ':' , r2)    

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\grid_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ...='2*n_jobs', refit=True, scoring=None, verbose=0), X=      pinned  post_hour  post_month  repost  sig...   0.0  
1126   0.0  

[1050 rows x 1007 columns], y=485      17
527       5
199       2
889      16
...  44
1126     18
Name: likes_number, dtype: int64)
    799         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
    800             Target relative to X for classification or regression;
    801             None for unsupervised learning.
    802 
    803         """
--> 804         return self._fit(X, y, ParameterGrid(self.param_grid))
        self._fit = <bound method BaseSearchCV._fit of GridSearchCV(...'2*n_jobs', refit=True, scoring=None, verbose=0)>
        X =       pinned  post_hour  post_month  repost  sig...   0.0  
1126   0.0  

[1050 rows x 1007 columns]
        y = 485      17
527       5
199       2
889      16
...  44
1126     18
Name: likes_number, dtype: int64
        self.param_grid = {'class_weight': ['balanced', 'balanced_subsample', None], 'max_features': ['auto', 'sqrt', 'log2', None], 'n_estimators': [10, 20, 30]}
    805 
    806 
    807 class RandomizedSearchCV(BaseSearchCV):
    808     """Randomized search on hyper parameters.

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\grid_search.py in _fit(self=GridSearchCV(cv=None, error_score='raise',
     ...='2*n_jobs', refit=True, scoring=None, verbose=0), X=      pinned  post_hour  post_month  repost  sig...   0.0  
1126   0.0  

[1050 rows x 1007 columns], y=485      17
527       5
199       2
889      16
...  44
1126     18
Name: likes_number, dtype: int64, parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
    548         )(
    549             delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    550                                     train, test, self.verbose, parameters,
    551                                     self.fit_params, return_parameters=True,
    552                                     error_score=self.error_score)
--> 553                 for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.grid_search.ParameterGrid object>
    554                 for train, test in cv)
    555 
    556         # Out is a list of triplet: score, estimator, n_test_samples
    557         n_fits = len(out)

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV._fit.<locals>.<genexpr>>)
    805             if pre_dispatch == "all" or n_jobs == 1:
    806                 # The iterable was consumed all at once by the above for loop.
    807                 # No need to wait for async callbacks to trigger to
    808                 # consumption.
    809                 self._iterating = False
--> 810             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    811             # Make sure that we get a last message telling us we are done
    812             elapsed_time = time.time() - self._start_time
    813             self._print('Done %3i out of %3i | elapsed: %s finished',
    814                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Sun Jun 12 21:21:37 2016
PID: 1632                 Python 3.5.1: C:\Users\bulga\Anaconda3\python.exe
...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False),       pinned  post_hour  post_month  repost  sig...   0.0  
1126   0.0  

[1050 rows x 1007 columns], 485      17
527       5
199       2
889      16
...  44
1126     18
Name: likes_number, dtype: int64, <function _passthrough_scorer>, array([  86,  163,  191,  214,  215,  235,  259,...1043, 1044, 1045, 1046, 1047, 1048,
       1049]), array([   0,    1,    2,    3,    4,    5,    6,... 951,  952,  958,
        986,  987, 1012, 1038]), 0, {'class_weight': 'balanced_subsample', 'max_features': 'auto', 'n_estimators': 10}, {}), {'error_score': 'raise', 'return_parameters': True})]
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0=<list_iterator object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False),       pinned  post_hour  post_month  repost  sig...   0.0  
1126   0.0  

[1050 rows x 1007 columns], 485      17
527       5
199       2
889      16
...  44
1126     18
Name: likes_number, dtype: int64, <function _passthrough_scorer>, array([  86,  163,  191,  214,  215,  235,  259,...1043, 1044, 1045, 1046, 1047, 1048,
       1049]), array([   0,    1,    2,    3,    4,    5,    6,... 951,  952,  958,
        986,  987, 1012, 1038]), 0, {'class_weight': 'balanced_subsample', 'max_features': 'auto', 'n_estimators': 10}, {})
        kwargs = {'error_score': 'raise', 'return_parameters': True}
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _fit_and_score(estimator=RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), X=      pinned  post_hour  post_month  repost  sig...   0.0  
1126   0.0  

[1050 rows x 1007 columns], y=485      17
527       5
199       2
889      16
...  44
1126     18
Name: likes_number, dtype: int64, scorer=<function _passthrough_scorer>, train=array([  86,  163,  191,  214,  215,  235,  259,...1043, 1044, 1045, 1046, 1047, 1048,
       1049]), test=array([   0,    1,    2,    3,    4,    5,    6,... 951,  952,  958,
        986,  987, 1012, 1038]), verbose=0, parameters={'class_weight': 'balanced_subsample', 'max_features': 'auto', 'n_estimators': 10}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
   1526 
   1527     try:
   1528         if y_train is None:
   1529             estimator.fit(X_train, **fit_params)
   1530         else:
-> 1531             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method BaseForest.fit of RandomForestClas...e=None, verbose=0,
            warm_start=False)>
        X_train =       pinned  post_hour  post_month  repost  sig...000  
1126  0.000000  

[650 rows x 1007 columns]
        y_train = 1387     73
1478    150
915      73
638      16
...  44
1126     18
Name: likes_number, dtype: int64
        fit_params = {}
   1532 
   1533     except Exception as e:
   1534         if error_score == 'raise':
   1535             raise

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in fit(self=RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), X=array([[  0.,  15.,   1., ...,   0.,   0.,   0.]...1.,   2., ...,   0.,   0.,   0.]], dtype=float32), y=array([[ 65.],
       [ 74.],
       [ 65.],
   ....],
       [ 39.],
       [ 44.],
       [ 18.]]), sample_weight=None)
    285             trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
    286                              backend="threading")(
    287                 delayed(_parallel_build_trees)(
    288                     t, self, X, y, sample_weight, i, len(trees),
    289                     verbose=self.verbose, class_weight=self.class_weight)
--> 290                 for i, t in enumerate(trees))
        i = 9
    291 
    292             # Collect newly grown trees
    293             self.estimators_.extend(trees)
    294 

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=1), iterable=<generator object BaseForest.fit.<locals>.<genexpr>>)
    795         self._smoothed_batch_duration = 0.0
    796         try:
    797             # Only set self._iterating to True if at least a batch
    798             # was dispatched. In particular this covers the edge
    799             # case of Parallel used with an exhausted iterator.
--> 800             while self.dispatch_one_batch(iterator):
        self.dispatch_one_batch = <bound method Parallel.dispatch_one_batch of Parallel(n_jobs=1)>
        iterator = <generator object BaseForest.fit.<locals>.<genexpr>>
    801                 self._iterating = True
    802             else:
    803                 self._iterating = False
    804 

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self=Parallel(n_jobs=1), iterator=<generator object BaseForest.fit.<locals>.<genexpr>>)
    653             tasks = BatchedCalls(itertools.islice(iterator, batch_size))
    654             if not tasks:
    655                 # No more tasks available in the iterator: tell caller to stop.
    656                 return False
    657             else:
--> 658                 self._dispatch(tasks)
        self._dispatch = <bound method Parallel._dispatch of Parallel(n_jobs=1)>
        tasks = <sklearn.externals.joblib.parallel.BatchedCalls object>
    659                 return True
    660 
    661     def _print(self, msg, msg_args):
    662         """Display the message on stout or stderr depending on verbosity"""

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self=Parallel(n_jobs=1), batch=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    561         # If job.get() catches an exception, it closes the queue:
    562         if self._aborting:
    563             return
    564 
    565         if self._pool is None:
--> 566             job = ImmediateComputeBatch(batch)
        job = undefined
        batch = <sklearn.externals.joblib.parallel.BatchedCalls object>
    567             self._jobs.append(job)
    568             self.n_dispatched_batches += 1
    569             self.n_dispatched_tasks += len(batch)
    570             self.n_completed_tasks += len(batch)

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __init__(self=<sklearn.externals.joblib.parallel.ImmediateComputeBatch object>, batch=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    175 
    176     """
    177     def __init__(self, batch):
    178         # Don't delay the application, to avoid keeping the input
    179         # arguments in memory
--> 180         self.results = batch()
        self.results = undefined
        batch = <sklearn.externals.joblib.parallel.BatchedCalls object>
    181 
    182     def get(self):
    183         return self.results
    184 

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _parallel_build_trees>, (DecisionTreeClassifier(class_weight=None, criter...t=False, random_state=608667729, splitter='best'), RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), array([[  0.,  15.,   1., ...,   0.,   0.,   0.]...1.,   2., ...,   0.,   0.,   0.]], dtype=float32), array([[ 65.],
       [ 74.],
       [ 65.],
   ....],
       [ 39.],
       [ 44.],
       [ 18.]]), None, 0, 10), {'class_weight': 'balanced_subsample', 'verbose': 0})]
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0=<list_iterator object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _parallel_build_trees>
        args = (DecisionTreeClassifier(class_weight=None, criter...t=False, random_state=608667729, splitter='best'), RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), array([[  0.,  15.,   1., ...,   0.,   0.,   0.]...1.,   2., ...,   0.,   0.,   0.]], dtype=float32), array([[ 65.],
       [ 74.],
       [ 65.],
   ....],
       [ 39.],
       [ 44.],
       [ 18.]]), None, 0, 10)
        kwargs = {'class_weight': 'balanced_subsample', 'verbose': 0}
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in _parallel_build_trees(tree=DecisionTreeClassifier(class_weight=None, criter...t=False, random_state=608667729, splitter='best'), forest=RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), X=array([[  0.,  15.,   1., ...,   0.,   0.,   0.]...1.,   2., ...,   0.,   0.,   0.]], dtype=float32), y=array([[ 65.],
       [ 74.],
       [ 65.],
   ....],
       [ 39.],
       [ 44.],
       [ 18.]]), sample_weight=None, tree_idx=0, n_trees=10, verbose=0, class_weight='balanced_subsample')
    109         if class_weight == 'subsample':
    110             with warnings.catch_warnings():
    111                 warnings.simplefilter('ignore', DeprecationWarning)
    112                 curr_sample_weight *= compute_sample_weight('auto', y, indices)
    113         elif class_weight == 'balanced_subsample':
--> 114             curr_sample_weight *= compute_sample_weight('balanced', y, indices)
        curr_sample_weight = array([ 0.,  1.,  1.,  1.,  0.,  0.,  2.,  3.,  ...1.,  1.,  3.,  1.,  0.,  2.,  0.,  2.,  3.,  0.])
        y = array([[ 65.],
       [ 74.],
       [ 65.],
   ....],
       [ 39.],
       [ 44.],
       [ 18.]])
        indices = array([286,  41, 345, 507, 409, 232, 108,   2, 3...26, 453, 648, 290, 195, 523, 562, 453, 551, 103])
    115 
    116         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    117     else:
    118         tree.fit(X, y, sample_weight=sample_weight, check_input=False)

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\sklearn\utils\class_weight.py in compute_sample_weight(class_weight='balanced', y=array([[ 65.],
       [ 74.],
       [ 65.],
   ....],
       [ 39.],
       [ 44.],
       [ 18.]]), indices=array([286,  41, 345, 507, 409, 232, 108,   2, 3...26, 453, 648, 290, 195, 523, 562, 453, 551, 103]))
    157             weight_k = np.choose(np.searchsorted(classes_subsample,
    158                                                  classes_full),
    159                                  compute_class_weight(class_weight_k,
    160                                                       classes_subsample,
    161                                                       y_subsample),
--> 162                                  mode='clip')
    163 
    164             classes_missing = set(classes_full) - set(classes_subsample)
    165         else:
    166             weight_k = compute_class_weight(class_weight_k,

...........................................................................
C:\Users\bulga\Anaconda3\lib\site-packages\numpy\core\fromnumeric.py in choose(a=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 1...    62, 62, 63, 64, 65, 66, 67, 68], dtype=int64), choices=array([ 0.08722491,  0.17444981,  0.21907651,  0...42028986,  9.42028986,  9.42028986,  9.42028986]), out=None, mode='clip')
    346     """
    347     try:
    348         choose = a.choose
    349     except AttributeError:
    350         return _wrapit(a, 'choose', choices, out=out, mode=mode)
--> 351     return choose(choices, out=out, mode=mode)
        choose = <built-in method choose of numpy.ndarray object>
        choices = array([ 0.08722491,  0.17444981,  0.21907651,  0...42028986,  9.42028986,  9.42028986,  9.42028986])
        out = None
        mode = 'clip'
    352 
    353 
    354 def repeat(a, repeats, axis=None):
    355     """

ValueError: Need between 2 and (32) array objects (inclusive).
___________________________________________________________________________

### 4.3 kNN

In [300]:
# Hyper-parameter grid for the kNN search:
#   leaf_size   -> 30, 40, ..., 90
#   n_neighbors -> 5, 6, ..., 19
#   algorithm   -> every neighbour-search backend listed below
# NOTE(review): compare() prints MSE and R^2 for the predictions although the
# estimator is a classifier -- confirm that treating like counts as class
# labels is intended here.
parameters = dict(
    leaf_size=np.arange(30, 100, 10),
    n_neighbors=np.arange(5, 20),
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)
compare(KNeighborsClassifier(), parameters, 'kNN')



CV best score for kNN :  0.150476190476
MSE for kNN : 1027.62666667
R^2 for kNN : -0.11098810364
