In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

In [2]:
scaler = MinMaxScaler()

Чтение датасетов

In [7]:
df = pd.read_csv('bfd/train_dataset.csv')
df = df[(~df.TailNum.isnull()) & (~df.DepTime.isnull()) & (~df.AirTime.isnull())]

test = pd.read_csv('bfd/test_dataset.csv').replace([np.nan, np.inf, -np.inf], 0)
test_ids = test.Id.values
test.drop('Id', axis=1, inplace=True)

Считаем простые признаки. Одновременно для трейн и тест выборок. Так меньше шансов ошибиться

In [8]:
df['speed'] = (60 * df.Distance/df.AirTime).replace([np.nan, np.inf, -np.inf], 0)
test['speed'] = (60 * test.Distance/test.AirTime).replace([np.nan, np.inf, -np.inf], 0)

In [9]:
df['diff_arr_dep_time'] = df.ArrTime - df.DepTime
test['diff_arr_dep_time'] = test.ArrTime - test.DepTime

In [10]:
df['elapsed_time'] = df.ArrTime + df.TaxiIn + df.TaxiOut
test['elapsed_time'] = test.ArrTime + test.TaxiIn + test.TaxiOut

In [None]:
def season(x):
    if x in [12, 1, 2]:
        return 'winter'
    elif x in [3,4,5]:
        return 'spring'
    elif x in [6,7,8]:
        return 'summer'
    else:
        return 'fall'

df['season'] = df.Month.apply(lambda x: season(x))
test['season'] = test.Month.apply(lambda x: season(x))

In [None]:
def timefl(x):
    begin = x['DepTime']
    end = x['ArrTime']
    if end < begin :
        end += 2400
    bh = int(begin/100)
    eh = int(end/100)
    bm = int(begin%100)
    em = int(end%100)
    answerm = (eh - bh)*60 + em - bm
    return answerm

df['FullTime'] = df.apply(timefl, axis = 1)
test['FullTime'] = df.apply(timefl, axis = 1)

Перемешиваем датасет и разбиваем на 2 части. По одной части будем насчитывать статистику, а на другой будем обучать модель

In [9]:
df = shuffle(df)

x_count, x_train = train_test_split(df, test_size=0.3, random_state=42)
print(x_count.shape, x_train.shape)

(2399348, 21) (1028292, 21)


Дальше будет код для подсчета эмпирического(наблюдаемого) среднего по различным признакам. Обычно вместо того чтобы использовать среднее по историческим данным, используют так называемые <b>log odds</b>. Подумайте почему :)

In [24]:
import math
def logit(x):
    return math.log((x / (1-x)) + 1e-5)

def logit_vec(v):
    return logit(v.mean()) # guess why ?

In [25]:
day_mean = df.loc[:, ['DayOfWeek', 'target']].groupby('DayOfWeek').agg(logit_vec).reset_index()
day_mean.rename(columns={'target': 'day_mean_target'}, inplace=True)
day_mean.head()

Unnamed: 0,DayOfWeek,day_mean_target
0,1,-0.83596
1,2,-0.983972
2,3,-0.987045
3,4,-0.840462
4,5,-0.681037


In [26]:
month_mean = df.loc[:, ['Month', 'target']].groupby('Month').agg(logit_vec).reset_index()
month_mean.rename(columns={'target': 'month_mean_target'}, inplace=True)
month_mean.head()

Unnamed: 0,Month,month_mean_target
0,1,-0.716806
1,2,-0.563873
2,3,-0.612841
3,4,-0.947695
4,5,-0.985031


In [27]:
import warnings
warnings.filterwarnings("ignore")

In [28]:
flight_c_volume = df.loc[:, ['target', 'UniqueCarrier']].groupby(['UniqueCarrier']).agg('count').reset_index()
flight_c_volume.rename(columns={'target': 'flight_carrier_volume'}, inplace=True)
flight_c_volume['flight_carrier_volume'] = scaler.fit_transform(flight_c_volume['flight_carrier_volume'].reshape(-1,1))
flight_c_volume.head()

Unnamed: 0,UniqueCarrier,flight_carrier_volume
0,9E,0.208778
1,AA,0.490782
2,AQ,0.0
3,AS,0.119449
4,B6,0.156475


In [29]:
flight_volume = df.loc[:, ['target', 'TailNum']].groupby(['TailNum']).agg('count').reset_index()
flight_volume.rename(columns={'target': 'flight_volume'}, inplace=True)
flight_volume['flight_volume'] = scaler.fit_transform(flight_volume['flight_volume'].reshape(-1,1))
flight_volume.head()

Unnamed: 0,TailNum,flight_volume
0,80009E,0.390379
1,80019E,0.395913
2,80059E,0.395913
3,80129E,0.407407
4,80139E,0.398467


In [30]:
season = x_count.loc[:, ['target', 'season']].groupby('season').agg(logit_vec).reset_index()
season.rename(columns={'target': 'season_empirical_mean'}, inplace=True)
season.head()

Unnamed: 0,season,season_empirical_mean
0,fall,-1.363724
1,spring,-0.845076
2,summer,-0.770963
3,winter,-0.562199


In [31]:
un_carrier = x_count.loc[:, ['target', 'UniqueCarrier']].groupby('UniqueCarrier').agg(logit_vec).reset_index()
un_carrier.rename(columns={'target': 'un_carrier_empirical_mean'}, inplace=True)
un_carrier.head()

Unnamed: 0,UniqueCarrier,un_carrier_empirical_mean
0,9E,-1.311483
1,AA,-0.661085
2,AQ,-2.104625
3,AS,-0.948923
4,B6,-0.859654


In [32]:
origin = x_count.loc[:, ['target', 'Origin']].groupby('Origin').agg(logit_vec).reset_index()
print(origin.columns)
origin.rename(columns={'target': 'origin_empirical_mean'}, inplace=True)
origin.head()

Index(['Origin', 'target'], dtype='object')


Unnamed: 0,Origin,origin_empirical_mean
0,ABE,-1.283641
1,ABI,-1.66041
2,ABQ,-0.948466
3,ABY,-0.962785
4,ACK,-0.03921


In [33]:
dest = x_count.loc[:, ['target', 'Dest']].groupby('Dest').agg(logit_vec).reset_index()
dest.rename(columns={'target': 'dest_empirical_mean'}, inplace=True)
dest.head()

Unnamed: 0,Dest,dest_empirical_mean
0,ABE,-0.928064
1,ABI,-0.613735
2,ABQ,-0.743714
3,ABY,-0.73794
4,ACK,-0.514439


По некоторым самолётам очень мало статистики. Поэтому здесь нужно либо воспользоваться обычным средним, либо размышлять над тем как сгладить средее. Поскольку это код бейзлайна, я просто воспользуюсь средним.

In [34]:
tailnum = x_count.loc[:, ['target', 'TailNum']].groupby('TailNum').agg('mean').reset_index()
tailnum.rename(columns={'target': 'tailnum_empirical_mean'}, inplace=True)
tailnum.head()

Unnamed: 0,TailNum,tailnum_empirical_mean
0,80009E,0.214626
1,80019E,0.208145
2,80059E,0.217195
3,80129E,0.221726
4,80139E,0.201238


In [35]:
x_train = pd.merge(x_train, season, on=['season'])
x_train = pd.merge(x_train, un_carrier, on=['UniqueCarrier'])
x_train = pd.merge(x_train, origin, on=['Origin'])
x_train = pd.merge(x_train, dest, on=['Dest'])
x_train = pd.merge(x_train, tailnum, on=['TailNum'])
x_train = pd.merge(x_train, flight_volume, on=['TailNum'])
x_train = pd.merge(x_train, month_mean, on=['Month'])
x_train = pd.merge(x_train, day_mean, on=['DayOfWeek'])
x_train = pd.merge(x_train, flight_c_volume, on=['UniqueCarrier'])

x_train.shape

(1028281, 32)

In [36]:
x_train = shuffle(x_train)

Будем пользоваться только перечисленными ниже фичами

In [40]:
numeric_features = [
    'Month',
    'DayofMonth',
    'DayOfWeek',
    'DepTime',
    'ArrTime',
    'AirTime',
    'Distance',
    'TaxiIn',
    'TaxiOut'
]

numeric_features += ['FullTime',
                     'elapsed_time',
                     'diff_arr_dep_time',
                     'speed',
                     'season_empirical_mean',
                     'un_carrier_empirical_mean', 
                     'origin_empirical_mean', 
                     'dest_empirical_mean',
                     'tailnum_empirical_mean',
                     'flight_volume',
                     'day_mean_target',
                     'month_mean_target',
                     'flight_carrier_volume']

In [41]:
test2 = test.copy()

In [42]:
test = pd.merge(test, season, on=['season'], how='left')
test = pd.merge(test, un_carrier, on=['UniqueCarrier'], how='left')
test = pd.merge(test, origin, on=['Origin'], how='left')
test = pd.merge(test, dest, on=['Dest'], how='left')
test = pd.merge(test, tailnum, on=['TailNum'], how='left')
test = pd.merge(test, flight_volume, on=['TailNum'], how='left')
test = pd.merge(test, month_mean, on=['Month'], how='left')
test = pd.merge(test, day_mean, on=['DayOfWeek'], how='left')
test = pd.merge(test, flight_c_volume, on=['UniqueCarrier'], how='left')

test.shape

(3504864, 37)

In [43]:
x_train = x_train.loc[:, numeric_features + ['target']]
test = test.loc[:, numeric_features]
print(x_train.shape, test.shape)
x_train.head()

(1028281, 23) (3504864, 22)


Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,AirTime,Distance,TaxiIn,TaxiOut,FullTime,...,season_empirical_mean,un_carrier_empirical_mean,origin_empirical_mean,dest_empirical_mean,tailnum_empirical_mean,flight_volume,day_mean_target,month_mean_target,flight_carrier_volume,target
140157,10,20,1,1849.0,1920.0,139.0,935,6.0,6.0,31,...,,,-1.06872,-1.039568,0.307787,0.489144,-0.83596,-1.37876,1.0,0
950143,11,11,2,1706.0,1932.0,50.0,264,27.0,9.0,146,...,,,-1.177351,-0.965937,0.268166,0.349085,-0.983972,-1.287725,0.370244,0
655384,3,29,6,1421.0,1557.0,143.0,866,2.0,11.0,96,...,,,-1.079229,-1.228544,0.276404,0.277139,-0.947283,-0.612841,0.301651,0
538960,8,29,5,947.0,1211.0,128.0,919,4.0,12.0,144,...,,,-0.763227,-0.883898,0.285505,0.417625,-0.681037,-0.919968,0.212929,0
804776,7,7,1,2138.0,549.0,293.0,2466,6.0,12.0,491,...,,,-1.226517,-1.217491,0.272277,0.256279,-0.83596,-0.791903,0.285397,0


### Кросс валидация

Обычно её делают так

In [44]:
cv = cross_val_score(estimator=LogisticRegression(), 
                X=x_train.drop('target', axis=1), 
                y=x_train.target.values,
                cv=5, # you may use 3 or 5
                scoring='roc_auc', 
                n_jobs=-1,)
print(cv.mean(), cv.std())
print(cv)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
A:\Anaconda3\lib\runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
A:\Anaconda3\lib\runpy.py in _run_code(code=<code object <module> at 0x000001C59263DC90, fil...lib\site-packages\ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': r'A:\Anaconda3\lib\site-packages\__pycache__\ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': r'A:\Anaconda3\lib\site-packages\ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from 'A:\\Anaconda3\\lib\\site-packages\\ipykernel\\kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x000001C59263DC90, fil...lib\site-packages\ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': r'A:\Anaconda3\lib\site-packages\__pycache__\ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': r'A:\Anaconda3\lib\site-packages\ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from 'A:\\Anaconda3\\lib\\site-packages\\ipykernel\\kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
A:\Anaconda3\lib\site-packages\ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
A:\Anaconda3\lib\site-packages\traitlets\config\application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
A:\Anaconda3\lib\site-packages\ipykernel\kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    472             return self.subapp.start()
    473         if self.poller is not None:
    474             self.poller.start()
    475         self.kernel.start()
    476         try:
--> 477             ioloop.IOLoop.instance().start()
    478         except KeyboardInterrupt:
    479             pass
    480 
    481 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
A:\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
A:\Anaconda3\lib\site-packages\tornado\ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    883                 self._events.update(event_pairs)
    884                 while self._events:
    885                     fd, events = self._events.popitem()
    886                     try:
    887                         fd_obj, handler_func = self._handlers[fd]
--> 888                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    889                     except (OSError, IOError) as e:
    890                         if errno_from_exception(e) == errno.EPIPE:
    891                             # Happens when the client closes the connection
    892                             pass

...........................................................................
A:\Anaconda3\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
A:\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
A:\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
A:\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
A:\Anaconda3\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
A:\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
A:\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'cv = cross_val_score(estimator=LogisticRegressio... n_jobs=-1,)\nprint(cv.mean(), cv.std())\nprint(cv)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 2, 24, 18, 55, 57, 492250, tzinfo=tzutc()), 'msg_id': '477830FF1DBF499885B7307851B82494', 'msg_type': 'execute_request', 'session': '6D8CFF996F07448DB708A867FE8C7AC1', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '477830FF1DBF499885B7307851B82494', 'msg_type': 'execute_request', 'parent_header': {}})
    230             self.log.warn("Unknown message type: %r", msg_type)
    231         else:
    232             self.log.debug("%s: %s", msg_type, msg)
    233             self.pre_handler_hook()
    234             try:
--> 235                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'6D8CFF996F07448DB708A867FE8C7AC1']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'cv = cross_val_score(estimator=LogisticRegressio... n_jobs=-1,)\nprint(cv.mean(), cv.std())\nprint(cv)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 2, 24, 18, 55, 57, 492250, tzinfo=tzutc()), 'msg_id': '477830FF1DBF499885B7307851B82494', 'msg_type': 'execute_request', 'session': '6D8CFF996F07448DB708A867FE8C7AC1', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '477830FF1DBF499885B7307851B82494', 'msg_type': 'execute_request', 'parent_header': {}}
    236             except Exception:
    237                 self.log.error("Exception in message handler:", exc_info=True)
    238             finally:
    239                 self.post_handler_hook()

...........................................................................
A:\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'6D8CFF996F07448DB708A867FE8C7AC1'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'cv = cross_val_score(estimator=LogisticRegressio... n_jobs=-1,)\nprint(cv.mean(), cv.std())\nprint(cv)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 2, 24, 18, 55, 57, 492250, tzinfo=tzutc()), 'msg_id': '477830FF1DBF499885B7307851B82494', 'msg_type': 'execute_request', 'session': '6D8CFF996F07448DB708A867FE8C7AC1', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '477830FF1DBF499885B7307851B82494', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
A:\Anaconda3\lib\site-packages\ipykernel\ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='cv = cross_val_score(estimator=LogisticRegressio... n_jobs=-1,)\nprint(cv.mean(), cv.std())\nprint(cv)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'cv = cross_val_score(estimator=LogisticRegressio... n_jobs=-1,)\nprint(cv.mean(), cv.std())\nprint(cv)'
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
A:\Anaconda3\lib\site-packages\ipykernel\zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('cv = cross_val_score(estimator=LogisticRegressio... n_jobs=-1,)\nprint(cv.mean(), cv.std())\nprint(cv)',), **kwargs={'silent': False, 'store_history': True})
    528             )
    529         self.payload_manager.write_payload(payload)
    530 
    531     def run_cell(self, *args, **kwargs):
    532         self._last_traceback = None
--> 533         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('cv = cross_val_score(estimator=LogisticRegressio... n_jobs=-1,)\nprint(cv.mean(), cv.std())\nprint(cv)',)
        kwargs = {'silent': False, 'store_history': True}
    534 
    535     def _showtraceback(self, etype, evalue, stb):
    536         # try to preserve ordering of tracebacks and print statements
    537         sys.stdout.flush()

...........................................................................
A:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='cv = cross_val_score(estimator=LogisticRegressio... n_jobs=-1,)\nprint(cv.mean(), cv.std())\nprint(cv)', store_history=True, silent=False, shell_futures=True)
   2693                 self.displayhook.exec_result = result
   2694 
   2695                 # Execute the user code
   2696                 interactivity = "none" if silent else self.ast_node_interactivity
   2697                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2698                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2699                 
   2700                 self.last_execution_succeeded = not has_raised
   2701 
   2702                 # Reset this so later displayed values do not modify the

...........................................................................
A:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Expr object>, <_ast.Expr object>], cell_name='<ipython-input-44-f5a234f2c534>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 1c581f23550, executio..._before_exec=None error_in_exec=None result=None>)
   2797 
   2798         try:
   2799             for i, node in enumerate(to_run_exec):
   2800                 mod = ast.Module([node])
   2801                 code = compiler(mod, cell_name, "exec")
-> 2802                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x000001C581CF9C00, file "<ipython-input-44-f5a234f2c534>", line 1>
        result = <ExecutionResult object at 1c581f23550, executio..._before_exec=None error_in_exec=None result=None>
   2803                     return True
   2804 
   2805             for i, node in enumerate(to_run_interactive):
   2806                 mod = ast.Interactive([node])

...........................................................................
A:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x000001C581CF9C00, file "<ipython-input-44-f5a234f2c534>", line 1>, result=<ExecutionResult object at 1c581f23550, executio..._before_exec=None error_in_exec=None result=None>)
   2857         outflag = True  # happens in more places, so it's easier as default
   2858         try:
   2859             try:
   2860                 self.hooks.pre_run_code_hook()
   2861                 #rprint('Running code', repr(code_obj)) # dbg
-> 2862                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x000001C581CF9C00, file "<ipython-input-44-f5a234f2c534>", line 1>
        self.user_global_ns = {'In': ['', 'import pandas as pd\nimport numpy as np\nfrom skle...rt MinMaxScaler\nfrom sklearn.utils import shuffle', 'scaler = MinMaxScaler()', "df = pd.read_csv('train_dataset.csv')\ndf = df[(~...t.Id.values\ntest.drop('Id', axis=1, inplace=True)", "df['speed'] = (60 * df.Distance/df.AirTime).repl...st.AirTime).replace([np.nan, np.inf, -np.inf], 0)", "df['diff_arr_dep_time'] = df.ArrTime - df.DepTim...diff_arr_dep_time'] = test.ArrTime - test.DepTime", "df['elapsed_time'] = df.ArrTime + df.TaxiIn + df...ime'] = test.ArrTime + test.TaxiIn + test.TaxiOut", "def season(x):\n    if x in [12, 1, 2]:\n        r...'season'] = test.Month.apply(lambda x: season(x))", "def timefl(x):\n    begin = x['DepTime']\n    end ... 1)\ntest['FullTime'] = df.apply(timefl, axis = 1)", 'df = shuffle(df)\n\nx_count, x_train = train_test_...dom_state=42)\nprint(x_count.shape, x_train.shape)', 'import math\ndef logit(x):\n    return math.log(x ...   return logit(v.mean() + 0.00001) # guess why ?', "day_mean = df.loc[:, ['DayOfWeek', 'target']].gr...'day_mean_target'}, inplace=True)\nday_mean.head()", "month_mean = df.loc[:, ['Month', 'target']].grou...th_mean_target'}, inplace=True)\nmonth_mean.head()", 'import warnings\nwarnings.filterwarnings("ignore")', "flight_c_volume = df.loc[:, ['target', 'UniqueCa...er_volume'].reshape(-1,1))\nflight_c_volume.head()", "flight_volume = df.loc[:, ['target', 'TailNum']]...ight_volume'].reshape(-1,1))\nflight_volume.head()", "season = x_count.loc[:, ['target', 'season']].gr...son_empirical_mean'}, inplace=True)\nseason.head()", "un_carrier = x_count.loc[:, ['target', 'UniqueCa...empirical_mean'}, inplace=True)\nun_carrier.head()", "origin = x_count.loc[:, ['target', 'Origin']].gr...gin_empirical_mean'}, inplace=True)\norigin.head()", "dest = x_count.loc[:, ['target', 'Dest']].groupb...'dest_empirical_mean'}, inplace=True)\ndest.head()", ...], 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'MinMaxScaler': <class 'sklearn.preprocessing.data.MinMaxScaler'>, 'Out': {11:    DayOfWeek  day_mean_target
0          1      ... 4        -0.840437
4          5        -0.681012, 12:    Month  month_mean_target
0      1          -0... 4          -0.947671
4      5          -0.985007, 14:   UniqueCarrier  flight_carrier_volume
0        ...  0.119449
4            B6               0.156475, 15:   TailNum  flight_volume
0  80009E       0.39037...3  80129E       0.407407
4  80139E       0.398467, 16:    season  season_empirical_mean
0    fall      ...       -0.770938
3  winter              -0.562173, 17:   UniqueCarrier  un_carrier_empirical_mean
0    ...948899
4            B6                  -0.859630, 18: Empty DataFrame
Columns: [index]
Index: [], 19:   Dest  dest_empirical_mean
0  ABE            -0...            -0.737915
4  ACK            -0.514413, 20:   TailNum  tailnum_empirical_mean
0  80009E     ...       0.221726
4  80139E                0.201238, 25:    DayOfWeek  day_mean_target
0          1      ... 4        -0.840462
4          5        -0.681037, ...}, '_':         Month  DayofMonth  DayOfWeek  DepTime  A...        0.285397       0  

[5 rows x 23 columns], '_11':    DayOfWeek  day_mean_target
0          1      ... 4        -0.840437
4          5        -0.681012, '_12':    Month  month_mean_target
0      1          -0... 4          -0.947671
4      5          -0.985007, '_14':   UniqueCarrier  flight_carrier_volume
0        ...  0.119449
4            B6               0.156475, '_15':   TailNum  flight_volume
0  80009E       0.39037...3  80129E       0.407407
4  80139E       0.398467, '_16':    season  season_empirical_mean
0    fall      ...       -0.770938
3  winter              -0.562173, ...}
        self.user_ns = {'In': ['', 'import pandas as pd\nimport numpy as np\nfrom skle...rt MinMaxScaler\nfrom sklearn.utils import shuffle', 'scaler = MinMaxScaler()', "df = pd.read_csv('train_dataset.csv')\ndf = df[(~...t.Id.values\ntest.drop('Id', axis=1, inplace=True)", "df['speed'] = (60 * df.Distance/df.AirTime).repl...st.AirTime).replace([np.nan, np.inf, -np.inf], 0)", "df['diff_arr_dep_time'] = df.ArrTime - df.DepTim...diff_arr_dep_time'] = test.ArrTime - test.DepTime", "df['elapsed_time'] = df.ArrTime + df.TaxiIn + df...ime'] = test.ArrTime + test.TaxiIn + test.TaxiOut", "def season(x):\n    if x in [12, 1, 2]:\n        r...'season'] = test.Month.apply(lambda x: season(x))", "def timefl(x):\n    begin = x['DepTime']\n    end ... 1)\ntest['FullTime'] = df.apply(timefl, axis = 1)", 'df = shuffle(df)\n\nx_count, x_train = train_test_...dom_state=42)\nprint(x_count.shape, x_train.shape)', 'import math\ndef logit(x):\n    return math.log(x ...   return logit(v.mean() + 0.00001) # guess why ?', "day_mean = df.loc[:, ['DayOfWeek', 'target']].gr...'day_mean_target'}, inplace=True)\nday_mean.head()", "month_mean = df.loc[:, ['Month', 'target']].grou...th_mean_target'}, inplace=True)\nmonth_mean.head()", 'import warnings\nwarnings.filterwarnings("ignore")', "flight_c_volume = df.loc[:, ['target', 'UniqueCa...er_volume'].reshape(-1,1))\nflight_c_volume.head()", "flight_volume = df.loc[:, ['target', 'TailNum']]...ight_volume'].reshape(-1,1))\nflight_volume.head()", "season = x_count.loc[:, ['target', 'season']].gr...son_empirical_mean'}, inplace=True)\nseason.head()", "un_carrier = x_count.loc[:, ['target', 'UniqueCa...empirical_mean'}, inplace=True)\nun_carrier.head()", "origin = x_count.loc[:, ['target', 'Origin']].gr...gin_empirical_mean'}, inplace=True)\norigin.head()", "dest = x_count.loc[:, ['target', 'Dest']].groupb...'dest_empirical_mean'}, inplace=True)\ndest.head()", ...], 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'MinMaxScaler': <class 'sklearn.preprocessing.data.MinMaxScaler'>, 'Out': {11:    DayOfWeek  day_mean_target
0          1      ... 4        -0.840437
4          5        -0.681012, 12:    Month  month_mean_target
0      1          -0... 4          -0.947671
4      5          -0.985007, 14:   UniqueCarrier  flight_carrier_volume
0        ...  0.119449
4            B6               0.156475, 15:   TailNum  flight_volume
0  80009E       0.39037...3  80129E       0.407407
4  80139E       0.398467, 16:    season  season_empirical_mean
0    fall      ...       -0.770938
3  winter              -0.562173, 17:   UniqueCarrier  un_carrier_empirical_mean
0    ...948899
4            B6                  -0.859630, 18: Empty DataFrame
Columns: [index]
Index: [], 19:   Dest  dest_empirical_mean
0  ABE            -0...            -0.737915
4  ACK            -0.514413, 20:   TailNum  tailnum_empirical_mean
0  80009E     ...       0.221726
4  80139E                0.201238, 25:    DayOfWeek  day_mean_target
0          1      ... 4        -0.840462
4          5        -0.681037, ...}, '_':         Month  DayofMonth  DayOfWeek  DepTime  A...        0.285397       0  

[5 rows x 23 columns], '_11':    DayOfWeek  day_mean_target
0          1      ... 4        -0.840437
4          5        -0.681012, '_12':    Month  month_mean_target
0      1          -0... 4          -0.947671
4      5          -0.985007, '_14':   UniqueCarrier  flight_carrier_volume
0        ...  0.119449
4            B6               0.156475, '_15':   TailNum  flight_volume
0  80009E       0.39037...3  80129E       0.407407
4  80139E       0.398467, '_16':    season  season_empirical_mean
0    fall      ...       -0.770938
3  winter              -0.562173, ...}
   2863             finally:
   2864                 # Reset our crash handler in place
   2865                 sys.excepthook = old_excepthook
   2866         except SystemExit as e:

...........................................................................
C:\Users\Даниил\<ipython-input-44-f5a234f2c534> in <module>()
      1 cv = cross_val_score(estimator=LogisticRegression(), 
      2                 X=x_train.drop('target', axis=1), 
      3                 y=x_train.target.values,
      4                 cv=5, # you may use 3 or 5
      5                 scoring='roc_auc', 
----> 6                 n_jobs=-1,)
      7 print(cv.mean(), cv.std())
      8 print(cv)

...........................................................................
A:\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator=LogisticRegression(C=1.0, class_weight=None, dua...ol=0.0001,
          verbose=0, warm_start=False), X=         Month  DayofMonth  DayOfWeek  DepTime  ...          0.372204  

[1028281 rows x 22 columns], y=array([0, 0, 0, ..., 0, 0, 0], dtype=int64), groups=None, scoring='roc_auc', cv=5, n_jobs=-1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs')
    316     cv_results = cross_validate(estimator=estimator, X=X, y=y, groups=groups,
    317                                 scoring={'score': scorer}, cv=cv,
    318                                 return_train_score=False,
    319                                 n_jobs=n_jobs, verbose=verbose,
    320                                 fit_params=fit_params,
--> 321                                 pre_dispatch=pre_dispatch)
        pre_dispatch = '2*n_jobs'
    322     return cv_results['test_score']
    323 
    324 
    325 def _fit_and_score(estimator, X, y, scorer, train, test, verbose,

...........................................................................
A:\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator=LogisticRegression(C=1.0, class_weight=None, dua...ol=0.0001,
          verbose=0, warm_start=False), X=         Month  DayofMonth  DayOfWeek  DepTime  ...          0.372204  

[1028281 rows x 22 columns], y=array([0, 0, 0, ..., 0, 0, 0], dtype=int64), groups=None, scoring={'score': make_scorer(roc_auc_score, needs_threshold=True)}, cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False), n_jobs=-1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', return_train_score=False)
    190     scores = parallel(
    191         delayed(_fit_and_score)(
    192             clone(estimator), X, y, scorers, train, test, verbose, None,
    193             fit_params, return_train_score=return_train_score,
    194             return_times=True)
--> 195         for train, test in cv.split(X, y, groups))
        cv.split = <bound method StratifiedKFold.split of Stratifie...ld(n_splits=5, random_state=None, shuffle=False)>
        X =          Month  DayofMonth  DayOfWeek  DepTime  ...          0.372204  

[1028281 rows x 22 columns]
        y = array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
        groups = None
    196 
    197     if return_train_score:
    198         train_scores, test_scores, fit_times, score_times = zip(*scores)
    199         train_scores = _aggregate_score_dicts(train_scores)

...........................................................................
A:\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object cross_validate.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Sat Feb 24 21:57:28 2018
PID: 13836                            Python 3.6.2: A:\Anaconda3\python.exe
...........................................................................
A:\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (LogisticRegression(C=1.0, class_weight=None, dua...ol=0.0001,
          verbose=0, warm_start=False),          Month  DayofMonth  DayOfWeek  DepTime  ...          0.372204  

[1028281 rows x 22 columns], memmap([0, 0, 0, ..., 0, 0, 0], dtype=int64), {'score': make_scorer(roc_auc_score, needs_threshold=True)}, memmap([ 204912,  204921,  204923, ..., 1028278, 1028279, 1028280]), array([     0,      1,      2, ..., 205944, 205945, 205946]), 0, None, None), {'return_times': True, 'return_train_score': False})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
A:\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (LogisticRegression(C=1.0, class_weight=None, dua...ol=0.0001,
          verbose=0, warm_start=False),          Month  DayofMonth  DayOfWeek  DepTime  ...          0.372204  

[1028281 rows x 22 columns], memmap([0, 0, 0, ..., 0, 0, 0], dtype=int64), {'score': make_scorer(roc_auc_score, needs_threshold=True)}, memmap([ 204912,  204921,  204923, ..., 1028278, 1028279, 1028280]), array([     0,      1,      2, ..., 205944, 205945, 205946]), 0, None, None)
        kwargs = {'return_times': True, 'return_train_score': False}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
A:\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator=LogisticRegression(C=1.0, class_weight=None, dua...ol=0.0001,
          verbose=0, warm_start=False), X=         Month  DayofMonth  DayOfWeek  DepTime  ...          0.372204  

[1028281 rows x 22 columns], y=memmap([0, 0, 0, ..., 0, 0, 0], dtype=int64), scorer={'score': make_scorer(roc_auc_score, needs_threshold=True)}, train=memmap([ 204912,  204921,  204923, ..., 1028278, 1028279, 1028280]), test=array([     0,      1,      2, ..., 205944, 205945, 205946]), verbose=0, parameters=None, fit_params={}, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=True, error_score='raise')
    432 
    433     try:
    434         if y_train is None:
    435             estimator.fit(X_train, **fit_params)
    436         else:
--> 437             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method LogisticRegression.fit of Logistic...l=0.0001,
          verbose=0, warm_start=False)>
        X_train =         Month  DayofMonth  DayOfWeek  DepTime  A...           0.372204  

[822624 rows x 22 columns]
        y_train = memmap([1, 1, 1, ..., 0, 0, 0], dtype=int64)
        fit_params = {}
    438 
    439     except Exception as e:
    440         # Note fit time as time until error
    441         fit_time = time.time() - start_time

...........................................................................
A:\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self=LogisticRegression(C=1.0, class_weight=None, dua...ol=0.0001,
          verbose=0, warm_start=False), X=        Month  DayofMonth  DayOfWeek  DepTime  A...           0.372204  

[822624 rows x 22 columns], y=memmap([1, 1, 1, ..., 0, 0, 0], dtype=int64), sample_weight=None)
   1211             _dtype = [np.float64, np.float32]
   1212         else:
   1213             _dtype = np.float64
   1214 
   1215         X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,
-> 1216                          order="C")
   1217         check_classification_targets(y)
   1218         self.classes_ = np.unique(y)
   1219         n_samples, n_features = X.shape
   1220 

...........................................................................
A:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X=        Month  DayofMonth  DayOfWeek  DepTime  A...           0.372204  

[822624 rows x 22 columns], y=memmap([1, 1, 1, ..., 0, 0, 0], dtype=int64), accept_sparse='csr', dtype=<class 'numpy.float64'>, order='C', copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, warn_on_dtype=False, estimator=None)
    537     y_converted : object
    538         The converted and validated y.
    539     """
    540     X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
    541                     ensure_2d, allow_nd, ensure_min_samples,
--> 542                     ensure_min_features, warn_on_dtype, estimator)
        ensure_min_features = 1
        warn_on_dtype = False
        estimator = None
    543     if multi_output:
    544         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
    545                         dtype=None)
    546     else:

...........................................................................
A:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array=array([[  1.        ,  27.        ,   7.        ...0.68103685,
         -0.56387301,   0.37220361]]), accept_sparse='csr', dtype=<class 'numpy.float64'>, order='C', copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, warn_on_dtype=False, estimator=None)
    417             array = array.astype(np.float64)
    418         if not allow_nd and array.ndim >= 3:
    419             raise ValueError("Found array with dim %d. %s expected <= 2."
    420                              % (array.ndim, estimator_name))
    421         if force_all_finite:
--> 422             _assert_all_finite(array)
        array = array([[  1.        ,  27.        ,   7.        ...0.68103685,
         -0.56387301,   0.37220361]])
    423 
    424     shape_repr = _shape_repr(array.shape)
    425     if ensure_min_samples > 0:
    426         n_samples = _num_samples(array)

...........................................................................
A:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X=array([[  1.        ,  27.        ,   7.        ...0.68103685,
         -0.56387301,   0.37220361]]))
     38     # everything is finite; fall back to O(n) space np.isfinite to prevent
     39     # false positives from overflow in sum method.
     40     if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum())
     41             and not np.isfinite(X).all()):
     42         raise ValueError("Input contains NaN, infinity"
---> 43                          " or a value too large for %r." % X.dtype)
        X.dtype = dtype('float64')
     44 
     45 
     46 def assert_all_finite(X):
     47     """Throw a ValueError if X contains NaN or infinity.

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
___________________________________________________________________________

### Обучение модели

In [35]:
test.fillna(0, inplace=True)

In [36]:
clf = LogisticRegression(C=1.0)
clf.fit(x_train.drop('target', axis=1), x_train.target.values)
submission = clf.predict_proba(test)[:, 1]

In [37]:
test_ids.shape

(3504864,)

In [38]:
submission.shape

(3504864,)

In [39]:
pd.DataFrame({'Id': test_ids, 'Prediction1': submission}).to_csv('mean_target_mining_2.csv', index=False)