# import lib

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

from tqdm.notebook import tqdm

from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.utils.class_weight import compute_class_weight

from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import keras 
from keras.models import Sequential  # Для создания и обучения нейронных сетей
from keras.layers import LSTM, Dense, Dropout, SimpleRNN, Conv1D, Flatten, MaxPooling1D, BatchNormalization
from keras.preprocessing.sequence import pad_sequences
import keras_tuner as kt
from keras.optimizers import Adam

from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')

# data opening

In [2]:
train = pd.read_csv("train.tsv", sep='\t').drop(['id'], axis =1)
test = pd.read_csv("test.tsv", sep='\t')

In [3]:
train

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x992,x993,x994,x995,x996,x997,x998,x999,x1000,y
0,80,?,-10,-20,-10,150,?,-510,?,-10,...,20,60,-10,50,?,-50,?,-3860,-270,P
1,70,20,-20,?,-10,?,360,-440,-400,?,...,20,?,-90,50,?,-60,0,-3250,-750,P
2,80,?,20,-10,10,?,190,-500,880,?,...,10,?,?,60,-1310,-40,0,-3410,-630,N
3,60,?,?,-40,?,?,580,?,-200,-10,...,20,60,40,60,?,-50,?,-3020,?,P
4,70,?,?,-10,0,230,?,30,390,0,...,20,?,-60,?,-1410,-50,-40,-3500,-420,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,70,80,?,0,0,180,180,-600,590,?,...,20,60,?,40,-1320,-40,-20,-3510,?,N
19996,70,90,?,0,0,140,540,-380,?,-10,...,10,?,-40,50,?,?,-20,-3420,-510,N
19997,60,?,?,40,10,180,320,?,?,?,...,10,?,?,?,-1050,?,?,-3250,-520,N
19998,?,?,-30,0,?,200,300,?,?,0,...,10,70,?,40,?,-60,-40,-3500,-850,N


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Columns: 1001 entries, x1 to y
dtypes: object(1001)
memory usage: 152.7+ MB


In [5]:
Counter(train.y)

Counter({'N': 14085, 'P': 5915})

In [6]:
test

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x991,x992,x993,x994,x995,x996,x997,x998,x999,x1000
0,20001,70,90,-100,?,-10,?,260,-370,-280,...,?,?,10,140,50,?,-60,?,-3200,-950
1,20002,60,40,-40,-20,-10,110,330,?,?,...,?,20,40,?,?,-1220,-50,?,-3560,-920
2,20003,80,60,?,-60,0,?,360,-540,?,...,?,20,?,100,?,-1170,-50,-10,?,?
3,20004,80,90,40,10,?,190,490,-380,?,...,?,20,20,-40,40,?,?,-40,?,?
4,20005,80,70,40,?,?,70,470,-340,630,...,50,10,30,20,40,-1700,-60,-20,?,-420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12340,32341,60,?,?,20,?,240,430,?,340,...,50,10,70,50,50,-1390,?,-10,-2890,-490
12341,32342,?,50,-30,0,20,80,360,?,-180,...,?,?,?,?,50,?,-50,?,-3010,?
12342,32343,70,?,30,?,10,170,190,-570,310,...,50,10,20,-150,?,?,-50,?,-3120,-710
12343,32344,70,20,-60,-40,-30,210,560,?,-540,...,?,?,?,-10,60,-1630,?,?,-3700,-570


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12345 entries, 0 to 12344
Columns: 1001 entries, id to x1000
dtypes: int64(1), object(1000)
memory usage: 94.3+ MB


In [8]:
for i in tqdm(range(1, 1001)):
    train[f'x{i}'] = train[f'x{i}'].apply(lambda x: x if x != '?' else np.nan).astype(float)
    train[f'x{i}'] = train[f'x{i}'].fillna(train[f'x{i}'].mean())
train

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x992,x993,x994,x995,x996,x997,x998,x999,x1000,y
0,80.000000,60.793651,-10.000000,-20.000000,-10.000000,150.000000,385.874377,-510.000000,53.733647,-10.000,...,20.0,60.000000,-10.000000,50.000000,-1260.99042,-50.000000,-18.394139,-3860.0,-270.000000,P
1,70.000000,20.000000,-20.000000,-7.420088,-10.000000,123.654212,360.000000,-440.000000,-400.000000,-4.158,...,20.0,40.226641,-90.000000,50.000000,-1260.99042,-60.000000,0.000000,-3250.0,-750.000000,P
2,80.000000,60.793651,20.000000,-10.000000,10.000000,123.654212,190.000000,-500.000000,880.000000,-4.158,...,10.0,40.226641,16.217575,60.000000,-1310.00000,-40.000000,0.000000,-3410.0,-630.000000,N
3,60.000000,60.793651,5.645222,-40.000000,-2.613335,123.654212,580.000000,-415.535701,-200.000000,-10.000,...,20.0,60.000000,40.000000,60.000000,-1260.99042,-50.000000,-18.394139,-3020.0,-650.198583,P
4,70.000000,60.793651,5.645222,-10.000000,0.000000,230.000000,385.874377,30.000000,390.000000,0.000,...,20.0,40.226641,-60.000000,48.734345,-1410.00000,-50.000000,-40.000000,-3500.0,-420.000000,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,70.000000,80.000000,5.645222,0.000000,0.000000,180.000000,180.000000,-600.000000,590.000000,-4.158,...,20.0,60.000000,16.217575,40.000000,-1320.00000,-40.000000,-20.000000,-3510.0,-650.198583,N
19996,70.000000,90.000000,5.645222,0.000000,0.000000,140.000000,540.000000,-380.000000,53.733647,-10.000,...,10.0,40.226641,-40.000000,50.000000,-1260.99042,-50.806794,-20.000000,-3420.0,-510.000000,N
19997,60.000000,60.793651,5.645222,40.000000,10.000000,180.000000,320.000000,-415.535701,53.733647,-4.158,...,10.0,40.226641,16.217575,48.734345,-1050.00000,-50.806794,-18.394139,-3250.0,-520.000000,N
19998,71.792725,60.793651,-30.000000,0.000000,-2.613335,200.000000,300.000000,-415.535701,53.733647,0.000,...,10.0,70.000000,16.217575,40.000000,-1260.99042,-60.000000,-40.000000,-3500.0,-850.000000,N


In [9]:
for i in tqdm(range(1, 1001)):
    test[f'x{i}'] = test[f'x{i}'].apply(lambda x: x if x != '?' else np.nan).astype(float)
    test[f'x{i}'] = test[f'x{i}'].fillna(test[f'x{i}'].mean())
test

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x991,x992,x993,x994,x995,x996,x997,x998,x999,x1000
0,20001,70.000000,90.000000,-100.000000,-7.692767,-10.00000,122.507357,260.000000,-370.000000,-280.000000,...,51.832335,15.8103,10.000000,140.000000,50.000000,-1260.440221,-60.00000,-18.293118,-3200.000000,-950.000000
1,20002,60.000000,40.000000,-40.000000,-20.000000,-10.00000,110.000000,330.000000,-412.937313,59.390404,...,51.832335,20.0000,40.000000,15.435328,48.894787,-1220.000000,-50.00000,-18.293118,-3560.000000,-920.000000
2,20003,80.000000,60.000000,6.902077,-60.000000,0.00000,122.507357,360.000000,-540.000000,59.390404,...,51.832335,20.0000,39.960365,100.000000,48.894787,-1170.000000,-50.00000,-10.000000,-3229.857769,-655.992307
3,20004,80.000000,90.000000,40.000000,10.000000,-2.52149,190.000000,490.000000,-380.000000,59.390404,...,51.832335,20.0000,20.000000,-40.000000,40.000000,-1260.440221,-50.79482,-40.000000,-3229.857769,-655.992307
4,20005,80.000000,70.000000,40.000000,-7.692767,-2.52149,70.000000,470.000000,-340.000000,630.000000,...,50.000000,10.0000,30.000000,20.000000,40.000000,-1700.000000,-60.00000,-20.000000,-3229.857769,-420.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12340,32341,60.000000,60.830025,6.902077,20.000000,-2.52149,240.000000,430.000000,-412.937313,340.000000,...,50.000000,10.0000,70.000000,50.000000,50.000000,-1390.000000,-50.79482,-10.000000,-2890.000000,-490.000000
12341,32342,71.693249,50.000000,-30.000000,0.000000,20.00000,80.000000,360.000000,-412.937313,-180.000000,...,51.832335,15.8103,39.960365,15.435328,50.000000,-1260.440221,-50.00000,-18.293118,-3010.000000,-655.992307
12342,32343,70.000000,60.830025,30.000000,-7.692767,10.00000,170.000000,190.000000,-570.000000,310.000000,...,50.000000,10.0000,20.000000,-150.000000,48.894787,-1260.440221,-50.00000,-18.293118,-3120.000000,-710.000000
12343,32344,70.000000,20.000000,-60.000000,-40.000000,-30.00000,210.000000,560.000000,-412.937313,-540.000000,...,51.832335,15.8103,39.960365,-10.000000,60.000000,-1630.000000,-50.79482,-18.293118,-3700.000000,-570.000000


In [10]:
le =LabelEncoder()
train['y'] = le.fit_transform(train.y)
train

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x992,x993,x994,x995,x996,x997,x998,x999,x1000,y
0,80.000000,60.793651,-10.000000,-20.000000,-10.000000,150.000000,385.874377,-510.000000,53.733647,-10.000,...,20.0,60.000000,-10.000000,50.000000,-1260.99042,-50.000000,-18.394139,-3860.0,-270.000000,1
1,70.000000,20.000000,-20.000000,-7.420088,-10.000000,123.654212,360.000000,-440.000000,-400.000000,-4.158,...,20.0,40.226641,-90.000000,50.000000,-1260.99042,-60.000000,0.000000,-3250.0,-750.000000,1
2,80.000000,60.793651,20.000000,-10.000000,10.000000,123.654212,190.000000,-500.000000,880.000000,-4.158,...,10.0,40.226641,16.217575,60.000000,-1310.00000,-40.000000,0.000000,-3410.0,-630.000000,0
3,60.000000,60.793651,5.645222,-40.000000,-2.613335,123.654212,580.000000,-415.535701,-200.000000,-10.000,...,20.0,60.000000,40.000000,60.000000,-1260.99042,-50.000000,-18.394139,-3020.0,-650.198583,1
4,70.000000,60.793651,5.645222,-10.000000,0.000000,230.000000,385.874377,30.000000,390.000000,0.000,...,20.0,40.226641,-60.000000,48.734345,-1410.00000,-50.000000,-40.000000,-3500.0,-420.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,70.000000,80.000000,5.645222,0.000000,0.000000,180.000000,180.000000,-600.000000,590.000000,-4.158,...,20.0,60.000000,16.217575,40.000000,-1320.00000,-40.000000,-20.000000,-3510.0,-650.198583,0
19996,70.000000,90.000000,5.645222,0.000000,0.000000,140.000000,540.000000,-380.000000,53.733647,-10.000,...,10.0,40.226641,-40.000000,50.000000,-1260.99042,-50.806794,-20.000000,-3420.0,-510.000000,0
19997,60.000000,60.793651,5.645222,40.000000,10.000000,180.000000,320.000000,-415.535701,53.733647,-4.158,...,10.0,40.226641,16.217575,48.734345,-1050.00000,-50.806794,-18.394139,-3250.0,-520.000000,0
19998,71.792725,60.793651,-30.000000,0.000000,-2.613335,200.000000,300.000000,-415.535701,53.733647,0.000,...,10.0,70.000000,16.217575,40.000000,-1260.99042,-60.000000,-40.000000,-3500.0,-850.000000,0


In [11]:
X = train.drop(['y'], axis=1)
X

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x991,x992,x993,x994,x995,x996,x997,x998,x999,x1000
0,80.000000,60.793651,-10.000000,-20.000000,-10.000000,150.000000,385.874377,-510.000000,53.733647,-10.000,...,50.000000,20.0,60.000000,-10.000000,50.000000,-1260.99042,-50.000000,-18.394139,-3860.0,-270.000000
1,70.000000,20.000000,-20.000000,-7.420088,-10.000000,123.654212,360.000000,-440.000000,-400.000000,-4.158,...,51.877524,20.0,40.226641,-90.000000,50.000000,-1260.99042,-60.000000,0.000000,-3250.0,-750.000000
2,80.000000,60.793651,20.000000,-10.000000,10.000000,123.654212,190.000000,-500.000000,880.000000,-4.158,...,50.000000,10.0,40.226641,16.217575,60.000000,-1310.00000,-40.000000,0.000000,-3410.0,-630.000000
3,60.000000,60.793651,5.645222,-40.000000,-2.613335,123.654212,580.000000,-415.535701,-200.000000,-10.000,...,50.000000,20.0,60.000000,40.000000,60.000000,-1260.99042,-50.000000,-18.394139,-3020.0,-650.198583
4,70.000000,60.793651,5.645222,-10.000000,0.000000,230.000000,385.874377,30.000000,390.000000,0.000,...,50.000000,20.0,40.226641,-60.000000,48.734345,-1410.00000,-50.000000,-40.000000,-3500.0,-420.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,70.000000,80.000000,5.645222,0.000000,0.000000,180.000000,180.000000,-600.000000,590.000000,-4.158,...,51.877524,20.0,60.000000,16.217575,40.000000,-1320.00000,-40.000000,-20.000000,-3510.0,-650.198583
19996,70.000000,90.000000,5.645222,0.000000,0.000000,140.000000,540.000000,-380.000000,53.733647,-10.000,...,51.877524,10.0,40.226641,-40.000000,50.000000,-1260.99042,-50.806794,-20.000000,-3420.0,-510.000000
19997,60.000000,60.793651,5.645222,40.000000,10.000000,180.000000,320.000000,-415.535701,53.733647,-4.158,...,60.000000,10.0,40.226641,16.217575,48.734345,-1050.00000,-50.806794,-18.394139,-3250.0,-520.000000
19998,71.792725,60.793651,-30.000000,0.000000,-2.613335,200.000000,300.000000,-415.535701,53.733647,0.000,...,50.000000,10.0,70.000000,16.217575,40.000000,-1260.99042,-60.000000,-40.000000,-3500.0,-850.000000


In [12]:
# kbest = SelectKBest(f_classif, k=10)
# X = pd.DataFrame(data=kbest.fit_transform(X, train.y))
# X

In [13]:
# pca = PCA(n_components=900)
# X = pd.DataFrame(data=pca.fit_transform(X))
# X

In [14]:
scaler = StandardScaler()
# scaler = MinMaxScaler((0, 1))
# scaler = RobustScaler()
X = pd.DataFrame(scaler.fit_transform(X))
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,1.381947,0.000000,-4.567135e-01,-0.458116,-7.649544e-01,0.304677,0.000000,-6.460535e-01,0.000000,-1.439699,...,-5.826274e-01,1.032660,1.209963,-0.435156,1.318085e-01,0.000000,1.287858e-01,2.880684e-16,-1.517613,1.552813
1,-0.301860,-1.227218,-7.486323e-01,0.000000,-7.649544e-01,0.000000,-0.198807,-1.673145e-01,-0.879830,0.000000,...,-4.409868e-15,1.032660,0.000000,-1.762987,1.318085e-01,0.000000,-1.467480e+00,1.491471e+00,-0.038504,-0.407611
2,1.381947,0.000000,4.190430e-01,-0.093951,1.306222e+00,0.000000,-1.505011,-5.776622e-01,1.602204,0.000000,...,-5.826274e-01,-1.426701,0.000000,0.000000,1.173233e+00,-0.268038,1.725052e+00,1.491471e+00,-0.426467,0.082495
3,-1.985667,0.000000,-2.592760e-17,-1.186447,4.598936e-17,0.000000,1.491574,-3.887595e-16,-0.492012,-1.439699,...,-5.826274e-01,1.032660,1.209963,0.394738,1.173233e+00,0.000000,1.287858e-01,2.880684e-16,0.519192,0.000000
4,-0.301860,0.000000,-2.592760e-17,-0.093951,2.706340e-01,1.229841,0.000000,3.047076e+00,0.652050,1.024695,...,-5.826274e-01,1.032660,0.000000,-1.265051,7.399769e-16,-0.814949,1.287858e-01,-1.751891e+00,-0.644696,0.940181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,-0.301860,0.577795,-2.592760e-17,0.270214,2.706340e-01,0.651613,-1.581846,-1.261575e+00,1.039868,0.000000,...,-4.409868e-15,1.032660,1.209963,0.000000,-9.096164e-01,-0.322730,1.725052e+00,-1.302097e-01,-0.668944,0.000000
19996,-0.301860,0.878631,-2.592760e-17,0.270214,2.706340e-01,0.189032,1.184232,2.430332e-01,0.000000,-1.439699,...,-4.409868e-15,-1.426701,0.000000,-0.933093,1.318085e-01,0.000000,1.134215e-15,-1.302097e-01,-0.450715,0.572601
19997,-1.985667,0.000000,-2.592760e-17,1.726874,1.306222e+00,0.651613,-0.506149,-3.887595e-16,0.000000,0.000000,...,2.520541e+00,-1.426701,0.000000,0.000000,7.399769e-16,1.153929,1.134215e-15,2.880684e-16,-0.038504,0.531759
19998,0.000000,0.000000,-1.040551e+00,0.270214,4.598936e-17,0.882904,-0.659820,-3.887595e-16,0.000000,1.024695,...,-5.826274e-01,-1.426701,1.821879,0.000000,-9.096164e-01,0.000000,-1.467480e+00,-1.751891e+00,-0.644696,-0.816032


In [14]:
y = train.y
y

0        1
1        1
2        0
3        1
4        1
        ..
19995    0
19996    0
19997    0
19998    0
19999    1
Name: y, Length: 20000, dtype: int32

In [15]:
Counter(y)

Counter({0: 14085, 1: 5915})

In [16]:
weights = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
weights

array([0.70997515, 1.69061708])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# creating model

## base

In [3]:
for i in tqdm(range(1, 1001)):
    train[f'x{i}'] = train[f'x{i}'].apply(lambda x: x if x != '?' else np.nan).astype(float)
    train[f'x{i}'] = train[f'x{i}'].fillna(train[f'x{i}'].mean())
train

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x992,x993,x994,x995,x996,x997,x998,x999,x1000,y
0,80.000000,60.793651,-10.000000,-20.000000,-10.000000,150.000000,385.874377,-510.000000,53.733647,-10.000,...,20.0,60.000000,-10.000000,50.000000,-1260.99042,-50.000000,-18.394139,-3860.0,-270.000000,P
1,70.000000,20.000000,-20.000000,-7.420088,-10.000000,123.654212,360.000000,-440.000000,-400.000000,-4.158,...,20.0,40.226641,-90.000000,50.000000,-1260.99042,-60.000000,0.000000,-3250.0,-750.000000,P
2,80.000000,60.793651,20.000000,-10.000000,10.000000,123.654212,190.000000,-500.000000,880.000000,-4.158,...,10.0,40.226641,16.217575,60.000000,-1310.00000,-40.000000,0.000000,-3410.0,-630.000000,N
3,60.000000,60.793651,5.645222,-40.000000,-2.613335,123.654212,580.000000,-415.535701,-200.000000,-10.000,...,20.0,60.000000,40.000000,60.000000,-1260.99042,-50.000000,-18.394139,-3020.0,-650.198583,P
4,70.000000,60.793651,5.645222,-10.000000,0.000000,230.000000,385.874377,30.000000,390.000000,0.000,...,20.0,40.226641,-60.000000,48.734345,-1410.00000,-50.000000,-40.000000,-3500.0,-420.000000,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,70.000000,80.000000,5.645222,0.000000,0.000000,180.000000,180.000000,-600.000000,590.000000,-4.158,...,20.0,60.000000,16.217575,40.000000,-1320.00000,-40.000000,-20.000000,-3510.0,-650.198583,N
19996,70.000000,90.000000,5.645222,0.000000,0.000000,140.000000,540.000000,-380.000000,53.733647,-10.000,...,10.0,40.226641,-40.000000,50.000000,-1260.99042,-50.806794,-20.000000,-3420.0,-510.000000,N
19997,60.000000,60.793651,5.645222,40.000000,10.000000,180.000000,320.000000,-415.535701,53.733647,-4.158,...,10.0,40.226641,16.217575,48.734345,-1050.00000,-50.806794,-18.394139,-3250.0,-520.000000,N
19998,71.792725,60.793651,-30.000000,0.000000,-2.613335,200.000000,300.000000,-415.535701,53.733647,0.000,...,10.0,70.000000,16.217575,40.000000,-1260.99042,-60.000000,-40.000000,-3500.0,-850.000000,N


In [4]:
for i in tqdm(range(1, 1001)):
    test[f'x{i}'] = test[f'x{i}'].apply(lambda x: x if x != '?' else np.nan).astype(float)
    test[f'x{i}'] = test[f'x{i}'].fillna(test[f'x{i}'].mean())
test

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x991,x992,x993,x994,x995,x996,x997,x998,x999,x1000
0,20001,70.000000,90.000000,-100.000000,-7.692767,-10.00000,122.507357,260.000000,-370.000000,-280.000000,...,51.832335,15.8103,10.000000,140.000000,50.000000,-1260.440221,-60.00000,-18.293118,-3200.000000,-950.000000
1,20002,60.000000,40.000000,-40.000000,-20.000000,-10.00000,110.000000,330.000000,-412.937313,59.390404,...,51.832335,20.0000,40.000000,15.435328,48.894787,-1220.000000,-50.00000,-18.293118,-3560.000000,-920.000000
2,20003,80.000000,60.000000,6.902077,-60.000000,0.00000,122.507357,360.000000,-540.000000,59.390404,...,51.832335,20.0000,39.960365,100.000000,48.894787,-1170.000000,-50.00000,-10.000000,-3229.857769,-655.992307
3,20004,80.000000,90.000000,40.000000,10.000000,-2.52149,190.000000,490.000000,-380.000000,59.390404,...,51.832335,20.0000,20.000000,-40.000000,40.000000,-1260.440221,-50.79482,-40.000000,-3229.857769,-655.992307
4,20005,80.000000,70.000000,40.000000,-7.692767,-2.52149,70.000000,470.000000,-340.000000,630.000000,...,50.000000,10.0000,30.000000,20.000000,40.000000,-1700.000000,-60.00000,-20.000000,-3229.857769,-420.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12340,32341,60.000000,60.830025,6.902077,20.000000,-2.52149,240.000000,430.000000,-412.937313,340.000000,...,50.000000,10.0000,70.000000,50.000000,50.000000,-1390.000000,-50.79482,-10.000000,-2890.000000,-490.000000
12341,32342,71.693249,50.000000,-30.000000,0.000000,20.00000,80.000000,360.000000,-412.937313,-180.000000,...,51.832335,15.8103,39.960365,15.435328,50.000000,-1260.440221,-50.00000,-18.293118,-3010.000000,-655.992307
12342,32343,70.000000,60.830025,30.000000,-7.692767,10.00000,170.000000,190.000000,-570.000000,310.000000,...,50.000000,10.0000,20.000000,-150.000000,48.894787,-1260.440221,-50.00000,-18.293118,-3120.000000,-710.000000
12343,32344,70.000000,20.000000,-60.000000,-40.000000,-30.00000,210.000000,560.000000,-412.937313,-540.000000,...,51.832335,15.8103,39.960365,-10.000000,60.000000,-1630.000000,-50.79482,-18.293118,-3700.000000,-570.000000


In [5]:
X = train.drop(['y'], axis=1)
y = train.y

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [7]:
cat = CatBoostClassifier(iterations=1000000, od_wait=1000, task_type='CPU', eval_metric="TotalF1", use_best_model=True)
cat.fit(X_train, y_train, verbose=200, eval_set=(X_test, y_test))

Learning rate set to 0.003114
0:	learn: 0.8267241	test: 0.8218020	best: 0.8218020 (0)	total: 268ms	remaining: 3d 2h 29m 18s
200:	learn: 0.8928992	test: 0.8831961	best: 0.8839475 (186)	total: 11.4s	remaining: 15h 43m 3s
400:	learn: 0.9034078	test: 0.8943488	best: 0.8943488 (389)	total: 22.7s	remaining: 15h 44m 40s
600:	learn: 0.9102519	test: 0.8999232	best: 0.9001938 (591)	total: 33.5s	remaining: 15h 28m 42s
800:	learn: 0.9147312	test: 0.9037321	best: 0.9042126 (766)	total: 44.3s	remaining: 15h 20m 53s
1000:	learn: 0.9177955	test: 0.9059102	best: 0.9061784 (992)	total: 56.9s	remaining: 15h 46m 49s
1200:	learn: 0.9217009	test: 0.9068511	best: 0.9070914 (1159)	total: 1m 8s	remaining: 15h 45m 6s
1400:	learn: 0.9249072	test: 0.9087466	best: 0.9087466 (1398)	total: 1m 19s	remaining: 15h 43m 42s
1600:	learn: 0.9275117	test: 0.9103445	best: 0.9105850 (1569)	total: 1m 29s	remaining: 15h 29m 35s
1800:	learn: 0.9297166	test: 0.9119141	best: 0.9119141 (1794)	total: 1m 39s	remaining: 15h 17m 5s
200

<catboost.core.CatBoostClassifier at 0x1e193247350>

In [9]:
# f1_score(cat.predict(X_test), y_test), accuracy_score(cat.predict(X_test), y_test)

## testing models

In [18]:
def test_models(X_train, X_test, y_train, y_test):

    sl = {}
    rand = RandomForestClassifier(n_jobs=-1).fit(X_train, y_train)
    sl['RandomForestClassifier'] = f1_score(rand.predict(X_test), y_test)
    cat = CatBoostClassifier(verbose=False).fit(X_train, y_train)
    sl['CatBoostClassifier'] = f1_score(cat.predict(X_test), y_test)
    svc = SVC(probability=True).fit(X_train, y_train)
    sl['SVC'] = f1_score(svc.predict(X_test), y_test)
    gaus = GaussianNB().fit(X_train, y_train)
    sl['GaussianNB'] = f1_score(gaus.predict(X_test), y_test)
    # cater = CategoricalNB().fit(X_train, y_train) # не умеет работать с отрицательными числами
    # sl['CategoricalNB'] = f1_score(cater.predict(X_test), y_test)
    knei = KNeighborsClassifier(n_jobs=-1).fit(X_train, y_train)
    sl['KNeighborsClassifier'] = f1_score(knei.predict(X_test), y_test)
    ada = AdaBoostClassifier().fit(X_train, y_train)
    sl['AdaBoostClassifier'] = f1_score(ada.predict(X_test), y_test)
    xg = XGBClassifier(n_jobs=-1).fit(X_train, y_train)
    sl['XGBClassifier'] = f1_score(xg.predict(X_test), y_test)
    lgbm = LGBMClassifier(n_jobs=-1).fit(X_train, y_train)
    sl['LGBMClassifier'] = f1_score(lgbm.predict(X_test), y_test)
    bag = BaggingClassifier(n_jobs=-1).fit(X_train, y_train)
    sl['BaggingClassifier'] = f1_score(bag.predict(X_test), y_test)

    return sl

In [19]:
test_models(X_train, X_test, y_train, y_test)

[LightGBM] [Info] Number of positive: 4732, number of negative: 11268
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.138847 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 229500
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 900
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.295750 -> initscore=-0.867619
[LightGBM] [Info] Start training from score -0.867619


{'RandomForestClassifier': 0.7349397590361446,
 'CatBoostClassifier': 0.8505647263249348,
 'SVC': 0.8748913987836664,
 'GaussianNB': 0.4677248677248677,
 'KNeighborsClassifier': 0.8428819444444444,
 'AdaBoostClassifier': 0.8056872037914692,
 'XGBClassifier': 0.8545135845749343,
 'LGBMClassifier': 0.8582541054451167,
 'BaggingClassifier': 0.8167770419426048}

## GridSearch RandomForest

In [21]:
# визуализация валидационной выборки для обучения
ps = PredefinedSplit(test_fold=[-1 if i in X_train.index else 0 for i in X.index])
ps

PredefinedSplit(test_fold=array([-1, -1, ..., -1, -1]))

In [22]:
rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={
        'n_estimators': [20, 50, 100, 200, 250, 300],
        'max_depth': [7, 8, 10, 20, 25, None],
        'max_features': ['sqrt', 'log2'],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'min_samples_split': [1, 2, 5, 10],
        'class_weight': [dict(enumerate(weights))],
        'n_jobs': [-1]
    },
    scoring='f1',
    verbose=52,
    cv=ps,
    n_jobs=-1
)

In [23]:
rf.fit(X, y)

Fitting 1 folds for each of 1296 candidates, totalling 1296 fits
[CV 1/1; 1/1296] START class_weight={0: 0.7099751508697195, 1: 1.6906170752324599}, criterion=gini, max_depth=7, max_features=sqrt, min_samples_split=1, n_estimators=20, n_jobs=-1
[CV 1/1; 1/1296] END class_weight={0: 0.7099751508697195, 1: 1.6906170752324599}, criterion=gini, max_depth=7, max_features=sqrt, min_samples_split=1, n_estimators=20, n_jobs=-1;, score=nan total time=   0.0s
[CV 1/1; 2/1296] START class_weight={0: 0.7099751508697195, 1: 1.6906170752324599}, criterion=gini, max_depth=7, max_features=sqrt, min_samples_split=1, n_estimators=50, n_jobs=-1
[CV 1/1; 2/1296] END class_weight={0: 0.7099751508697195, 1: 1.6906170752324599}, criterion=gini, max_depth=7, max_features=sqrt, min_samples_split=1, n_estimators=50, n_jobs=-1;, score=nan total time=   0.0s
[CV 1/1; 3/1296] START class_weight={0: 0.7099751508697195, 1: 1.6906170752324599}, criterion=gini, max_depth=7, max_features=sqrt, min_samples_split=1, n_es

In [24]:
params = rf.best_params_
params

{'class_weight': {0: 0.7099751508697195, 1: 1.6906170752324599},
 'criterion': 'log_loss',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_split': 5,
 'n_estimators': 100,
 'n_jobs': -1}

In [71]:
f1_score(rf.predict(X_test), y_test), accuracy_score(rf.predict(X_test), y_test)

(0.9991539763113367, 0.9995)

In [70]:
rf = RandomForestClassifier(n_estimators=25, max_depth=35, criterion='entropy', max_features='log2', n_jobs=-1)
# rf = RandomForestClassifier(class_weight=dict(enumerate(weights)))
rf.fit(X, y)

## SVC GridSearch

In [18]:
# визуализация валидационной выборки для обучения
ps = PredefinedSplit(test_fold=[-1 if i in X_train.index else 0 for i in X.index])
ps

PredefinedSplit(test_fold=array([-1, -1, ..., -1, -1]))

In [19]:
svc = GridSearchCV(
    estimator=SVC(),
    param_grid={
        'C': [0.1, 1, 10, 100, 1000], #1000
        'gamma': ['auto'],
        'kernel': ['rbf'],
        
        # 'gamma': [1, 0.1, 0.01, 0.001, 0.0001, None],
        # 'gamma': ['auto', 'scale'],
        # 'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'auto', 'scale'],
        # 'kernel': ['rbf', 'linear', 'poly'],
        # 'class_weight': [dict(enumerate(weights))]
        # 'class_weight': ['balanced']
    },
    scoring='f1',
    verbose=True,
    cv=ps, 
    n_jobs=-1
)

In [20]:
svc.fit(X, y)

Fitting 1 folds for each of 5 candidates, totalling 5 fits


In [21]:
params = svc.best_params_
params

{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

In [22]:
f1_score(svc.predict(X_test), y_test), accuracy_score(svc.predict(X_test), y_test)

(1.0, 1.0)

## CatBoost GridSearch

In [18]:
cat = CatBoostClassifier(iterations=10000, 
                        #    learning_rate=1e-3, 
                        #    loss_function='Logloss',
                           # loss_function='MultiClass',
                           eval_metric='TotalF1',
                           # class_weights=dict(enumerate(weights)),
                           # max_depth=8,
                           depth=9, 
                           od_wait=500,
                           use_best_model=True,
                           task_type='CPU'
                           )

cat.fit(X_train, y_train, verbose=200, eval_set=(X_test, y_test))

Learning rate set to 0.023085
0:	learn: 0.8530758	test: 0.8457103	best: 0.8457103 (0)	total: 420ms	remaining: 1h 10m 3s
200:	learn: 0.9428672	test: 0.9197495	best: 0.9199913 (185)	total: 1m 1s	remaining: 49m 39s
400:	learn: 0.9558995	test: 0.9219282	best: 0.9229389 (358)	total: 1m 57s	remaining: 46m 57s
600:	learn: 0.9643133	test: 0.9230450	best: 0.9232658 (549)	total: 2m 53s	remaining: 45m 19s
800:	learn: 0.9730329	test: 0.9268795	best: 0.9273640 (784)	total: 3m 52s	remaining: 44m 28s
1000:	learn: 0.9783895	test: 0.9266373	best: 0.9273640 (784)	total: 4m 49s	remaining: 43m 22s
1200:	learn: 0.9829871	test: 0.9273838	best: 0.9279076 (1110)	total: 5m 46s	remaining: 42m 20s
1400:	learn: 0.9872707	test: 0.9276653	best: 0.9279076 (1110)	total: 6m 44s	remaining: 41m 20s
1600:	learn: 0.9914137	test: 0.9281694	best: 0.9286927 (1540)	total: 7m 42s	remaining: 40m 24s
1800:	learn: 0.9938609	test: 0.9284311	best: 0.9286927 (1540)	total: 8m 39s	remaining: 39m 25s
2000:	learn: 0.9958673	test: 0.9274

<catboost.core.CatBoostClassifier at 0x1833f076c10>

In [19]:
f1_score(cat.predict(X_test), y_test), accuracy_score(cat.predict(X_test), y_test)

(0.8774978279756733, 0.9295)

## KNN GridSearch

In [18]:
# визуализация валидационной выборки для обучения
ps = PredefinedSplit(test_fold=[-1 if i in X_train.index else 0 for i in X.index])
ps

PredefinedSplit(test_fold=array([-1, -1, ..., -1,  0]))

In [19]:
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={
        'n_neighbors': [1, 5, 10, 20, 30],
        'weights': ['uniform', 'distance'],
        'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size': list(range(10, 51, 10)),
        'p' : list(range(1, 6)),
        'n_jobs': [-1]
    },
    scoring='f1',
    verbose=52,
    cv=ps,
    n_jobs=-1
)

In [20]:
knn.fit(X, y)

Fitting 1 folds for each of 1000 candidates, totalling 1000 fits


In [25]:
params = knn.best_params_
params

{'n_jobs': -1, 'n_neighbors': 7, 'weights': 'uniform'}

In [26]:
f1_score(knn.predict(X_test), y_test), accuracy_score(knn.predict(X_test), y_test)

(0.8765808983863934, 0.92925)

## XGBoost GridSearch

In [35]:
# визуализация валидационной выборки для обучения
ps = PredefinedSplit(test_fold=[-1 if i in X_train.index else 0 for i in X.index])
ps

PredefinedSplit(test_fold=array([ 0, -1, ..., -1, -1]))

In [36]:
xgb = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid={
        'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
        'n_estimators': [25, 50, 100, 200, 250, 300, 350, 400, 450, 500],
        'n_jobs': [-1]
    },
    scoring='f1',
    verbose=52,
    cv=ps,
    n_jobs=-1
)

In [37]:
xgb.fit(X, y)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


In [38]:
params = xgb.best_params_
params

{'max_depth': 20, 'n_estimators': 250, 'n_jobs': -1}

In [39]:
f1_score(xgb.predict(X_test), y_test), accuracy_score(xgb.predict(X_test), y_test)

(1.0, 1.0)

## Voting Ensemble

In [18]:
vot = VotingClassifier(
    estimators=[
        ('svc', SVC(probability=True, C=10, gamma='auto', kernel='rbf')),
        ('rf', RandomForestClassifier(n_estimators=25, max_depth=35, criterion='entropy', max_features='log2', n_jobs=-1)),
        ('xgb', XGBClassifier(n_jobs=-1, n_estimators=250, max_depth=20)),
        ('knn', KNeighborsClassifier(n_jobs=-1, n_neighbors=7, weights='uniform')),
        ('catboost', CatBoostClassifier(verbose=0, iterations=10000, depth=9, od_wait=500, task_type='CPU')),
        # ('lgbm', LGBMClassifier(n_jobs=-1)),
    ],
    voting='hard',
    n_jobs=-1,
    verbose=True
)
vot.fit(X, y)

In [19]:
f1_score(vot.predict(X_test), y_test), accuracy_score(vot.predict(X_test), y_test)

(1.0, 1.0)

## lstm

In [19]:
X_train.shape

(16000, 1000)

In [20]:
X_train = np.array(X_train).reshape((X_train.shape[0], 1, X_train.shape[1]))

In [21]:
X_train.shape

(16000, 1, 1000)

In [22]:
X_test = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))

In [23]:
y_train = keras.utils.to_categorical(y_train, 2)
y_test = keras.utils.to_categorical(y_test, 2)

In [24]:
X = np.array(X).reshape((X.shape[0], 1, X.shape[1]))
y = keras.utils.to_categorical(y, 2)

In [31]:
def lstm_model(hp):
    model = Sequential()
    model.add(LSTM(hp.Int('units', min_value=32, max_value=512, step=8), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]))) # протестируем с различной выходной размерностью
    model.add(Dropout(hp.Float('dropout_rate', min_value=0.1, max_value=1, step=0.1))) # Протестируем с различными значениями Dropout от 0.1 до 0.5
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer=hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd']), loss='binary_crossentropy', metrics=['F1Score']) # Проверим работу нейронной сети с различными оптимайзерами
    return model

lstm_tuner = kt.Hyperband(lstm_model,
                     objective='val_loss',
                     max_epochs=20,
                     factor=3,
                     directory='my_lstm',
                     project_name='lstm_tuning')

# Проведем обучение нейроной сети
lstm_tuner.search(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test))
# tuner.search(X, y, epochs=20, batch_size=64, validation_data=(X_test, y_test), callbacks=callbacks)

Trial 1 Complete [00h 00m 04s]
val_loss: 0.23608249425888062

Best val_loss So Far: 0.23608249425888062
Total elapsed time: 00h 00m 04s


In [None]:
lstm_tuner.get_best_hyperparameters()[0].values

In [32]:
# В качестве модели будем использовать модель, показавшую себя наилучшим образом по ходу обучения 
lstm = lstm_tuner.get_best_models(num_models=1)[0]

In [33]:
f1_score(lstm.predict(X_test).argmax(axis=-1), y_test.argmax(axis=-1))

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


0.8655278142794569

In [34]:
accuracy_score(lstm.predict(X_test).argmax(axis=-1), y_test.argmax(axis=-1))

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


0.92325

## RNN

In [19]:
X_train = np.array(X_train).reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))
y_train = keras.utils.to_categorical(y_train, 2)
y_test = keras.utils.to_categorical(y_test, 2)

In [20]:
X = np.array(X).reshape((X.shape[0], 1, X.shape[1]))
y = keras.utils.to_categorical(y, 2)

In [25]:
def rnn_model(hp):
    model = Sequential()
    model.add(SimpleRNN(hp.Int('units', min_value=32, max_value=512, step=8), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(hp.Float('dropout_rate', min_value=0.1, max_value=1, step=0.1)))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer=hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd']), loss='binary_crossentropy', metrics=['F1Score'])
    return model

rnn_tuner = kt.Hyperband(rnn_model,
                     objective='val_loss',
                     max_epochs=20,
                     factor=3,
                     directory='my_rnn',
                     project_name='rnn_tuning')

rnn_tuner.search(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test))

Trial 30 Complete [00h 00m 24s]
val_loss: 0.25124838948249817

Best val_loss So Far: 0.2462279349565506
Total elapsed time: 00h 05m 29s


In [33]:
rnn_tuner.get_best_hyperparameters()[0].values

{'units': 320,
 'dropout_rate': 0.6,
 'optimizer': 'sgd',
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/bracket': 1,
 'tuner/round': 1,
 'tuner/trial_id': '0018'}

In [26]:
rnn = rnn_tuner.get_best_models(num_models=1)[0]

In [27]:
f1_score(rnn.predict(X_test).argmax(axis=-1), y_test.argmax(axis=-1))

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


0.8539325842696629

In [28]:
accuracy_score(rnn.predict(X_test).argmax(axis=-1), y_test.argmax(axis=-1))

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


0.9155

## Conv1D

In [28]:
X_train = np.array(X_train).reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))
y_train = keras.utils.to_categorical(y_train, 2)
y_test = keras.utils.to_categorical(y_test, 2)

ValueError: cannot reshape array of size 16000000 into shape (16000,1,1)

In [20]:
X = np.array(X).reshape((X.shape[0], 1, X.shape[1]))
y = keras.utils.to_categorical(y, 2)

In [27]:
def cnn_model(hp):
    # model = Sequential()
    # model.add(SimpleRNN(hp.Int('units', min_value=32, max_value=512, step=8), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
    # model.add(Dropout(hp.Float('dropout_rate', min_value=0.1, max_value=1, step=0.1)))
    # model.add(Dense(2, activation='softmax'))
    # model.compile(optimizer=hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd']), loss='binary_crossentropy', metrics=['F1Score'])

    model = Sequential()
    model.add(Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[2], X_train.shape[1])))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(32, 3, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer=hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd']), loss='binary_crossentropy', metrics=['F1Score'])
    
    return model

cnn_tuner = kt.Hyperband(cnn_model,
                     objective='val_loss',
                     max_epochs=3,
                     factor=3,
                     directory='my_cnn',
                     project_name='cnn_tuning')

cnn_tuner.search(X_train, y_train, epochs=3, batch_size=64, validation_data=(X_test, y_test))

Trial 2 Complete [00h 00m 01s]

Best val_loss So Far: None
Total elapsed time: 00h 00m 02s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
sgd               |rmsprop           |optimizer
1                 |1                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
1                 |1                 |tuner/bracket
0                 |0                 |tuner/round



Traceback (most recent call last):
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras_tuner\src\engine\base_tuner.py", line 274, in _try_run_and_update_trial
    self._run_and_update_trial(trial, *fit_args, **fit_kwargs)
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras_tuner\src\engine\base_tuner.py", line 239, in _run_and_update_trial
    results = self.run_trial(trial, *fit_args, **fit_kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras_tuner\src\tuners\hyperband.py", line 427, in run_trial
    return super().run_trial(trial, *fit_args, **fit_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras_tuner\src\engine\tuner.py", line 314, in run_trial
    obj_value = self._build_and_fit_model(trial, *args, **copied

RuntimeError: Number of consecutive failures exceeded the limit of 3.
Traceback (most recent call last):
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras_tuner\src\engine\base_tuner.py", line 274, in _try_run_and_update_trial
    self._run_and_update_trial(trial, *fit_args, **fit_kwargs)
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras_tuner\src\engine\base_tuner.py", line 239, in _run_and_update_trial
    results = self.run_trial(trial, *fit_args, **fit_kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras_tuner\src\tuners\hyperband.py", line 427, in run_trial
    return super().run_trial(trial, *fit_args, **fit_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras_tuner\src\engine\tuner.py", line 314, in run_trial
    obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras_tuner\src\engine\tuner.py", line 233, in _build_and_fit_model
    results = self.hypermodel.fit(hp, model, *args, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras_tuner\src\engine\hypermodel.py", line 149, in fit
    return model.fit(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 122, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\layers\input_spec.py", line 227, in assert_input_compatibility
    raise ValueError(
ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "conv1d" is incompatible with the layer: expected axis -1 of input shape to have value 1, but received input with shape (64, 1, 1000)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(64, 1, 1000), dtype=float32)
  • training=True
  • mask=None


In [25]:
cnn_tuner.get_best_hyperparameters()[0].values

{'optimizer': 'adam',
 'tuner/epochs': 1,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [26]:
cnn = cnn_tuner.get_best_models(num_models=1)[0]

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'my_cnn\cnn_tuning\trial_0000\checkpoint.weights.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
f1_score(cnn.predict(X_test).argmax(axis=-1), y_test.argmax(axis=-1))

In [None]:
accuracy_score(cnn.predict(X_test).argmax(axis=-1), y_test.argmax(axis=-1))

## Dense

In [19]:
print (X_train.shape)
print (X_test.shape)
print (y_train.shape)
print (y_test.shape)

(16000, 1000)
(4000, 1000)
(16000,)
(4000,)


In [20]:
y_train = keras.utils.to_categorical(y_train, 2)
y_test = keras.utils.to_categorical(y_test, 2)

In [21]:
print (X_train.shape)
print (X_test.shape)
print (y_train.shape)
print (y_test.shape)

(16000, 1000)
(4000, 1000)
(16000, 2)
(4000, 2)


In [30]:
model = Sequential()
model.add(Dense(120, input_dim=1000, activation='relu'))
model.add(Dense(90,  activation='relu'))
model.add(Dense(60,  activation='relu'))
model.add(Dense(30,  activation='relu'))
model.add(Dense(2, activation='softmax'))
# model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=3e-4), metrics=['F1Score'])
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['F1Score'])

In [31]:
# Обучаем сеть
model.fit(X_train,
          y_train,
          batch_size=64,    # Размер батча
          validation_data=(X_test, y_test),
          epochs=100,      # Количество эпох
          verbose=1)       # Выводить процесс обучения на каждой эпохе

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - F1Score: 0.8203 - loss: 0.3249 - val_F1Score: 0.8963 - val_loss: 0.2446
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - F1Score: 0.9305 - loss: 0.1715 - val_F1Score: 0.8908 - val_loss: 0.2519
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - F1Score: 0.9496 - loss: 0.1301 - val_F1Score: 0.8946 - val_loss: 0.2913
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - F1Score: 0.9657 - loss: 0.0851 - val_F1Score: 0.8861 - val_loss: 0.3533
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - F1Score: 0.9813 - loss: 0.0443 - val_F1Score: 0.8879 - val_loss: 0.4560
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - F1Score: 0.9878 - loss: 0.0294 - val_F1Score: 0.8895 - val_loss: 0.5131
Epoch 7/100
[1m250/250[0m [32m━

<keras.src.callbacks.history.History at 0x1c7cd338e10>

In [32]:
np.argmax(model.predict(X_test), axis=-1)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 739us/step


array([0, 0, 1, ..., 1, 0, 0], dtype=int64)

# create submit

In [8]:
X_pred = test.drop(['id'], axis=1)
# X_pred = kbest.transform(X_pred)
# X_pred = pca.transform(X_pred)
# X_pred = scaler.transform(X_pred)
pred = cat.predict(X_pred)

# X_pred = np.array(X_pred).reshape((X_pred.shape[0], 1, X_pred.shape[1]))
# pred = model.predict(X_pred).argmax(axis=-1)

pred

array(['P', 'N', 'N', ..., 'N', 'N', 'N'], dtype=object)

In [9]:
# test['y'] = le.inverse_transform(pred)
test['y'] = pred
test

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x992,x993,x994,x995,x996,x997,x998,x999,x1000,y
0,20001,70.000000,90.000000,-100.000000,-7.692767,-10.00000,122.507357,260.000000,-370.000000,-280.000000,...,15.8103,10.000000,140.000000,50.000000,-1260.440221,-60.00000,-18.293118,-3200.000000,-950.000000,P
1,20002,60.000000,40.000000,-40.000000,-20.000000,-10.00000,110.000000,330.000000,-412.937313,59.390404,...,20.0000,40.000000,15.435328,48.894787,-1220.000000,-50.00000,-18.293118,-3560.000000,-920.000000,N
2,20003,80.000000,60.000000,6.902077,-60.000000,0.00000,122.507357,360.000000,-540.000000,59.390404,...,20.0000,39.960365,100.000000,48.894787,-1170.000000,-50.00000,-10.000000,-3229.857769,-655.992307,N
3,20004,80.000000,90.000000,40.000000,10.000000,-2.52149,190.000000,490.000000,-380.000000,59.390404,...,20.0000,20.000000,-40.000000,40.000000,-1260.440221,-50.79482,-40.000000,-3229.857769,-655.992307,N
4,20005,80.000000,70.000000,40.000000,-7.692767,-2.52149,70.000000,470.000000,-340.000000,630.000000,...,10.0000,30.000000,20.000000,40.000000,-1700.000000,-60.00000,-20.000000,-3229.857769,-420.000000,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12340,32341,60.000000,60.830025,6.902077,20.000000,-2.52149,240.000000,430.000000,-412.937313,340.000000,...,10.0000,70.000000,50.000000,50.000000,-1390.000000,-50.79482,-10.000000,-2890.000000,-490.000000,N
12341,32342,71.693249,50.000000,-30.000000,0.000000,20.00000,80.000000,360.000000,-412.937313,-180.000000,...,15.8103,39.960365,15.435328,50.000000,-1260.440221,-50.00000,-18.293118,-3010.000000,-655.992307,N
12342,32343,70.000000,60.830025,30.000000,-7.692767,10.00000,170.000000,190.000000,-570.000000,310.000000,...,10.0000,20.000000,-150.000000,48.894787,-1260.440221,-50.00000,-18.293118,-3120.000000,-710.000000,N
12343,32344,70.000000,20.000000,-60.000000,-40.000000,-30.00000,210.000000,560.000000,-412.937313,-540.000000,...,15.8103,39.960365,-10.000000,60.000000,-1630.000000,-50.79482,-18.293118,-3700.000000,-570.000000,N


In [10]:
sub = test[['id', 'y']]
sub

Unnamed: 0,id,y
0,20001,P
1,20002,N
2,20003,N
3,20004,N
4,20005,N
...,...,...
12340,32341,N
12341,32342,N
12342,32343,N
12343,32344,N


In [11]:
sub.to_csv('sub.tsv', sep='\t')