In [1]:
# Smoke check: confirms the kernel is alive before the heavier cells run
print("start")

start


In [2]:
import sys

# Record the interpreter version and platform in the notebook output
print('Python %s on %s' % (sys.version, sys.platform))
# Add the parent directory to the import path; this has to run before the
# `scripts.*` imports below (presumably that is where the package lives — confirm)
sys.path.extend(['../'])

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from scripts.data_utils import get_connectome
from scripts.classification_models import LightGBMModel  # import the new LightGBM model class

# Locations of the time-series arrays and their label tables
bnu_series_path = '../data/ts_cut/HCPex/bnu{}.npy'
bnu_labels_path = '../data/ts_cut/HCPex/bnu.csv'
ihb_series_path = '../data/ts_cut/HCPex/ihb.npy'
ihb_labels_path = '../data/ts_cut/HCPex/ihb.csv'

# The BNU series are stored in two files (suffixes 1 and 2); stack them along the first axis
bnu_chunks = [np.load(bnu_series_path.format(idx)) for idx in (1, 2)]
X_bnu = np.concatenate(bnu_chunks, axis=0)
Y_bnu = pd.read_csv(bnu_labels_path)
X_ihb = np.load(ihb_series_path)
Y_ihb = pd.read_csv(ihb_labels_path)

# Report the shape of every loaded object, one per line
print(*(loaded.shape for loaded in (X_bnu, Y_bnu, X_ihb, Y_ihb)), sep='\n')

# Convert the raw time series into connectivity matrices
X_bnu, X_ihb = get_connectome(X_bnu), get_connectome(X_ihb)

# Pool both datasets into a single feature array and a flat label vector
X = np.concatenate((X_bnu, X_ihb))
Y = np.concatenate([labels.values.ravel() for labels in (Y_bnu, Y_ihb)])

# Hold out 15% of the samples for validation; the seed is fixed for reproducibility
split_parts = train_test_split(X, Y, test_size=0.15, random_state=10)
x_train, x_validate, y_train, y_validate = split_parts
print(x_train.shape, x_validate.shape)
print(y_train.shape, y_validate.shape)

Python 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)] on win32
(142, 240, 419)
(142, 1)
(20, 120, 419)
(20, 1)
(137, 419, 419) (25, 419, 419)
(137,) (25,)


In [3]:
# Create the LightGBM wrapper and fit it; the validation split is handed to the
# trainer together with the training data
lightgbm_model = LightGBMModel()
train_acc = lightgbm_model.model_training(
    x_train,
    y_train,
    x_validate,
    y_validate,
)
print('Accuracy on train: {}'.format(train_acc))

[LightGBM] [Info] Number of positive: 48, number of negative: 89
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.286982 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8290454
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 175142
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.350365 -> initscore=-0.617435
[LightGBM] [Info] Start training from score -0.617435
Training until validation scores don't improve for 100 rounds










Early stopping, best iteration is:
[522]	valid_0's binary_logloss: 0.161843
Accuracy on train: 1.0


In [4]:
# Score the fitted model on the held-out split.
# NOTE(review): the same split is also passed to model_training above, so this
# score may be optimistic if it was used for early stopping — confirm.
acc, f1 = lightgbm_model.model_testing(x_validate, y_validate)
print(f'Accuracy on test: {acc}', f'F1 score on test: {f1}', sep='\n')

Accuracy on test: 0.92
F1 score on test: 0.8888888888888888


In [5]:
import pickle

# Serialize the fitted model wrapper next to the notebook so the inference
# script can load it back with pickle
pkl_filename = "./model.pkl"
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(lightgbm_model, model_file)

In [6]:
# create local environment same as Yandex Contest
import os
import shutil

# File that the inference cell (saved as run.py) reads its input from
predict_path = './data/ts_cut/HCPex/predict.npy'

# exist_ok=True replaces the exists()-then-makedirs() pair: no race between the
# check and the creation, and the cell can be re-run safely
os.makedirs(os.path.dirname(predict_path), exist_ok=True)

# Use the raw BNU time series as a stand-in for the contest input. They are
# re-loaded from disk because X_bnu was overwritten with connectomes earlier.
bnu_raw_series = np.concatenate(
    [np.load(bnu_series_path.format(i)) for i in (1, 2)],
    axis=0,
)
np.save(predict_path, bnu_raw_series)


In [7]:
# create script, which loads model, does all preprocessing and outputs solution.csv
# NOTE: the packaging cell writes this cell's source verbatim to run.py, so the
# cell must stay self-contained — the imports are repeated here on purpose.

import pickle

import numpy as np
import pandas as pd

from scripts.data_utils import get_connectome
from scripts.classification_models import LightGBMModel

# Load the raw time series to score and turn them into connectivity matrices
X = np.load('./data/ts_cut/HCPex/predict.npy')
print(X.shape)
X = get_connectome(X)

# Restore the model saved by this notebook.
# Security note: pickle.load executes arbitrary code — only load trusted files.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

y_pred = model.model_predict(X)
print(y_pred)

# One prediction per row, under a single 'prediction' column, without the index
solution = pd.DataFrame(data=y_pred, columns=['prediction'])
solution.to_csv('./solution.csv', index=False)

(142, 240, 419)
[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0
 1 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 1 0 1 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0
 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 1 0 0 1 0 0 1 1 0 1 1 0 0 0 0]


In [8]:
# build the .zip to submit
import zipfile
import datetime


def write_lf_text(path, text):
    """Write `text` to `path` as UTF-8 with LF line endings on every platform.

    The generated files are run with bash/make/python inside the contest
    environment. In default text mode on Windows every newline would be
    written as CRLF, leaving a stray carriage return at the end of the
    `export PATH=...` line in the shell scripts, and the encoding would
    depend on the locale.
    """
    with open(path, 'w', encoding='utf-8', newline='\n') as out_file:
        out_file.write(text)


# save source from previous cell into file
# will produce the correct result only in case of running previous cell just before
# (_ih is IPython's input history; [-1] is this cell, [-2] the one executed before it)
write_lf_text('run.py', _ih[-2])

write_lf_text('run.sh', 'export PATH=/usr/conda/bin:$PATH\npython run.py')

# Placeholder training step: the model ships pre-trained in model.pkl,
# so train.py only prints the numbers 0..99
write_lf_text('train.py', 'print("\\n".join(map(str, range(100))))')

write_lf_text('train.sh', 'export PATH=/usr/conda/bin:$PATH\npython train.py')

# Recipe lines must start with a real TAB; explicit \t escapes keep them intact
# even if an editor converts tabs to spaces in this cell
makefile_text = (
    "all: build\n"
    "\n"
    "build:\n"
    "\t@echo 'starting....'\n"
    "\tbash train.sh\n"
    "run:\n"
    "\tbash run.sh\n"
    "train:\n"
    "\tbash train.sh\n"
)
write_lf_text('Makefile', makefile_text)

# Timestamped archive name; ':' and ' ' are replaced because ':' is not
# allowed in file names on Windows
archive_name = f"submission-{datetime.datetime.now()}.zip".replace(':', '-').replace(' ', '-')

# Everything the contest runner needs: build files, scripts, the pickled model
# and the helper package imported by run.py
submission_members = (
    './Makefile',
    'run.py',
    'run.sh',
    'train.py',
    'train.sh',
    'model.pkl',
    'scripts',
    'scripts/__init__.py',
    'scripts/classification_models.py',
    'scripts/data_utils.py',
)

# The context manager closes the archive even if one of the members is missing
with zipfile.ZipFile(archive_name, "w") as submission_zip:
    for member in submission_members:
        submission_zip.write(member, compress_type=zipfile.ZIP_DEFLATED)
