In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (IsolationForest, RandomForestClassifier, RandomForestRegressor,
                             RandomTreesEmbedding)
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from sklearn.metrics import (recall_score, make_scorer, f1_score, balanced_accuracy_score)
from sklearn.model_selection import cross_val_score, StratifiedKFold, RepeatedStratifiedKFold as RSKF
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from statistics import mean
from sklearn.naive_bayes import GaussianNB as GNB, MultinomialNB as MNB, BernoulliNB as BNB
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures as PF
from imblearn.pipeline import make_pipeline as imb_ppl
from imblearn.under_sampling import ClusterCentroids, NeighbourhoodCleaningRule as NCR, RandomUnderSampler as RUS
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier as BRFC
from statistics import mean
from sklearn.preprocessing import OneHotEncoder as OHE

from sklearn.cluster import MeanShift
from sklearn.pipeline import make_pipeline

from sklearn.experimental import enable_iterative_imputer  # noqas
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.decomposition import PCA
from imblearn.keras import BalancedBatchGenerator as BBG
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Activation, Dropout

from imblearn.under_sampling import RandomUnderSampler as RUS, NearMiss
from sklearn.preprocessing import PolynomialFeatures as PF
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedBaggingClassifier as BBC
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif
import umap
from imblearn.ensemble import BalancedRandomForestClassifier as BRFC, RUSBoostClassifier, EasyEnsembleClassifier
from sklearn.linear_model import BayesianRidge, Ridge

from xgboost.sklearn import XGBRegressor, XGBClassifier
from sklearn.svm import LinearSVC
from catboost import CatBoostClassifier

# Raw competition tables. The 'utf_8_sig' codec skips a leading byte-order mark, if any.
data = pd.read_csv("train.csv", encoding='utf_8_sig')
test_raw = pd.read_csv("test_dataset_test.csv", encoding='utf_8_sig')

# Columns that autodrop() removes before modelling: the two identifier columns
# plus the five disease columns.
IGNORE_COLS = [
    "ID",
    "ID_y",
    "ОНМК",
    "Стенокардия, ИБС, инфаркт миокарда",
    "Артериальная гипертензия",
    "Сердечная недостаточность",
    "Прочие заболевания сердца",
]

def autodrop(df):
    """Return a new frame without the columns of ``df`` that are listed in ``IGNORE_COLS``.

    Labels from ``IGNORE_COLS`` that ``df`` does not contain are simply skipped,
    so a missing column never raises.
    """
    unwanted = []
    for label in df.columns:
        if label in IGNORE_COLS:
            unwanted.append(label)
    return df.drop(columns=unwanted)

def get_cat_cols(df):
    """List the labels of all ``object``-dtype columns of ``df``, in column order."""
    object_part = df.select_dtypes(include="object")
    return object_part.columns.tolist()

def drop_id(dataset):
    """Strip the identifier columns from ``dataset`` and return the result as a new frame.

    ``"ID"`` is mandatory (pandas raises ``KeyError`` when it is absent);
    ``"ID_y"`` is removed only when the frame actually has it.
    """
    id_columns = ["ID_y", "ID"] if "ID_y" in dataset.columns else ["ID"]
    return dataset.drop(columns=id_columns)

def convert_time_to_int(dataset):
    """Replace the two clock-time columns with seconds elapsed since midnight.

    'Время засыпания' and 'Время пробуждения' are expected to hold strings laid
    out as ``HH:MM:SS``. The digits are read from fixed positions (0-1, 3-4,
    6-7); the separator characters themselves are not checked. A value that
    cannot be parsed this way (NaN/None, a non-string, malformed text) becomes
    ``np.nan``.

    Args:
        dataset: DataFrame containing both columns. It is modified in place.

    Returns:
        The same ``dataset`` object, with both columns overwritten.

    Raises:
        KeyError: if either column is missing.
    """
    def make_int(timestr):
        try:
            hours = int(timestr[:2])
            minutes = int(timestr[3:5])
            seconds = int(timestr[6:8])
        except (TypeError, ValueError):
            # TypeError: the value cannot be sliced/converted (e.g. float NaN, None).
            # ValueError: a slice is empty or not numeric.
            # Anything else (notably KeyboardInterrupt) is deliberately not swallowed.
            return np.nan
        return hours * 60 * 60 + minutes * 60 + seconds

    for column in ('Время засыпания', 'Время пробуждения'):
        dataset[column] = dataset[column].apply(make_int)
    return dataset
    
def prepare(data):
    """Run the baseline cleaning / feature-engineering steps on a raw survey table.

    Steps, in order:
      1. drop the identifier columns (``drop_id``);
      2. turn the two clock-time columns into seconds since midnight
         (``convert_time_to_int``);
      3. drop 'Религия', 'Национальность', 'Этнос' and 'Религия, клубы';
      4. merge the "Никогда не курил" spelling into "Никогда не курил(а)" and
         zero 'Возраст курения' / 'Сигарет в день' for those rows;
      5. encode 'Частота пасс кур' as a number (0.0 where 'Пассивное курение'
         is 0 and for missing or unlisted answers);
      6. zero 'Возраст алког' where 'Алкоголь' is "никогда не употреблял";
      7. add 'Время сна': seconds from falling asleep to waking up.

    Args:
        data: raw DataFrame as read from the CSV files. The caller's object is
            not modified, because ``drop_id`` already returns a new frame.

    Returns:
        The prepared DataFrame.

    Raises:
        KeyError: if an expected column is missing.
    """
    # Baseline dataset transformations
    data = drop_id(data)
    data = convert_time_to_int(data)
    data = data.drop(columns=['Религия', 'Национальность', 'Этнос', 'Религия, клубы'])

    # Two spellings of the same smoking status occur; keep a single one.
    never_smoked = "Никогда не курил(а)"
    data.loc[data['Статус Курения'] == "Никогда не курил", 'Статус Курения'] = never_smoked
    is_never_smoker = data["Статус Курения"] == never_smoked
    # A scalar broadcasts over the selected rows; no need to build a list of zeros.
    data.loc[is_never_smoker, "Возраст курения"] = 0.0
    data.loc[is_never_smoker, "Сигарет в день"] = 0.0

    # Rows without passive smoking get frequency 0 regardless of what the answer says.
    data.loc[data["Пассивное курение"] == 0, "Частота пасс кур"] = 0.0
    # Answer text -> weekly count, using the lower bound of each answer.
    int_convert = {
        '1-2 раза в неделю': 1,
        '3-6 раз в неделю': 3,
        'не менее 1 раза в день': 7,
        '2-3 раза в день': 14,
        '4 и более раз в день': 28,
    }
    # Anything not in the table (NaN, the 0.0 set above, unknown text) maps to 0.0.
    data["Частота пасс кур"] = data["Частота пасс кур"].apply(lambda x: int_convert.get(x, 0.0)).astype(float)

    data.loc[data["Алкоголь"] == "никогда не употреблял", "Возраст алког"] = 0

    # Sleep length measured forward from falling asleep to waking. The modulo
    # handles nights that span midnight: 23:00 -> 07:00 is 8 h, whereas the
    # absolute difference of the two clock times would report 16 h.
    # NaN in either column propagates to the result.
    seconds_per_day = 24 * 60 * 60
    data['Время сна'] = (data['Время пробуждения'] - data['Время засыпания']) % seconds_per_day

    return data

# Train and test go through the identical cleaning pipeline. Later cells copy
# these prepared frames instead of re-reading the CSV files.
data_ORIG, test_ORIG = prepare(data), prepare(test_raw)

  numba.core.entrypoints.init_all()
  numba.core.entrypoints.init_all()


In [2]:
def make_submission(target_model_dict, ids):
    """Assemble per-target predictions into one table and save it as ``my_submission.csv``.

    Args:
        target_model_dict: mapping from each of the five target names
            ("Артериальная гипертензия", "ОНМК",
            "Стенокардия, ИБС, инфаркт миокарда", "Сердечная недостаточность",
            "Прочие заболевания сердца") to the predictions for that target,
            one per entry of ``ids`` and in the same order.
        ids: identifiers of the predicted rows.

    Returns:
        DataFrame with an "ID" column followed by one column per target. For
        backward compatibility the third target column keeps its historical
        label, which contains literal double-quote characters.

    Raises:
        KeyError: if a target is missing from ``target_model_dict``.

    Side effects:
        Overwrites ``my_submission.csv`` in the current working directory
        (UTF-8 with BOM).
    """
    import csv  # local import so the function does not depend on the notebook's import cell

    # (label in the returned frame, key in target_model_dict / CSV header)
    columns = [
        ("Артериальная гипертензия", "Артериальная гипертензия"),
        ("ОНМК", "ОНМК"),
        ("\"Стенокардия, ИБС, инфаркт миокарда\"", "Стенокардия, ИБС, инфаркт миокарда"),
        ("Сердечная недостаточность", "Сердечная недостаточность"),
        ("Прочие заболевания сердца", "Прочие заболевания сердца"),
    ]

    out = pd.DataFrame()
    out["ID"] = ids
    for frame_label, target_name in columns:
        out[frame_label] = target_model_dict[target_name]

    # No newline= argument on purpose: with lineterminator="\n" the platform's
    # text-mode newline translation stays exactly as it was with manual writes.
    with open("my_submission.csv", "w", encoding='utf_8_sig') as file:
        # csv.writer quotes any field containing a comma or a quote, so the
        # comma-bearing header (and any unusual ID) cannot split into extra columns.
        writer = csv.writer(file, lineterminator="\n")
        writer.writerow(["ID"] + [target_name for _, target_name in columns])
        for row in out.to_numpy():
            # Format each value the same way the previous f-string did.
            writer.writerow([f"{value}" for value in row])

    return out
    

In [13]:
from imblearn.ensemble import RUSBoostClassifier
from sklearn_genetic import GASearchCV, GAFeatureSelectionCV
from sklearn_genetic.space import Categorical, Integer, Continuous
from sklearn.random_projection import GaussianRandomProjection as GRP
from sklearn.decomposition import PCA
from sklearn.neighbors import NeighborhoodComponentsAnalysis as NCA
from sklearn.cluster import DBSCAN, Birch, KMeans, MeanShift
from imblearn.ensemble import BalancedRandomForestClassifier as BRFC
from sklearn.svm import LinearSVC
from tpot import TPOTClassifier
from feature_engine.creation import MathFeatures
from feature_engine import transformation as vt

# These imports used to sit inside the loop body and were re-executed for every
# target. They are hoisted unchanged, so every name they bind is still available
# to later cells.
from sklearn_genetic import GAFeatureSelectionCV, GASearchCV
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import make_pipeline as make_imblearn_pipeline
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn_genetic.space import Categorical, Integer, Continuous
from imblearn.under_sampling import (ClusterCentroids, NearMiss, EditedNearestNeighbours, AllKNN,
                                     CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule)
from imblearn.ensemble import RUSBoostClassifier, EasyEnsembleClassifier

# One independent binary classifier is tuned and fitted per target column.
targets = ["Артериальная гипертензия", "ОНМК", "Стенокардия, ИБС, инфаркт миокарда",
          "Сердечная недостаточность", "Прочие заболевания сердца"]
target_model_dict = {}  # target name -> predictions for the test set

for target in targets:
    print(target)

    # ---- training frame -------------------------------------------------
    # NOTE(review): `data` rebinds the name that the first cell uses for the raw
    # training table, so re-running that cell's prepare(data) after this cell
    # would operate on a processed frame. Kept for compatibility with cells
    # that may read `data` afterwards.
    data = data_ORIG.copy(deep=True)

    # Despite the variable name, this is an ordinal (integer-code) encoder;
    # categories unseen during fit are encoded as -1.
    ohe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    scaler = StandardScaler()

    # Rows with a missing value in any column are discarded from training.
    data = data.dropna()
    y = data[target].copy(deep=True).reset_index(drop=True)
    data = autodrop(data)

    # Columns exempt from scaling: text columns plus columns with exactly two
    # distinct values. Must be computed before the text columns are encoded.
    object_cols = data.select_dtypes("object").columns
    cat_features = list(object_cols) + [col for col in data.columns if data[col].nunique()==2]

    data[object_cols] = ohe.fit_transform(data[object_cols])

    # ---- test frame -----------------------------------------------------
    test = autodrop(test_ORIG.copy(deep=True))
    test_object_cols = test.select_dtypes("object").columns
    test[test_object_cols] = ohe.transform(test[test_object_cols])
    # NOTE(review): the imputer is fitted on the test set itself (column means
    # of the test data). Training rows never need it because of dropna() above.
    test[test.columns] = SimpleImputer().fit_transform(test[test.columns])

    # ---- scaling: fitted on train, applied to test ----------------------
    numeric_cols = [col for col in data.columns if col not in cat_features]
    data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

    test_numeric_cols = [col for col in test.columns if col not in cat_features]
    test[test_numeric_cols] = scaler.transform(test[test_numeric_cols])

    # ---- hyper-parameter search -----------------------------------------
    param_grid = {"n_estimators": Integer(20, 500),
                  "max_depth": Integer(2, 10),
                  "subsample": Continuous(0.1, 1.0),
                  "colsample_bytree": Continuous(0.2, 1.0),
                  "learning_rate": Continuous(0.0001, 1.0),
                  "max_bin": Integer(12, 512)}

    # Evolutionary search over XGBoost settings, scored by F1 on 3-fold CV
    # repeated twice. XGBoost uses all cores, so the search itself runs with
    # n_jobs=1.
    # NOTE(review): only the CV splitter is seeded here; the search itself
    # presumably is not, so reruns may differ. TODO confirm.
    model = GASearchCV(XGBClassifier(n_jobs=-1, use_label_encoder=False, eval_metric='logloss'),
                       param_grid=param_grid,
                       cv=RSKF(n_splits=3, n_repeats=2, random_state=300),
                       scoring='f1',
                       population_size=30,
                       generations=30,
                       keep_top_k=3,
                       n_jobs=1,
                       verbose=2)
    model.fit(data.to_numpy(), y.to_numpy())

    target_model_dict[target] = model.predict(test.to_numpy())


Артериальная гипертензия




gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.620552	0.0304272  	0.682269   	0.568775   
1  	60    	0.648549	0.0271905  	0.687396   	0.608218   
2  	60    	0.676142	0.0170955  	0.704167   	0.628771   
3  	60    	0.688291	0.00865733 	0.704167   	0.655359   
4  	60    	0.693292	0.0057824  	0.705229   	0.684596   
5  	60    	0.698859	0.00586607 	0.711838   	0.687947   
6  	60    	0.703466	0.00661833 	0.711838   	0.690813   
7  	60    	0.708411	0.00681584 	0.724558   	0.694358   
8  	60    	0.711446	0.00641598 	0.724558   	0.704167   
9  	60    	0.714667	0.00724696 	0.725631   	0.703116   
10 	60    	0.720239	0.00588193 	0.726374   	0.705817   
11 	60    	0.723834	0.00246593 	0.727185   	0.717058   
12 	60    	0.724222	0.00322653 	0.727185   	0.709656   
13 	60    	0.725351	0.00170235 	0.727304   	0.720672   
14 	60    	0.724326	0.00724873 	0.727304   	0.690823   
15 	60    	0.725191	0.00677359 	0.733847   	0.69813    
16 	60    	0.725354	0.00463162 	0.728172   	0.70

15 	60    	0.155432 	0.00777462 	0.160727   	0.114952   
16 	60    	0.158305 	0.00242224 	0.160727   	0.155787   
17 	60    	0.158494 	0.00451369 	0.160727   	0.137271   
18 	60    	0.155791 	0.011103   	0.170359   	0.123328   
19 	60    	0.158523 	0.0113671  	0.170359   	0.112787   
20 	60    	0.156171 	0.0149625  	0.170359   	0.103345   
21 	60    	0.153948 	0.0194747  	0.170359   	0.0987283  
22 	60    	0.152214 	0.0229174  	0.170359   	0.0706847  
23 	60    	0.156791 	0.0202109  	0.170359   	0.0747834  
24 	60    	0.15906  	0.0155909  	0.170359   	0.0966212  
25 	60    	0.161956 	0.0119019  	0.170359   	0.115272   
26 	60    	0.161536 	0.0212323  	0.170359   	0.0754133  
27 	60    	0.161338 	0.0216603  	0.170359   	0.0754133  
28 	60    	0.164372 	0.0165914  	0.170359   	0.0939571  
29 	60    	0.163441 	0.0175631  	0.170359   	0.0995964  
30 	60    	0.168059 	0.0107409  	0.170359   	0.110971   


In [14]:
# Writes my_submission.csv and, being the cell's last expression, displays the assembled table.
make_submission(target_model_dict, test_raw['ID']) # TODO (author's open note, meaning not evident from this cell): FIX CLASSES! THEY ARE DIVIDED!

Unnamed: 0,ID,Артериальная гипертензия,ОНМК,"""Стенокардия, ИБС, инфаркт миокарда""",Сердечная недостаточность,Прочие заболевания сердца
0,54-001-019-01,1,0,0,0,0
1,54-002-133-01,1,0,0,0,0
2,54-001-007-01,1,0,0,0,0
3,54-102-116-01,0,0,0,0,1
4,54-502-005-02,1,0,0,0,0
...,...,...,...,...,...,...
633,54-102-095-01,1,0,0,0,0
634,54-102-235-01,1,0,0,0,0
635,54-502-016-01,1,0,0,0,0
636,54-002-138-01,0,0,0,0,0
