In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
def get_indices(index, split_dir='split/'):
    """Load the train and validation row indices stored for split ``index``.

    Two JSON files are read from ``split_dir``: ``train<index>.json`` and
    ``val<index>.json``. Each file's decoded content is wrapped in a NumPy
    array.

    Parameters
    ----------
    index : int or str
        Identifier of the split; it is formatted into the file names.
    split_dir : str or os.PathLike, default 'split/'
        Directory that holds the split files. The default keeps the
        location that used to be hard-coded.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(train_indices, val_indices)``.

    Raises
    ------
    FileNotFoundError
        If one of the two split files does not exist.
    json.JSONDecodeError
        If a split file is not valid JSON.
    """

    def load_file(name):
        # Path joining works whether or not ``split_dir`` ends with a separator.
        split_file = Path(split_dir) / f"{name}{index}.json"
        with open(split_file, encoding="utf-8") as f:
            return np.array(json.load(f))

    train_indices = load_file('train')
    val_indices = load_file('val')

    return train_indices, val_indices

In [None]:
# Labels of all training rows: column 'y' of y_train.csv as a NumPy array
# (the first CSV column is used as the index).
y_tot = (
    pd.read_csv('y_train.csv', index_col=0)
    .loc[:, 'y']
    .to_numpy()
)

In [None]:
# Raw feature matrices read from CSV (first column is the index), train first, then test.
X_train_hc_unprocessed, X_test_hc_unprocessed = (
    pd.read_csv(csv_name, index_col=0).to_numpy()
    for csv_name in ('train_features.csv', 'test_features.csv')
)

In [None]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2

# Replace NaN entries by 0 in both feature matrices (train first, then test).
X_train_hc_no_nan, X_test_hc_no_nan = (
    np.nan_to_num(raw_features, nan = 0)
    for raw_features in (X_train_hc_unprocessed, X_test_hc_unprocessed)
)

# Feature selection: VarianceThreshold with its default threshold, followed by
# SelectKBest keeping 240 columns (default scoring function).
# NOTE(review): the selector is fitted on every labelled row, including rows that
# are later used as validation folds, so validation scores may be optimistic -
# confirm this is acceptable.
pipe = Pipeline(steps=[
    ('variance', VarianceThreshold()),
    ('selector', SelectKBest(k=240)),
])

# Fit on the labelled data only, then apply the same column selection to the test set.
X_tot_hc = pipe.fit_transform(X_train_hc_no_nan, y_tot)
X_test_hc = pipe.transform(X_test_hc_no_nan)
print(X_tot_hc.shape, X_test_hc.shape)

In [None]:
def _load_learned_features(prefix, fold, train_indices, val_indices):
    """Load one learned feature set for ``fold`` and return its (train, val, test) parts."""
    features_tot = np.loadtxt(prefix + '_training_features' + str(fold) + '.txt', delimiter=',')
    features_train = features_tot[train_indices]
    features_val = features_tot[val_indices]
    features_test = np.loadtxt(prefix + '_test_features' + str(fold) + '.txt', delimiter=',')
    print(features_train.shape, features_val.shape, features_test.shape)

    return features_train, features_val, features_test


def get_prediction_estimator(fold, estimator, predict_test=False, predict_score=False, predict_proba_test=False):
    """Fit ``estimator`` on the combined features of one fold and score or predict.

    For the given fold, two learned feature sets are loaded from text files
    ('resnet_model/resnet_*' and 'paper_good_fellow/fella_ml_*') and
    concatenated column-wise with the module-level feature matrices
    ``X_tot_hc`` / ``X_test_hc``. Labels come from the module-level ``y_tot``
    and the row split from ``get_indices(fold)``.

    Exactly one action is performed; when several flags are set the first of
    ``predict_score``, ``predict_test``, ``predict_proba_test`` wins.

    Parameters
    ----------
    fold : int
        Fold number; it selects the split files and the per-fold feature files.
    estimator : object
        Model exposing ``fit`` and ``predict`` (and ``predict_proba`` when
        ``predict_proba_test`` is used). It is fitted in place.
    predict_test : bool, default False
        Fit on train + validation rows and return ``estimator.predict`` on the
        test features.
    predict_score : bool, default False
        Fit on the train rows, print the micro-averaged F1 score on the
        validation rows and return the fitted estimator.
    predict_proba_test : bool, default False
        Fit on train + validation rows and return ``estimator.predict_proba``
        on the test features.

    Returns
    -------
    object
        The fitted estimator (``predict_score``) or the test predictions /
        probabilities returned by the estimator.

    Raises
    ------
    ValueError
        If none of the three mode flags is set. This used to load all the
        data and then silently return ``None``.
    """
    # Fail fast, before any file is read, when there is nothing to do.
    if not (predict_score or predict_test or predict_proba_test):
        raise ValueError(
            'one of predict_score, predict_test or predict_proba_test must be set'
        )

    print('Creating estimator for fold:', fold)

    train_indices, val_indices = get_indices(fold)
    X_train_ml1, X_val_ml1, X_test_ml1 = _load_learned_features(
        'resnet_model/resnet', fold, train_indices, val_indices)
    X_train_ml2, X_val_ml2, X_test_ml2 = _load_learned_features(
        'paper_good_fellow/fella_ml', fold, train_indices, val_indices)

    X_train_hc = X_tot_hc[train_indices]
    X_val_hc = X_tot_hc[val_indices]

    # Column order is ml1 | ml2 | hc for all three matrices.
    X_train_combined = np.concatenate([X_train_ml1, X_train_ml2, X_train_hc], axis=1)
    X_val_combined = np.concatenate([X_val_ml1, X_val_ml2, X_val_hc], axis=1)
    X_test_combined = np.concatenate([X_test_ml1, X_test_ml2, X_test_hc], axis=1)

    y_train = y_tot[train_indices]
    y_val = y_tot[val_indices]

    if predict_score:
        estimator.fit(X_train_combined, y_train)
        prediction = estimator.predict(X_val_combined)
        print('got score:', f1_score(y_val, prediction, average='micro'))

        return estimator

    # Both test modes refit on every labelled row (train followed by validation).
    X_all = np.concatenate([X_train_combined, X_val_combined], axis=0)
    y_all = np.concatenate([y_train, y_val], axis=0)
    estimator.fit(X_all, y_all)

    if predict_test:
        return estimator.predict(X_test_combined)

    return estimator.predict_proba(X_test_combined)
    

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, StackingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.base import clone

import os
# NOTE(review): this is set after numpy, scikit-learn, xgboost and lightgbm have
# already been imported; OpenMP runtimes usually read the variable when they
# initialise, so confirm it still takes effect here. It also disagrees with
# num_threads=128 passed to LightGBM below.
os.environ['OMP_NUM_THREADS'] = "25"

# Base learners of the stack; all of them are seeded with random_state=0.
lgbm = LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.11,
    num_leaves=16,
    random_state=0,
    num_threads=128,
)
xgboost = XGBClassifier(
    n_estimators=2000,
    random_state=0,
    learning_rate=0.11,
    max_depth=16,
    alpha=0.2,
)
gradient = HistGradientBoostingClassifier(
    random_state=0,
    learning_rate=0.15,
    max_iter=400,
    max_leaf_nodes=31,
)
forest = RandomForestClassifier(n_estimators=2000, random_state=0, n_jobs=-1)

# Stacked ensemble over the four base learners; the final estimator and the
# internal cross-validation are left at the StackingClassifier defaults.
selected = [
    ('lgbm', lgbm),
    ('xgboost', xgboost),
    ('hist', gradient),
    ('forest', forest),
]
estimator = StackingClassifier(estimators=selected)

In [None]:
# Evaluate the stacked model on folds 0-4: each fold gets its own clone of
# `estimator`, which get_prediction_estimator fits on the training rows and
# scores on the validation rows (the score is printed, not collected).
for i in range(5):
    get_prediction_estimator(fold=i, estimator=clone(estimator), predict_score=True)

In [None]:
# One test-set prediction vector per fold (folds 0-4), each from a fresh clone
# of the stacked estimator.
predictions = [
    get_prediction_estimator(fold=i, estimator=clone(estimator), predict_test=True)
    for i in range(5)
]

# Rows are folds, columns are test samples.
df_pred = pd.DataFrame(predictions)

# Majority vote per test sample: the column-wise mode, taking the first row of
# the mode table when several values tie.
y_final_pred = (
    df_pred
    .mode(axis=0, numeric_only=True)
    .iloc[0]
    .to_numpy(dtype=np.float32)
)

In [None]:
# Fill the 'y' column of the sample submission with the voted predictions and
# write the result to current.csv without the DataFrame index.
df_submission = pd.read_csv('sample.csv').assign(y=y_final_pred)
df_submission.to_csv('current.csv', index=False)