In [None]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import flowkit as fk
import gc
from sklearn.metrics import mean_squared_error, roc_auc_score

from xgboost import XGBClassifier, XGBRFRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from lightgbm import LGBMModel,LGBMClassifier

# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# Read the merged cognitive-score CSV and return a frame without the
# 'npnote' column. The bare name on the last line renders the frame as the
# cell output.
cog_score = (
    pd.read_csv('/home/chengstark/Dev/brain-flow-data/BRAIN NP+WCST_Merged 2020-0528.csv')
    .drop(columns='npnote')
)
cog_score

In [None]:
# Reduce the score table to the columns used downstream, then keep the
# 'subject' column as a plain array for membership checks.
processed_cog_score = cog_score[[
    'subject',
    'GDS',
    'GDS_impair',
    'agey_np',
    'edu_np',
    'race_np',
    'gen_np',
]]
recorded_subjects = processed_cog_score.loc[:, 'subject'].values

In [None]:
# Render the column-subset score table as this cell's output.
processed_cog_score

In [None]:
# ---------------------------------------------------------------------------
# Build 5-fold train / validation arrays from the per-file .npy inputs.
#
# For every fold:
#   * each input file contributes `subsample_count` random row-subsamples of
#     `min_sample_size` rows, labelled with the subject's 'GDS_impair' value,
#     'GDS' score and the covariates in `covar_columns`;
#   * the training split is down-sampled so label 0 and label 1 have the same
#     number of subsamples (rows whose label is neither 0 nor 1 are dropped);
#   * the validation split is saved as-is (no balancing).
#
# Relies on `processed_cog_score` and `recorded_subjects` from earlier cells.
#
# NOTE(review): folds are split by *file*. If one subject can have more than
# one file in `npy_path`, that subject could end up in both train and val —
# confirm there is exactly one file per subject.
# ---------------------------------------------------------------------------
npy_path = 'viable_npy/'
out_dir = 'data_folds_raw/'
subsample_count = 20
min_sample_size = 1000
covar_columns = ['agey_np', 'edu_np']


def _build_subsamples(file_names, seed_offset=0, log_prefix=''):
    """Draw ``subsample_count`` row-subsamples from each file in ``file_names``.

    Parameters
    ----------
    file_names : list of str
        File names inside ``npy_path``. The text before the first space is
        parsed as the integer subject id.
    seed_offset : int
        Added to the subsample index before seeding the global NumPy RNG, so
        different splits can use disjoint seed ranges.
    log_prefix : str
        Text prepended to the "no cog score" message printed for skipped files.

    Returns
    -------
    tuple of numpy.ndarray
        ``(xs, ys, scores, covars)`` with one entry per generated subsample.

    Raises
    ------
    ValueError
        If a file name does not start with an integer, or a file holds fewer
        than ``min_sample_size`` rows (sampling is without replacement).
    """
    xs, ys, scores, covars = [], [], [], []

    for f in file_names:
        subject = int(f.split(' ')[0])
        if subject not in recorded_subjects:
            print('{}Subject {} no cog score'.format(log_prefix, subject))
            continue

        x = np.load(os.path.join(npy_path, f))
        if x.shape[0] < min_sample_size:
            # np.random.choice(..., replace=False) would raise anyway; fail
            # with a message that names the offending file instead.
            raise ValueError(
                '{} has {} rows; at least {} are required for subsampling'.format(
                    f, x.shape[0], min_sample_size))

        # Look the subject up once instead of re-filtering the table per column.
        subject_rows = processed_cog_score.loc[processed_cog_score['subject'] == subject]
        y = subject_rows['GDS_impair'].values[0]
        score = subject_rows['GDS'].values[0]
        covar = [subject_rows[cvr].values[0] for cvr in covar_columns]

        for i in range(subsample_count):
            # Reseed per subsample so each draw is reproducible on its own.
            np.random.seed(i + seed_offset)
            subsample_idx = np.random.choice(np.arange(x.shape[0]), min_sample_size, replace=False)
            xs.append(x[subsample_idx])
            ys.append(y)
            scores.append(score)
            covars.append(covar)

    return np.asarray(xs), np.asarray(ys), np.asarray(scores), np.asarray(covars)


# os.listdir returns entries in arbitrary order: list the directory once and
# sort it so that fold membership is reproducible for a fixed random_state.
npy_files = sorted(os.listdir(npy_path))

# np.save does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

kf = KFold(n_splits=5, shuffle=True, random_state=1)
pbar = tqdm(enumerate(kf.split(npy_files)), total=kf.get_n_splits())
for fidx, (train_idx, val_idx) in pbar:
    print(fidx)

    train_f = [npy_files[i] for i in train_idx]
    val_f = [npy_files[i] for i in val_idx]

    # ---------------------------- training split ----------------------------
    xs_train, ys_train, scores_train, covars_train = _build_subsamples(
        train_f, seed_offset=0, log_prefix='{} '.format(fidx))

    # Balance the two classes by keeping `clip` random subsamples of each.
    # Working on index arrays avoids materialising per-class copies of the
    # (potentially large) feature array.
    neg_idx = np.flatnonzero(ys_train == 0)
    pos_idx = np.flatnonzero(ys_train == 1)
    clip = min(neg_idx.shape[0], pos_idx.shape[0])

    # Positives are drawn before negatives; the order matters because both
    # draws consume the same global RNG stream.
    pos_keep = np.random.choice(pos_idx, clip, replace=False)
    neg_keep = np.random.choice(neg_idx, clip, replace=False)
    keep = np.concatenate((neg_keep, pos_keep))  # negatives first, then positives

    xs_train = xs_train[keep]
    ys_train = ys_train[keep]
    scores_train = scores_train[keep]
    covars_train = covars_train[keep]

    np.save(os.path.join(out_dir, 'X_train_{}.npy'.format(fidx)), xs_train)
    np.save(os.path.join(out_dir, 'y_train_{}.npy'.format(fidx)), ys_train)
    np.save(os.path.join(out_dir, 'score_train_{}.npy'.format(fidx)), scores_train)
    np.save(os.path.join(out_dir, 'covar_train_{}.npy'.format(fidx)), covars_train)

    print('train shapes: {} {} {} {}'.format(xs_train.shape, ys_train.shape, scores_train.shape, covars_train.shape))

    # Release the training arrays before building the validation arrays.
    del xs_train, ys_train, scores_train, covars_train, neg_idx, pos_idx, neg_keep, pos_keep, keep
    gc.collect()

    # --------------------------- validation split ---------------------------
    # Seeds are offset by `subsample_count` so validation draws use a seed
    # range disjoint from the training draws.
    xs_val, ys_val, scores_val, covars_val = _build_subsamples(
        val_f, seed_offset=subsample_count)

    print('val shapes: {} {} {} {}'.format(xs_val.shape, ys_val.shape, scores_val.shape, covars_val.shape))

    np.save(os.path.join(out_dir, 'X_val_{}.npy'.format(fidx)), xs_val)
    np.save(os.path.join(out_dir, 'y_val_{}.npy'.format(fidx)), ys_val)
    np.save(os.path.join(out_dir, 'score_val_{}.npy'.format(fidx)), scores_val)
    np.save(os.path.join(out_dir, 'covar_val_{}.npy'.format(fidx)), covars_val)

    pbar.set_description('Created fold {} save file\n'.format(fidx))

    del xs_val, ys_val, scores_val, covars_val
    gc.collect()