In [1]:
import pandas
pandas.options.mode.chained_assignment = None
import sys
import numpy as np
from globsML.utils.preprocessing import rescale_data, select_sources, create_data_dict, replace_NaN
from globsML.models.skmodels import StandardClassifier

# create seeds randomly
np.random.seed(1231245)
seeds = list(map(int, np.random.random(10)*12345678))
assert(len(set(seeds))==len(seeds))

In [2]:
# choose method
method = 'catboost'
final_df = pandas.DataFrame()

# run experiment for all random splits
for seed in seeds:
    # load data
    data_path = '../data/ACS_sources_original.csv'
    data = pandas.read_csv(data_path)

    # create data split
    df_train, dfs_test = select_sources(data, random_seed = seed)
    df_train, dfs_test = replace_NaN(df_train, dfs_test)
    df = create_data_dict(data, df_train, dfs_test)
    df, _ = rescale_data(df)

    input_dim = len(df['train']['inputs'][0])
    
    # create and fit model
    model = StandardClassifier(method=method, data=df)
    # evaluate model on test data
    model.test()
    final_df = final_df.append(model.stats_all)
final_df['seeds'] = seeds
final_df.to_csv('results_{}.csv'.format(method))

Number of sources in training split after dropping rows with NaN as CI/m/color: 66148
1795 sources have been dropped.
NaN values in testing data have been replaced with the correspnding median value observed in the training split


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 3018.38it/s]

eccentricity will not be transformed. Skipped.
eccentricity_z will not be transformed. Skipped.
Using default parameters: {'iterations': 100, 'random_seed': 63, 'learning_rate': 0.5}
Loading data...
Fitting model...



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return auc(x,y)/(2*auc([0,x[-1]], [0,y[-1]])), x, y


Number of sources in training split after dropping rows with NaN as CI/m/color: 66142
1801 sources have been dropped.
NaN values in testing data have been replaced with the correspnding median value observed in the training split


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 2786.15it/s]

eccentricity will not be transformed. Skipped.
eccentricity_z will not be transformed. Skipped.
Using default parameters: {'iterations': 100, 'random_seed': 63, 'learning_rate': 0.5}
Loading data...
Fitting model...





Number of sources in training split after dropping rows with NaN as CI/m/color: 66159
1784 sources have been dropped.
NaN values in testing data have been replaced with the correspnding median value observed in the training split


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 2397.12it/s]

eccentricity will not be transformed. Skipped.
eccentricity_z will not be transformed. Skipped.
Using default parameters: {'iterations': 100, 'random_seed': 63, 'learning_rate': 0.5}
Loading data...
Fitting model...





Number of sources in training split after dropping rows with NaN as CI/m/color: 66147
1796 sources have been dropped.
NaN values in testing data have been replaced with the correspnding median value observed in the training split


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 2439.72it/s]

eccentricity will not be transformed. Skipped.
eccentricity_z will not be transformed. Skipped.
Using default parameters: {'iterations': 100, 'random_seed': 63, 'learning_rate': 0.5}
Loading data...
Fitting model...





Number of sources in training split after dropping rows with NaN as CI/m/color: 66167
1776 sources have been dropped.
NaN values in testing data have been replaced with the correspnding median value observed in the training split


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 2706.66it/s]

eccentricity will not be transformed. Skipped.
eccentricity_z will not be transformed. Skipped.
Using default parameters: {'iterations': 100, 'random_seed': 63, 'learning_rate': 0.5}
Loading data...
Fitting model...





Number of sources in training split after dropping rows with NaN as CI/m/color: 66198
1745 sources have been dropped.
NaN values in testing data have been replaced with the correspnding median value observed in the training split


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 2907.91it/s]

eccentricity will not be transformed. Skipped.
eccentricity_z will not be transformed. Skipped.
Using default parameters: {'iterations': 100, 'random_seed': 63, 'learning_rate': 0.5}
Loading data...
Fitting model...





Number of sources in training split after dropping rows with NaN as CI/m/color: 66163
1780 sources have been dropped.
NaN values in testing data have been replaced with the correspnding median value observed in the training split


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 2566.89it/s]

eccentricity will not be transformed. Skipped.
eccentricity_z will not be transformed. Skipped.
Using default parameters: {'iterations': 100, 'random_seed': 63, 'learning_rate': 0.5}
Loading data...
Fitting model...





Number of sources in training split after dropping rows with NaN as CI/m/color: 66129
1814 sources have been dropped.
NaN values in testing data have been replaced with the correspnding median value observed in the training split


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 2617.89it/s]

eccentricity will not be transformed. Skipped.
eccentricity_z will not be transformed. Skipped.
Using default parameters: {'iterations': 100, 'random_seed': 63, 'learning_rate': 0.5}
Loading data...
Fitting model...





Number of sources in training split after dropping rows with NaN as CI/m/color: 66151
1792 sources have been dropped.
NaN values in testing data have been replaced with the correspnding median value observed in the training split


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 2927.93it/s]

eccentricity will not be transformed. Skipped.
eccentricity_z will not be transformed. Skipped.
Using default parameters: {'iterations': 100, 'random_seed': 63, 'learning_rate': 0.5}
Loading data...
Fitting model...





Number of sources in training split after dropping rows with NaN as CI/m/color: 66170
1773 sources have been dropped.
NaN values in testing data have been replaced with the correspnding median value observed in the training split


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 2677.71it/s]

eccentricity will not be transformed. Skipped.
eccentricity_z will not be transformed. Skipped.
Using default parameters: {'iterations': 100, 'random_seed': 63, 'learning_rate': 0.5}
Loading data...
Fitting model...





In [3]:
loaded_df = pandas.read_csv('results_{}.csv'.format(method))
print(loaded_df.mean())
print(loaded_df.std())

Unnamed: 0      0.000000e+00
TPR             9.199993e-01
FDR             8.905949e-02
FPR             2.508506e-02
AUC(FDR,TPR)    9.634821e-01
AUC(FPR,TPR)    9.901422e-01
# found GCs     3.409800e+03
# total GCs     3.706200e+03
# fake GCs      3.332000e+02
# sources       1.698600e+04
seeds           6.561560e+06
dtype: float64
Unnamed: 0      0.000000e+00
TPR             6.273670e-03
FDR             7.311637e-03
FPR             1.836246e-03
AUC(FDR,TPR)    2.631459e-03
AUC(FPR,TPR)    8.803380e-04
# found GCs     6.209455e+01
# total GCs     5.527065e+01
# fake GCs      2.558124e+01
# sources       0.000000e+00
seeds           4.151769e+06
dtype: float64


  print(loaded_df.mean())
  print(loaded_df.std())
