In [0]:
%%capture
!pip3 install ijson
!pip3 install json
!pip3 install send2trash
from google.colab import drive, files
drive.mount('/content/gdrive', force_remount=True)


In [2]:
import sys
from pathlib import Path

import keras
from keras.layers import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.model_selection
import sklearn.preprocessing
import tensorflow
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

print("TensorFlow version:", tensorflow.__version__)

# Compact array printing: wide lines, many edge items, 3 significant digits for floats.
np.set_printoptions(edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: "%.3g" % x))

# True once `google.colab` has been imported, which the setup cell does on Colab.
GOOGLE_COLAB = "google.colab" in sys.modules
if GOOGLE_COLAB:
    # Make the project modules stored on the mounted Drive importable.
    sys.path.append("./gdrive/My Drive/Colab Notebooks/solar_flares")
    plt.style.use("default")
    config = tensorflow.ConfigProto(device_count={"GPU": 1})
else:
    # Local run: cap TensorFlow's thread pools at 8 threads each.
    config = tensorflow.ConfigProto(intra_op_parallelism_threads=8, inter_op_parallelism_threads=8)

# Register a single session with Keras through the same public entry point on
# both platforms; only the ConfigProto differs between the branches above.
keras.backend.set_session(tensorflow.Session(config=config))

# Project-local modules. These must come after the sys.path tweak above,
# otherwise they cannot be resolved on Colab.
from conv.conv_model import *
from reading_data import load_npz_file, preprocess_data
from plotting import plot_losses

Using TensorFlow backend.


TensorFlow version: 1.13.1


In [4]:
if __name__ == '__main__':
    # ------------------------------------------------------------------
    # Paths
    # ------------------------------------------------------------------
    # NOTE(review): machine-specific absolute path; consider making it configurable.
    ROOTDIR = "/Users/mag/Google Drive/Colab Notebooks/solar_flares/"
    DATADIR = "input/npz"
    OUTDIR = "output"
    DATANAME = "conv"
    TAG = None  # optional suffix used to distinguish experiment artifacts

    if GOOGLE_COLAB:
        # Re-root the project directory onto the mounted Drive.
        ROOTDIR = "./gdrive/My Drive/Colab Notebooks" + str(ROOTDIR).split("Colab Notebooks")[-1]

    rootdir = Path(ROOTDIR)
    datadir = rootdir.joinpath(DATADIR)
    outdir = rootdir.joinpath(OUTDIR)

    # ------------------------------------------------------------------
    # Run switches and hyper-parameters
    # ------------------------------------------------------------------
    TRAIN = True
    NEW_MODEL = True
    TEST = False      # True: quick run on the small dataset with a random split
    BALANCED = True   # True: use the pre-balanced training file

    CALLBACK_TIMEOUT = 10
    N_TIMESTEPS = 60  # Change if Variable length
    SCALER = sklearn.preprocessing.maxabs_scale
    STATIONARY = False
    SEED = 42         # fixes the TEST-mode train/validation split
    PERCENTAGE = 100
    BATCH_SIZE = 128
    if TEST:
        model_name = "{}_test_model.h5".format(DATANAME)
        EPOCHS = 100
    else:
        model_name = "{}_best_model.h5".format(DATANAME)
        EPOCHS = 25

    # Apply the tag BEFORE the model is looked up, so that the file name passed
    # to get_model() agrees with the names used by the callbacks and plots.
    if TAG is not None:
        DATANAME += "_" + TAG
        model_name = model_name.replace("best_model", TAG + "_best_model")

    # ------------------------------------------------------------------
    # Loading data
    # ------------------------------------------------------------------
    if TEST:
        X, y = load_npz_file(datadir / 'small_processed.npz')
        y = keras.utils.to_categorical(y, num_classes=2)

        # Seeded so that repeated runs evaluate on the same held-out 10 %.
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X, y,
            train_size=0.9,
            random_state=SEED,
        )
        # Drop the references to the unsplit arrays to free memory.
        X = None
        y = None
    else:
        if BALANCED:
            print('Using balanced data')
            X_train, y_train = load_npz_file(datadir / 'fold1-2Training_balanced.npz')
        else:
            print('Using unbalanced data')
            # NOTE(review): these files lack the `_processed` suffix carried by
            # the validation file below - confirm both get the same preprocessing.
            X1, y1 = load_npz_file(datadir / 'fold1Training.npz')
            X2, y2 = load_npz_file(datadir / 'fold2Training.npz')

            X_train = np.concatenate([X1, X2])
            y_train = np.concatenate([y1, y2])

            # Drop the references to the per-fold arrays to free memory.
            X1 = None
            y1 = None
            X2 = None
            y2 = None

        X_test, y_test = load_npz_file(datadir / 'fold3Training_processed.npz')

        y_test = keras.utils.to_categorical(y_test, num_classes=2)
        y_train = keras.utils.to_categorical(y_train, num_classes=2)

    print('Data loaded.')

    # ------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------
    model = get_model(
        n_features=X_train.shape[-1],
        train=TRAIN,
        new_model=NEW_MODEL,
        model_name=model_name,
        model_path=outdir,
        google_colab=GOOGLE_COLAB,
    )

    if TRAIN:
        callbacks = generate_callbacks(patience=CALLBACK_TIMEOUT, outdir=outdir, name=DATANAME)

        # NOTE(review): (X_test, y_test) serves as validation data here and is
        # also what the evaluation cell scores, so the reported F1 is not a
        # truly held-out estimate.
        model.fit(
            x=X_train,
            y=y_train,
            validation_data=(X_test, y_test),
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            callbacks=callbacks,
        )
        plot_losses(logpath=outdir, outdir=outdir, name=DATANAME)

        # Disabled: export a CPU-compatible copy of a model trained on Colab's GPU.
        # if GOOGLE_COLAB:
        #     print("Converted model from GPU to CPU-compatible")
        #     cpu_model = create_model(google_colab=False, n_features=X_train.shape[-1])
        #     gpu_model_to_cpu(
        #         trained_gpu_model=model, untrained_cpu_model=cpu_model, outdir=outdir,
        #         modelname=model_name.replace('.h5', '_cpu.h5')
        #     )



Using balanced data
Data loaded.
Created new model.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 60, 25)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 60, 256)           51456     
_________________________________________________________________
batch_normalization_1 (Batch (None, 60, 256)           1024      
_________________________________________________________________
activation_1 (Activation)    (None, 60, 256)           0         
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 60, 256)           0         
___________________________________

In [5]:
    print("Evaluating...")

    # Score the validation split that the training cell left in X_test / y_test.
    y_pred = model.predict(X_test)

    # Reduce the one-hot targets and the two-column model output to class ids.
    y_true = y_test.argmax(axis=1)
    y_labels = y_pred.argmax(axis=1)

    # Count of class-1 entries in the targets, then in the predictions; a large
    # gap between the two is a quick hint that the classifier is degenerate.
    print(y_true.sum(), y_labels.sum(), sep="\n")

    print(
        "F1 score: {}".format(
            f1_score(y_true, y_labels)
        )
    )




Evaluating...
4770
27006
F1 score: 0.300226586102719


In [0]:
# Sanity check: targets and predictions are expected to share the same shape.
print(y_test.shape, y_pred.shape, sep="\n")

(27006, 2)
(27006, 2)


In [0]:
    # Load the unlabeled test set; the second value returned by the loader is ignored.
    # NOTE(review): this file is already named `_processed`, yet preprocess_data()
    # is applied on top of it, whereas the training cell feeds
    # 'fold3Training_processed.npz' to the model without that extra step.
    # Confirm the two inputs really go through the same pipeline.
    X_test, _ = load_npz_file(datadir.joinpath('testSet_processed.npz'))
    X_test = preprocess_data(X_test)
    print('Loaded')

    # Predict, then keep the index of the highest-scoring class for each sample.
    y_pred = model.predict(X_test)
    y_labels = y_pred.argmax(axis=1)

    # The submission file is named after the model file, with `.csv` in place of `.h5`.
    CSVNAME = model_name.replace('.h5','') + '.csv'

    # Ids are 1-based and follow the row order of the test set.
    ids = np.arange(len(y_pred), dtype=int) + 1
    df = pd.DataFrame({'Id':ids,'ClassLabel':y_labels})
    df.to_csv(outdir.joinpath(CSVNAME), index=False)
    print(df.head())

Loaded
   Id  ClassLabel
0   1           0
1   2           0
2   3           0
3   4           0
4   5           0
