In [1]:
import torch
import json
from PIL import Image
from model import VGG16
from torchvision.transforms import Compose, Resize, ToTensor
from torch.nn.functional import softmax
import warnings
warnings.simplefilter("ignore", Warning)
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import os
from sklearn.metrics import confusion_matrix
from datetime import timedelta
# from evaluation import sklearn_Compatible_preds_and_targets

In [2]:
class MyJP2Dataset(Dataset):
    """Image dataset driven by a CSV annotation file.

    Column 0 of the CSV is read as the image path relative to ``root_dir`` and
    column 1 as the integer class label.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """Read the annotation table and remember where the images live.

        Args:
            csv_file: Path of the annotation CSV (loaded with ``pd.read_csv``).
            root_dir: Directory that each relative image path is joined onto.
            transform: Optional callable applied to the loaded RGB image.
        """
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __getitem__(self, index):
        """Return ``(image, label, image_path)`` for annotation row ``index``.

        ``image`` is the transformed image when a transform was supplied and
        the plain RGB PIL image otherwise; ``label`` is a 0-dim integer tensor.
        """
        img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        image = Image.open(img_path).convert('RGB')

        # `image` is bound before the transform check so that the default
        # transform=None no longer ends in an UnboundLocalError at the return.
        if self.transform is not None:
            image = self.transform(image)

        y_label = torch.tensor(int(self.annotations.iloc[index, 1]))

        return (image, y_label, img_path)

    def __len__(self):
        """Number of annotated images."""
        return len(self.annotations)

In [3]:
# Dataset configuration: target image size, image root directory and the
# validation label file of each fold.
im_size = 512
datapath = '/scratch/cpandey1/hmi_jpgs_512/'
partition1_path = '../data_labeling/data_labels/Fold1_val.csv'
partition2_path = '../data_labeling/data_labels/Fold2_val.csv'
partition3_path = '../data_labeling/data_labels/Fold3_val.csv'
partition4_path = '../data_labeling/data_labels/Fold4_val.csv'

# Preprocessing shared by every fold: resize, then convert to a tensor.
transformations = Compose([Resize(im_size), ToTensor()])

# One validation dataset per fold, all built with the same root and transform.
part1, part2, part3, part4 = (
    MyJP2Dataset(csv_file=fold_csv, root_dir=datapath, transform=transformations)
    for fold_csv in (partition1_path, partition2_path, partition3_path, partition4_path)
)

In [4]:
# One loader per fold; shuffle stays off so the prediction order follows the
# row order of each fold's label file.
part1_loader, part2_loader, part3_loader, part4_loader = (
    DataLoader(dataset=fold_dataset, batch_size=48, num_workers=4, shuffle=False)
    for fold_dataset in (part1, part2, part3, part4)
)

In [5]:
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# notebook no longer fails on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_PATH1 = '../create_models/trained_models/fold1/fold1.pth'
model_PATH2 = '../create_models/trained_models/fold2/fold2.pth'
model_PATH3 = '../create_models/trained_models/fold3/fold3.pth'
model_PATH4 = '../create_models/trained_models/fold4/fold4.pth'
# map_location remaps the stored tensors onto `device`, so a checkpoint loads
# even when the device it was saved from does not exist on this machine.
weights1 = torch.load(model_PATH1, map_location=device)
weights2 = torch.load(model_PATH2, map_location=device)
weights3 = torch.load(model_PATH3, map_location=device)
weights4 = torch.load(model_PATH4, map_location=device)
# A single model instance; each fold's weights are loaded into it in turn.
test_model = VGG16(ipt_size=(512, 512), train=False).to(device)

In [6]:
def sklearn_Compatible_preds_and_targets(model_prediction_list, model_target_list, model_path_list):
    """Flatten per-batch outputs into three flat Python lists.

    Args:
        model_prediction_list: One tensor of predictions per batch.
        model_target_list: One tensor of targets per batch.
        model_path_list: One list of image paths per batch.

    Returns:
        ``(preds, tgts, path)``: all predictions, all targets and all paths,
        concatenated in batch order.
    """
    # reshape(-1) always yields a 1-D tensor, so a batch holding a single
    # sample still converts to a one-element list. The previous
    # squeeze().tolist() collapsed such a batch to a bare scalar and the
    # flattening step then raised TypeError.
    preds = [item for batch in model_prediction_list for item in batch.reshape(-1).tolist()]
    tgts = [item for batch in model_target_list for item in batch.reshape(-1).tolist()]
    path = [item for batch in model_path_list for item in batch]
    return preds, tgts, path


def accuracy_score(prediction, target):
    """Compute the TSS and HSS2 skill scores for binary 0/1 predictions.

    Prints the confusion-matrix counts as a side effect.

    Args:
        prediction: Predicted labels (0 or 1).
        target: True labels (0 or 1).

    Returns:
        ``(TSS, HSS)``. A rate or score whose denominator is zero is reported
        as 0.
    """
    # Pinning the labels keeps the matrix 2x2 even when only one class occurs
    # in the data; otherwise ravel() yields a single value and the four-way
    # unpacking raises ValueError.
    TN, FP, FN, TP = confusion_matrix(target, prediction, labels=[0, 1]).ravel()
    print("TP: ", TP, "FP: ", FP, "TN: ", TN, "FN: ", FN)

    # TSS = true-positive rate (recall) minus false-positive rate
    tp_rate = TP / float(TP + FN) if TP > 0 else 0
    fp_rate = FP / float(FP + TN) if FP > 0 else 0
    TSS = tp_rate - fp_rate

    # HSS2 Computation
    N = TN + FP
    P = TP + FN
    denominator = float(P * (FN + TN) + (TP + FP) * N)
    # The denominator is zero for degenerate inputs (e.g. everything is a true
    # negative); report 0.0 instead of dividing by zero.
    HSS = (2 * (TP * TN - FN * FP)) / denominator if denominator > 0 else 0.0

    return TSS, HSS

In [7]:
def predict(checkpoint, test_loader, desc):
    """Score one fold with the shared ``test_model`` and tabulate the results.

    Loads ``checkpoint['model_state_dict']`` into the module-level
    ``test_model``, runs every batch of ``test_loader`` through it without
    gradients, prints the skill scores at a 0.5 threshold and the target
    counts, and returns the per-image probabilities.

    Args:
        checkpoint: Mapping holding the weights under ``'model_state_dict'``.
        test_loader: Iterable yielding ``(images, targets, paths)`` batches.
        desc: Heading printed above the fold's results.

    Returns:
        DataFrame with columns ``timestamp`` (the image path as yielded by the
        loader), ``flare_prob`` (softmax output of column 1) and ``target``.
    """
    test_model.load_state_dict(checkpoint['model_state_dict'])
    test_model.eval()
    print('***********************', desc, '*************************')

    batch_probs = []
    batch_targets = []
    batch_paths = []
    with torch.no_grad():
        for images, labels, paths in test_loader:
            # Move the batch onto the model's device
            images = images.to(device=device)
            labels = labels.to(device=device)
            batch_targets.append(labels)
            batch_paths.append(list(paths))

            # Forward pass, then keep the softmax output of column 1
            scores = test_model(images)
            class_probs = softmax(scores, dim=1)
            batch_probs.append(class_probs[:, 1])

            # Drop per-batch references before the next batch is loaded
            del images, labels, scores, class_probs

    flare_probs, targets, image_paths = sklearn_Compatible_preds_and_targets(
        batch_probs, batch_targets, batch_paths
    )
    hard_preds = [int(prob >= 0.5) for prob in flare_probs]
    print(accuracy_score(hard_preds, targets))

    prob_list = pd.DataFrame({
        'timestamp': image_paths,
        'flare_prob': flare_probs,
        'target': targets,
    })
    print(prob_list['target'].value_counts())
    return prob_list

In [8]:
# Score every fold's validation set with the weights of that fold's model.
fold1, fold2, fold3, fold4 = [
    predict(fold_weights, fold_loader, fold_title)
    for fold_weights, fold_loader, fold_title in (
        (weights1, part1_loader, 'Fold-1 Results'),
        (weights2, part2_loader, 'Fold-2 Results'),
        (weights3, part3_loader, 'Fold-3 Results'),
        (weights4, part4_loader, 'Fold-4 Results'),
    )
]

*********************** Fold-1 Results *************************
TP:  1777 FP:  2319 TN:  10135 FN:  557
(0.5751486636202545, 0.44014573119868605)
0    12454
1     2334
Name: target, dtype: int64
*********************** Fold-2 Results *************************
TP:  1237 FP:  3278 TN:  10577 FN:  375
(0.5307764394253492, 0.295586748222468)
0    13855
1     1612
Name: target, dtype: int64
*********************** Fold-3 Results *************************
TP:  1335 FP:  2581 TN:  11727 FN:  1029
(0.3843322183890593, 0.30166847919515566)
0    14308
1     2364
Name: target, dtype: int64
*********************** Fold-4 Results *************************
TP:  2073 FP:  3373 TN:  10659 FN:  617
(0.530252836850873, 0.3749851523238552)
0    14032
1     2690
Name: target, dtype: int64


In [11]:
# Persist each fold's per-image predictions with the same column layout.
for fold_frame, fold_csv in (
    (fold1, r'../create_models/prediction_results/fold1_res.csv'),
    (fold2, r'../create_models/prediction_results/fold2_res.csv'),
    (fold3, r'../create_models/prediction_results/fold3_res.csv'),
    (fold4, r'../create_models/prediction_results/fold4_res.csv'),
):
    fold_frame.to_csv(fold_csv, index=False, header=True, columns=['timestamp', 'flare_prob', 'target'])

In [12]:
# Reload the saved per-fold predictions and stack them into one table.
fold1_val = pd.read_csv(r'../create_models/prediction_results/fold1_res.csv')
fold2_val = pd.read_csv(r'../create_models/prediction_results/fold2_res.csv')
fold3_val = pd.read_csv(r'../create_models/prediction_results/fold3_res.csv')
fold4_val = pd.read_csv(r'../create_models/prediction_results/fold4_res.csv')
total = pd.concat([fold1_val, fold2_val, fold3_val, fold4_val])
# Pull the 'YYYY.MM.DD_HH.MM.SS' stamp out of each image path by pattern.
# The earlier fixed slice [47:-4] only worked for one exact length of the
# image root directory and broke as soon as that path changed.
total['timestamp'] = total['timestamp'].str.extract(
    r'(\d{4}\.\d{2}\.\d{2}_\d{2}\.\d{2}\.\d{2})', expand=False
)
# Fail loudly rather than letting unparseable paths turn into NaT below.
assert total['timestamp'].notna().all(), "image path without a YYYY.MM.DD_HH.MM.SS timestamp"
total['timestamp'] = pd.to_datetime(total['timestamp'], format='%Y.%m.%d_%H.%M.%S')
# Keep each row's within-fold position as an 'index' column.
total = total.reset_index()

In [13]:
# Flare metadata table; characters 16..-4 of each 'label' entry are parsed as
# the observation timestamp, after which the 'label' column is dropped.
details = pd.read_csv('M_full_dataset_cleaned_1_hours_with_loc_and_time_new.csv')
details = details.assign(
    timestamp=pd.to_datetime(
        details['label'].map(lambda label: label[16:-4]),
        format='%Y.%m.%d_%H.%M.%S',
    )
).drop(columns=['label'])

# Attach the metadata to every prediction row (left join on timestamp).
df = pd.merge(total, details, how='left', on='timestamp')
df

Unnamed: 0,index,timestamp,flare_prob,target,goes_class,fl_location,flare_start
0,0,2011-01-01 00:00:00,0.171418,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
1,1,2011-01-01 01:00:00,0.213330,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
2,2,2011-01-01 02:00:00,0.201592,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
3,3,2011-01-01 03:00:00,0.237191,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
4,4,2011-01-01 04:00:00,0.204492,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
...,...,...,...,...,...,...,...
63644,16717,2018-12-30 19:00:00,0.018860,0,NF,unk,unk
63645,16718,2018-12-30 20:00:00,0.019474,0,NF,unk,unk
63646,16719,2018-12-30 21:00:00,0.019670,0,NF,unk,unk
63647,16720,2018-12-30 22:00:00,0.019339,0,NF,unk,unk


In [14]:
def date_to_filename(df):
    """Return a copy of ``df`` whose ``timestamp`` column holds image paths.

    Each timestamp is rendered as
    ``YYYY/MM/DD/HMI.mYYYY.MM.DD_HH.MM.SS.jpg``.

    Args:
        df: DataFrame with a ``timestamp`` column of datetimes, or of strings
            in ``%Y-%m-%d %H:%M:%S`` form.

    Returns:
        A new DataFrame. The input frame is left untouched; earlier versions
        overwrote the caller's ``timestamp`` column in place.
    """
    renamed = df.copy()
    stamps = pd.to_datetime(renamed['timestamp'], format='%Y-%m-%d %H:%M:%S')
    # Only %-prefixed fields are substituted; 'HMI.m' and '.jpg' are literals.
    renamed['timestamp'] = stamps.dt.strftime('%Y/%m/%d/HMI.m%Y.%m.%d_%H.%M.%S.jpg')
    return renamed

In [15]:
def location_analysis(df, flareclass, threshold=0.5, limit=70):
    """Print detected/missed counts for one GOES class, split by location.

    Rows whose ``goes_class`` starts with ``flareclass`` are selected. A row
    counts as detected (printed as TP) when ``flare_prob >= threshold`` and as
    missed (printed as FN) when it is below. A location is central when both
    coordinates lie within ``[-limit, limit]`` and limb when at least one
    coordinate lies outside that range.

    Args:
        df: DataFrame with ``goes_class``, ``fl_location`` (strings of the
            form ``"(x, y)"``) and ``flare_prob`` columns.
        flareclass: Prefix of the GOES class to analyse, e.g. ``'X'``.
        threshold: Probability at or above which a row counts as detected.
        limit: Absolute coordinate value that bounds the central region.

    Returns:
        None; the counts are printed.
    """
    # na=False keeps rows with a missing class out of the selection instead of
    # producing a NaN mask that boolean indexing rejects.
    X = df.loc[df.goes_class.str.startswith(flareclass, na=False)].copy()

    # "(x, y)" -> two float columns; reindex guarantees that both columns
    # exist even when no row matched.
    coords = (
        X["fl_location"]
        .str.strip("[()]")
        .str.split(",", expand=True)
        .reindex(columns=[0, 1])
    )
    X["x"] = coords[0].astype(float).round(decimals=2)
    X["y"] = coords[1].astype(float)

    detected = X.flare_prob >= threshold
    missed = X.flare_prob < threshold
    central = X.x.between(-limit, limit) & X.y.between(-limit, limit)
    # One mask for "outside on x OR y": a flare that is outside on both axes
    # is counted once. Adding the x-count and the y-count separately, as was
    # done before, counted such a flare twice.
    limb = (X.x.abs() > limit) | (X.y.abs() > limit)

    pos_center = int((detected & central).sum())
    neg_center = int((missed & central).sum())
    pos_limb = int((detected & limb).sum())
    neg_limb = int((missed & limb).sum())

    print(f'*************{flareclass} Class flares locations***************')
    print("Total Instances: ", len(X))
    print('With in Central Locations')
    print('TP: ', pos_center, 'FN: ', neg_center)
    print('Beyond Central Locations (Limb Locations)')
    print('TP: ', pos_limb, 'FN: ', neg_limb, '\n')
    
# Report the location breakdown for X- and M-class flares; each call gets its
# own copy of the merged table.
for flare_class in ('X', 'M'):
    location_analysis(df.copy(), flare_class)

*************X Class flares locations***************
Total Instances:  880
With in Central Locations
TP:  597 FN:  71
Beyond Central Locations (Limb Locations)
TP:  164 FN:  48 

*************M Class flares locations***************
Total Instances:  8120
With in Central Locations
TP:  4464 FN:  1366
Beyond Central Locations (Limb Locations)
TP:  1197 FN:  1093 



In [16]:

# Turn timestamps back into image file names, then export one table per GOES
# class (C-class rows only when the predicted probability is at least 0.8).
new_df = date_to_filename(df)
cols = ['timestamp', 'flare_prob', 'goes_class', 'fl_location', 'flare_start']

df_x = new_df.loc[new_df.goes_class.str.startswith('X')]
df_m = new_df.loc[new_df.goes_class.str.startswith('M')]
df_c = new_df.loc[new_df.goes_class.str.startswith('C') & (new_df.flare_prob >= 0.8)]

for class_frame, class_csv in ((df_x, r'x_class.csv'), (df_m, r'm_class.csv'), (df_c, r'c_class.csv')):
    class_frame.to_csv(class_csv, index=False, header=True, columns=cols)