In [1]:
import torch
import json
from PIL import Image
from modeling.attention_model import Attn_Net
from torchvision.transforms import Compose, Resize, ToTensor
from torch.nn.functional import softmax
import warnings
warnings.simplefilter("ignore", Warning)
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import os
from sklearn.metrics import confusion_matrix
from datetime import timedelta

In [2]:
class MyJP2Dataset(Dataset):
    """Image dataset driven by a CSV annotation file.

    Column 0 of the CSV is an image path relative to ``root_dir`` and
    column 1 is a label that is cast to ``int``.

    Args:
        csv_file: path of the annotation CSV (read with ``pd.read_csv``).
        root_dir: directory that the relative image paths are joined onto.
        transform: optional callable applied to the opened PIL image.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __getitem__(self, index):
        """Return ``(image, label_tensor, image_path)`` for row ``index``."""
        img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        hmi = Image.open(img_path)

        # Without a transform the opened image is returned as-is. Previously
        # `image` was only bound inside the `if`, so the default
        # transform=None raised UnboundLocalError here.
        image = self.transform(hmi) if self.transform is not None else hmi

        y_label = torch.tensor(int(self.annotations.iloc[index, 1]))

        return (image, y_label, img_path)

    def __len__(self):
        """Number of rows in the annotation CSV."""
        return len(self.annotations)

In [3]:
# Load Data
im_size = 256
datapath = '/scratch/cpandey1/hmi_jpgs_512/'

# One validation CSV per cross-validation fold.
partition1_path = '../data_labels/simplified_data_labels/Fold1_val.csv'
partition2_path = '../data_labels/simplified_data_labels/Fold2_val.csv'
partition3_path = '../data_labels/simplified_data_labels/Fold3_val.csv'
partition4_path = '../data_labels/simplified_data_labels/Fold4_val.csv'

# Resize to im_size, then convert the image to a tensor.
transformations = Compose([Resize(im_size), ToTensor()])

# Every fold shares the same image root and transform; only the CSV differs.
part1, part2, part3, part4 = [
    MyJP2Dataset(csv_file=fold_csv, root_dir=datapath, transform=transformations)
    for fold_csv in (partition1_path, partition2_path, partition3_path, partition4_path)
]

In [4]:
# Unshuffled loaders so predictions come back in CSV row order.
part1_loader, part2_loader, part3_loader, part4_loader = [
    DataLoader(dataset=fold_dataset, batch_size=24, num_workers=4, shuffle=False)
    for fold_dataset in (part1, part2, part3, part4)
]

In [5]:
# Prefer the GPU but fall back to CPU so the notebook still runs on a machine
# without CUDA (previously torch.device('cuda') was hard-coded).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): model_PATH1-3 are the same file and all four paths name
# "fold_1", so every fold's validation split is scored with a fold-1
# checkpoint. Confirm this is intended and not a copy-paste slip; the paths
# are left unchanged here because the real per-fold filenames are not visible.
model_PATH1 = '../modeling/trained_models/no_attention/fold_1_no_attn_.pth'
model_PATH2 = '../modeling/trained_models/no_attention/fold_1_no_attn_.pth'
model_PATH3 = '../modeling/trained_models/no_attention/fold_1_no_attn_.pth'
model_PATH4 = '../modeling/trained_models/no_attention/fold_1_no_attn.pth'

# map_location remaps stored tensors onto `device`, so a checkpoint saved on a
# different GPU index (or on GPU at all) still loads here.
weights1 = torch.load(model_PATH1, map_location=device)
weights2 = torch.load(model_PATH2, map_location=device)
weights3 = torch.load(model_PATH3, map_location=device)
weights4 = torch.load(model_PATH4, map_location=device)

# Single model instance; predict() loads each checkpoint's weights into it.
test_model = Attn_Net(im_size=im_size, num_classes=2, attention=False).to(device)

In [6]:
# Superseded by predict(), which loads a checkpoint into test_model before evaluating.
# test_model.load_state_dict(weights1['model_state_dict'])
# test_model.eval()

In [7]:
def sklearn_Compatible_preds_and_targets(model_prediction_list, model_target_list, model_path_list):
    """Flatten per-batch outputs into plain Python lists.

    Args:
        model_prediction_list: list of tensors, one per batch, of predictions.
        model_target_list: list of tensors, one per batch, of targets.
        model_path_list: list of per-batch sequences of file paths.

    Returns:
        (preds, tgts, path): three flat lists in batch order.
    """
    # reshape(-1) always yields a 1-D tensor. The previous squeeze() collapsed
    # a batch of size 1 to a 0-d tensor, whose tolist() is a bare scalar and
    # cannot be iterated when flattening.
    preds = [value for batch in model_prediction_list for value in batch.reshape(-1).tolist()]
    tgts = [value for batch in model_target_list for value in batch.reshape(-1).tolist()]
    path = [item for batch in model_path_list for item in batch]
    return preds, tgts, path


def accuracy_score(prediction, target):
    """Print the confusion-matrix counts and return the (TSS, HSS) skill scores.

    Args:
        prediction: predicted binary labels (0/1).
        target: true binary labels (0/1), same length as ``prediction``.

    Returns:
        ``(TSS, HSS)`` where
        TSS = TP / (TP + FN) - FP / (FP + TN) and
        HSS = 2 * (TP * TN - FN * FP) / (P * (FN + TN) + (TP + FP) * N),
        with P = TP + FN and N = TN + FP. HSS is 0.0 when its denominator
        is zero.
    """
    # Pin the label set so the matrix is always 2x2. Without it, input that
    # contains a single class yields a 1x1 matrix and the unpack below fails.
    TN, FP, FN, TP = confusion_matrix(target, prediction, labels=[0, 1]).ravel()
    print("TP: ", TP, "FP: ", FP, "TN: ", TN, "FN: ", FN)

    # TSS = true-positive rate (recall) minus false-positive rate.
    tp_rate = TP / float(TP + FN) if TP > 0 else 0
    fp_rate = FP / float(FP + TN) if FP > 0 else 0
    TSS = tp_rate - fp_rate

    # HSS2 Computation
    N = TN + FP
    P = TP + FN
    denominator = float(P * (FN + TN) + (TP + FP) * N)
    # The denominator is zero when every sample falls in a single cell.
    HSS = (2 * (TP * TN - FN * FP)) / denominator if denominator else 0.0

    return TSS, HSS

In [8]:
def predict(checkpoint, test_loader, desc):
    """Evaluate the global ``test_model`` on one loader with one checkpoint.

    Loads ``checkpoint['model_state_dict']`` into ``test_model``, runs it in
    eval mode over ``test_loader``, prints the skill scores and the target
    class counts, and returns the per-sample results.

    Args:
        checkpoint: dict holding a ``'model_state_dict'`` entry.
        test_loader: iterable yielding ``(images, labels, paths)`` batches.
        desc: heading printed before the results.

    Returns:
        DataFrame with columns ``timestamp`` (the image path of each sample),
        ``flare_prob`` (softmax probability of class 1) and ``target``.
    """
    test_model.load_state_dict(checkpoint['model_state_dict'])
    test_model.eval()
    print('***********************', desc, '*************************')

    batch_targets, batch_probs, batch_paths = [], [], []
    with torch.no_grad():
        for images, labels, paths in test_loader:
            # Move the batch to the evaluation device.
            images = images.to(device=device)
            labels = labels.to(device=device)
            batch_targets.append(labels)
            batch_paths.append(list(paths))

            # The model returns four values; only the first (class scores) is used.
            scores, _, _, _ = test_model(images)

            # Keep the probability assigned to class 1.
            probs = softmax(scores, dim=1)
            batch_probs.append(probs[:, 1])
            del images, labels, scores, probs

    flare_probs, targets, file_paths = sklearn_Compatible_preds_and_targets(
        batch_probs, batch_targets, batch_paths
    )
    # Threshold the probabilities at 0.5 to obtain hard labels for scoring.
    hard_preds = [int(prob >= 0.5) for prob in flare_probs]
    print(accuracy_score(hard_preds, targets))

    prob_list = pd.DataFrame({
        'timestamp': file_paths,
        'flare_prob': flare_probs,
        'target': targets,
    })
    print(prob_list['target'].value_counts())
    return prob_list

In [9]:
# Score each fold's validation loader with its loaded checkpoint, in fold order.
fold1, fold2, fold3, fold4 = [
    predict(fold_weights, fold_loader, fold_title)
    for fold_weights, fold_loader, fold_title in (
        (weights1, part1_loader, 'Fold-1 Results'),
        (weights2, part2_loader, 'Fold-2 Results'),
        (weights3, part3_loader, 'Fold-3 Results'),
        (weights4, part4_loader, 'Fold-4 Results'),
    )
]

*********************** Fold-1 Results *************************
TP:  1483 FP:  1972 TN:  10482 FN:  851
(0.4770471874630603, 0.3991558869376005)
0    12454
1     2334
Name: target, dtype: int64
*********************** Fold-2 Results *************************
TP:  639 FP:  1732 TN:  12123 FN:  973
(0.271392963097949, 0.2246572832279457)
0    13855
1     1612
Name: target, dtype: int64
*********************** Fold-3 Results *************************
TP:  620 FP:  1214 TN:  13094 FN:  1744
(0.1774197058004065, 0.19573610996227137)
0    14308
1     2364
Name: target, dtype: int64
*********************** Fold-4 Results *************************
TP:  1856 FP:  2722 TN:  11310 FN:  834
(0.4959776485399278, 0.38637937422481466)
0    14032
1     2690
Name: target, dtype: int64


In [10]:
# Persist each fold's per-sample probabilities next to the trained models.
result_columns = ['timestamp', 'flare_prob', 'target']
for fold_frame, out_csv in (
    (fold1, r'../modeling/trained_models/no_attention/fold1_res_noattn.csv'),
    (fold2, r'../modeling/trained_models/no_attention/fold2_res_noattn.csv'),
    (fold3, r'../modeling/trained_models/no_attention/fold3_res_noattn.csv'),
    (fold4, r'../modeling/trained_models/no_attention/fold4_res_noattn.csv'),
):
    fold_frame.to_csv(out_csv, index=False, header=True, columns=result_columns)

In [11]:
# Preview only: drop the first 47 characters and the 4-character extension of
# each stored path, leaving a 'YYYY.MM.DD_HH.MM.SS' stamp. Nothing is assigned.
fold1['timestamp'].map(lambda img_path: img_path[47:-4])

0        2011.01.01_00.00.00
1        2011.01.01_01.00.00
2        2011.01.01_02.00.00
3        2011.01.01_03.00.00
4        2011.01.01_04.00.00
                ...         
14783    2018.03.31_19.00.00
14784    2018.03.31_20.00.00
14785    2018.03.31_21.00.00
14786    2018.03.31_22.00.00
14787    2018.03.31_23.00.00
Name: timestamp, Length: 14788, dtype: object

In [13]:
# Reload the per-fold result CSVs and stack them into one frame.
fold1_val, fold2_val, fold3_val, fold4_val = [
    pd.read_csv(result_csv)
    for result_csv in (
        '../modeling/trained_models/no_attention/fold1_res_noattn.csv',
        '../modeling/trained_models/no_attention/fold2_res_noattn.csv',
        '../modeling/trained_models/no_attention/fold3_res_noattn.csv',
        '../modeling/trained_models/no_attention/fold4_res_noattn.csv',
    )
]
total = pd.concat([fold1_val, fold2_val, fold3_val, fold4_val])

# The stamp is the fixed-width 'YYYY.MM.DD_HH.MM.SS' text directly before the
# 4-character file extension. Slicing from the end of the path keeps this
# independent of the image root's length; the previous fixed offset of 47
# silently depended on len(datapath).
stamp_len = len('2011.01.01_00.00.00')
ext_len = 4
total['timestamp'] = total['timestamp'].str[-(stamp_len + ext_len):-ext_len]
total['timestamp'] = pd.to_datetime(total['timestamp'], format='%Y.%m.%d_%H.%M.%S')

# Keeps each row's within-fold position as an 'index' column, as before.
total = total.reset_index()
In [14]:
# Flare metadata keyed by image label; derive a datetime key to join on.
details = pd.read_csv('../modeling/trained_models/attention/M_full_dataset_cleaned_1_hours_with_loc_and_time_new.csv')
# Drop the first 16 characters and the 4-character extension of each label,
# leaving the 'YYYY.MM.DD_HH.MM.SS' stamp.
label_stamps = details['label'].map(lambda label: label[16:-4])
details['timestamp'] = pd.to_datetime(label_stamps, format='%Y.%m.%d_%H.%M.%S')
details = details.drop(columns=['label'])

# Left join keeps every prediction row even when no metadata row matches.
df = pd.merge(total, details, how='left', on='timestamp')
df

Unnamed: 0,index,timestamp,flare_prob,target,goes_class,fl_location,flare_start
0,0,2011-01-01 00:00:00,0.302853,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
1,1,2011-01-01 01:00:00,0.305089,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
2,2,2011-01-01 02:00:00,0.301837,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
3,3,2011-01-01 03:00:00,0.294656,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
4,4,2011-01-01 04:00:00,0.290386,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
...,...,...,...,...,...,...,...
63644,16717,2018-12-30 19:00:00,0.224259,0,NF,unk,unk
63645,16718,2018-12-30 20:00:00,0.223603,0,NF,unk,unk
63646,16719,2018-12-30 21:00:00,0.223968,0,NF,unk,unk
63647,16720,2018-12-30 22:00:00,0.223375,0,NF,unk,unk


In [16]:
def date_to_filename(df, cols=('timestamp',)):
    """Replace datetime columns with their relative image file names.

    Each value becomes ``YYYY/MM/DD/HMI.mYYYY.MM.DD_HH.MM.SS.jpg``, e.g.
    ``2010/05/21/HMI.m2010.05.21_12.00.00.jpg``.

    Args:
        df: frame holding the columns to convert. It is modified in place.
        cols: names of the columns to convert; defaults to ``('timestamp',)``.

    Returns:
        The same ``df`` object, with the listed columns overwritten.
    """
    for col in cols:
        stamps = pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S')
        # A single strftime keeps missing timestamps as missing values. The
        # previous piecewise concatenation turned the date parts into floats
        # as soon as one NaT was present, producing names like '2011.0/1.0/...'.
        df[col] = stamps.dt.strftime('%Y/%m/%d/HMI.m%Y.%m.%d_%H.%M.%S.jpg')

    return df

In [17]:
# date_to_filename overwrites df['timestamp'] in place and returns the same
# frame, so new_df_with_file_name is df itself rather than a copy.
new_df_with_file_name = date_to_filename(df=df)
new_df_with_file_name

Unnamed: 0,index,timestamp,flare_prob,target,goes_class,fl_location,flare_start
0,0,2011/01/01/HMI.m2011.01.01_00.00.00.jpg,0.302853,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
1,1,2011/01/01/HMI.m2011.01.01_01.00.00.jpg,0.305089,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
2,2,2011/01/01/HMI.m2011.01.01_02.00.00.jpg,0.301837,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
3,3,2011/01/01/HMI.m2011.01.01_03.00.00.jpg,0.294656,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
4,4,2011/01/01/HMI.m2011.01.01_04.00.00.jpg,0.290386,0,B8.3,"(-56, 30)",2011-01-01 21:52:00
...,...,...,...,...,...,...,...
63644,16717,2018/12/30/HMI.m2018.12.30_19.00.00.jpg,0.224259,0,NF,unk,unk
63645,16718,2018/12/30/HMI.m2018.12.30_20.00.00.jpg,0.223603,0,NF,unk,unk
63646,16719,2018/12/30/HMI.m2018.12.30_21.00.00.jpg,0.223968,0,NF,unk,unk
63647,16720,2018/12/30/HMI.m2018.12.30_22.00.00.jpg,0.223375,0,NF,unk,unk


In [19]:
pd.set_option('display.max_rows', 5000)

# Rows whose GOES class starts with 'X'. na=False excludes rows with a missing
# goes_class instead of producing a NaN mask that cannot be used for indexing.
X = df.loc[df.goes_class.str.startswith('X', na=False)].copy()

# fl_location looks like "(-56, 30)": strip the parentheses and split on the comma.
xy = X["fl_location"].str.strip("()").str.split(",", expand=True)
X["x"] = pd.to_numeric(xy[0]).round(decimals=2)
X["y"] = xy[1].astype(float)

# Split by the 0.5 decision threshold on the predicted flare probability.
pos = X[X.flare_prob >= 0.5]
neg = X[X.flare_prob < 0.5]

# "Limb" = beyond 70 on either axis (presumably degrees; confirm);
# "center" = within 70 on both. The limb test is a single OR mask so a row that
# is beyond 70 on both axes is counted once; previously the x and y row counts
# were added together, which counted such rows twice.
Pos_limb = len(pos.loc[(pos.x.abs() > 70) | (pos.y.abs() > 70)])
neg_limb = len(neg.loc[(neg.x.abs() > 70) | (neg.y.abs() > 70)])
pos_center = len(pos.loc[(pos.x.abs() <= 70) & (pos.y.abs() <= 70)])
neg_center = len(neg.loc[(neg.x.abs() <= 70) & (neg.y.abs() <= 70)])
Pos_limb, neg_limb, pos_center, neg_center, len(X)

(100, 112, 467, 201, 880)

In [20]:
# Rows whose GOES class starts with 'M'. na=False excludes rows with a missing
# goes_class instead of producing a NaN mask that cannot be used for indexing.
X = df.loc[df.goes_class.str.startswith('M', na=False)].copy()

# fl_location looks like "(-56, 30)": strip the parentheses and split on the comma.
xy = X["fl_location"].str.strip("()").str.split(",", expand=True)
X["x"] = pd.to_numeric(xy[0]).round(decimals=2)
X["y"] = xy[1].astype(float)

# Split by the 0.5 decision threshold on the predicted flare probability.
pos = X[X.flare_prob >= 0.5]
neg = X[X.flare_prob < 0.5]

# "Limb" = beyond 70 on either axis (presumably degrees; confirm);
# "center" = within 70 on both. The limb test is a single OR mask so a row that
# is beyond 70 on both axes is counted once; previously the x and y row counts
# were added together, which counted such rows twice.
Pos_limb = len(pos.loc[(pos.x.abs() > 70) | (pos.y.abs() > 70)])
neg_limb = len(neg.loc[(neg.x.abs() > 70) | (neg.y.abs() > 70)])
pos_center = len(pos.loc[(pos.x.abs() <= 70) & (pos.y.abs() <= 70)])
neg_center = len(neg.loc[(neg.x.abs() <= 70) & (neg.y.abs() <= 70)])
Pos_limb, neg_limb, pos_center, neg_center, len(X)

(878, 1412, 3153, 2677, 8120)