In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import init

from CNN import *
from GradCAMUtils import *
from Utils import *

import numpy as np
import pandas as pd
import seaborn as sns

import os

from fastai import *
from fastai.text import *
from fastai.vision import *
from fastai.imports import *

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import umap

from matplotlib.widgets import Button
from matplotlib.widgets import TextBox
from matplotlib.ticker import MultipleLocator
import matplotlib.pyplot as plt

In [2]:
"""Constants"""
# Maximum length applied to every sequence (original note: 626/798).
SEQUENCE_LENGTH = 798

BATCH_SIZE = 8

# One seed for every RNG seeded at the bottom of this cell, so it is changed in a single place.
SEED = 2020

# 3-element vector per symbol; '-' maps to the all-zero vector
# (presumably the padding/gap symbol -- TODO confirm against the dataset code).
vocab = {'C': [0,0,1], 'H': [0,1,0], 'E': [1,0,0], '-':[0,0,0]}

# Default figure size for every seaborn/matplotlib plot in this notebook.
sns.set(rc={'figure.figsize':(15,15)})

model_path = Path("./Models/")
path = Path("./Datasets/")

# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Show all rows when a DataFrame is displayed.
pd.set_option('display.max_rows', None)

# Transform the labels from String to Integer via LabelEncoder
# (one encoder for the fold labels, one for the family labels).
le_fold = preprocessing.LabelEncoder()
le_fam = preprocessing.LabelEncoder()

# Restrict this process to the CUDA device with index 1.
# Kept ahead of the first torch.cuda call below so the setting is in place
# before CUDA is queried.
# torch.cuda.set_device()
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# True when a usable GPU is available.
cuda_gpu = torch.cuda.is_available()

# Seed NumPy and PyTorch (CPU and all CUDA devices) with the same value.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic algorithms and switch off its autotuner, which
# may otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Load every dataset used in this notebook.
# All CSV files sit in the same directory, so the prefix is written once.
TRAINING_DATA_DIR = "../Datasets/Training_data"


def _read_training_csv(stem):
    """Read ``<TRAINING_DATA_DIR>/<stem>.csv`` into a DataFrame."""
    return pd.read_csv(f"{TRAINING_DATA_DIR}/{stem}.csv")


df_gtu = _read_training_csv("GTU_GTD")
df_gtlyso = _read_training_csv("GTLyso_train")
df_gt2 = _read_training_csv("GTA2")
df_gt4 = _read_training_csv("GTB4")
train_df = _read_training_csv("train2k")
test_df = _read_training_csv("test2k")
val_df = _read_training_csv("val2k")

# Keyword arguments shared by all four splits. The "strantify_type" spelling
# is the keyword the existing calls use, so it is kept verbatim.
split_options = {"strantify_type": "family", "split_rate": 0.05}

# Split each extra dataset into train / test / validation parts.
# The call order (gtu, gtlyso, gt2, gt4) is kept as in the original cell.
train_df_gtu, test_df_gtu, val_df_gtu = Train_Test_Val_split(df_gtu, **split_options)
train_df_gtlyso, test_df_gtlyso, val_df_gtlyso = Train_Test_Val_split(df_gtlyso, **split_options)
train_df_gt2, test_df_gt2, val_df_gt2 = Train_Test_Val_split(df_gt2, **split_options)
train_df_gt4, test_df_gt4, val_df_gt4 = Train_Test_Val_split(df_gt4, **split_options)

3956 220 220
4932 275 275
1799 100 101
1799 100 101


In [4]:
# Append the per-dataset splits to the matching base split.
# NOTE: each name is overwritten with its own concatenation, so running this
# cell a second time appends the extra splits again.
train_df, test_df, val_df = (
    pd.concat(frames)
    for frames in (
        [train_df, train_df_gtu, train_df_gt2, train_df_gt4, train_df_gtlyso],
        [test_df, test_df_gtu, test_df_gt2, test_df_gt4, test_df_gtlyso],
        [val_df, val_df_gtu, val_df_gt2, val_df_gt4, val_df_gtlyso],
    )
)

In [5]:
# Wrap each split in a MultiOutput dataset together with its loader.
# The keyword arguments are identical for all three splits.
loader_options = dict(BATCH_SIZE=BATCH_SIZE, cuda=cuda_gpu)

# Order is kept (train, test, val): the same two label encoders are handed to
# every call.
Train_ds, Train_dl = Dataset_Loader(train_df, le_fam, le_fold, vocab, **loader_options)
Test_ds, Test_dl = Dataset_Loader(test_df, le_fam, le_fold, vocab, **loader_options)
Val_ds, Val_dl = Dataset_Loader(val_df, le_fam, le_fold, vocab, **loader_options)

# Distinct labels present in the training split.
family = train_df["family"].unique()
fold = train_df["fold"].unique()

In [7]:
# Train the model.

# Derive the device from the availability flag computed in the constants cell.
# The data loaders already receive that flag, so the model and the loss are
# placed consistently with them instead of calling .cuda() unconditionally
# (which raises on a machine without a usable GPU).
device = torch.device("cuda" if cuda_gpu else "cpu")

# Output sizes come from the number of distinct fold / family labels.
model = CNN_Attention(Fold=len(fold), Fam=len(family), Prob=0.5).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss().to(device)

# NOTE(review): 50 is presumably the epoch budget and `patience` an
# early-stopping setting; the meaning of `e` is not visible here -- confirm
# against Fit's definition.
cnn_model = Fit(50, model, criterion, optimizer, Train_dl, Val_dl, patience=2, e=0.50)

In [None]:
# Optional (disabled): write the object returned by Fit to a pickle file.
# NOTE(review): no explicit `import pickle` is visible in the imports cell, and
# the file handle is never closed -- confirm both before re-enabling this line.
# pickle.dump(cnn_model,open('128_alldata_gtu.pickle','wb'))