In [1]:
import sys
import os.path as p

sys.path.append("/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/")

import easydict
from IPython.display import clear_output
from utils import get_args, get_root_dir

In [2]:
args = get_args()
args.data_dir = "../../input/data/train_dataset/"
args.root_dir = get_root_dir("../bert_last_test/")

args.num_workers = 2
args.n_epochs = 20
args.hidden_dim = 1024

In [17]:
import pickle

In [5]:
import torch
from trainer import DKTTrainer
from models.lstm.model import LSTM
from utils import get_criterion

class Loss1Trainer(DKTTrainer):  # 원래 코드
    def _collate_fn(self, batch):
        col_n = len(batch[0])
        col_list = [[] for _ in range(col_n)]
        max_seq_len = len(batch[0][-1])


        # batch의 값들을 각 column끼리 그룹화
        for row in batch:
            for i, col in enumerate(row):
                pre_padded = torch.zeros(max_seq_len)
                pre_padded[-len(col):] = col
                col_list[i].append(pre_padded)


        for i, _ in enumerate(col_list):
            col_list[i] =torch.stack(col_list[i])

        return tuple(col_list)


    def _process_batch(batch, args):

        #1 dataloader #2와 순서를 맞춰주자
        (correct, question, test, tag, time_diff, 
        head, mid, tail, mid_tail, 
        head_answerProb, mid_answerProb, tail_answerProb,
        mask) = batch


        # change to float
        mask = mask.type(torch.FloatTensor)
        correct = correct.type(torch.FloatTensor)

        #  interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용
        #    saint의 경우 decoder에 들어가는 input이다
        interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다.
        interaction = interaction.roll(shifts=1, dims=1)
        interaction_mask = mask.roll(shifts=1, dims=1)
        interaction_mask[:, 0] = 0 # set padding index to the first sequence
        interaction = (interaction * interaction_mask).to(torch.int64)
        # print(interaction)
        # exit()
        #  test_id, question_id, tag
        test = ((test + 1) * mask).to(torch.int64)
        question = ((question + 1) * mask).to(torch.int64)
        tag = ((tag + 1) * mask).to(torch.int64)

        #2 추가 feature
        # time_median = ((time_median + 1) * mask).type(torch.FloatTensor)
        head_answerProb = ((head_answerProb + 1) * mask).type(torch.FloatTensor)
        mid_answerProb = ((mid_answerProb + 1) * mask).type(torch.FloatTensor)
        tail_answerProb = ((tail_answerProb + 1) * mask).type(torch.FloatTensor)


        time_diff = ((time_diff + 1) * mask).to(torch.int64)
        head = ((head + 1) * mask).to(torch.int64)
        mid = ((mid + 1) * mask).to(torch.int64)
        tail = ((tail + 1) * mask).to(torch.int64)
        mid_tail = ((mid_tail + 1) * mask).to(torch.int64)

        # gather index
        # 마지막 sequence만 사용하기 위한 index
        gather_index = torch.tensor(np.count_nonzero(mask, axis=1))
        gather_index = gather_index.view(-1, 1) - 1


        #3 device memory로 이동

        test = test.to(args.device)
        question = question.to(args.device)
        tag = tag.to(args.device)
        correct = correct.to(args.device)

        # time_median = time_median.to(args.device)
        time_diff = time_diff.to(args.device)
        head = head.to(args.device)
        mid = mid.to(args.device)
        tail = tail.to(args.device)
        mid_tail = mid_tail.to(args.device)

        head_answerProb = head_answerProb.to(args.device)
        mid_answerProb = mid_answerProb.to(args.device)
        tail_answerProb = tail_answerProb.to(args.device)

        mask = mask.to(args.device)

        interaction = interaction.to(args.device)
        gather_index = gather_index.to(args.device)

        #4 
        return (test, question, tag, time_diff, head, mid, tail, mid_tail, 
                head_answerProb, mid_answerProb, tail_answerProb,
                mask, interaction, gather_index, correct)
    
class Loss2Trainer(Loss1Trainer):
    def _compute_loss(self, preds, targets):
        loss = get_criterion(preds, targets)
        loss = loss[:, -2:]
        loss = torch.mean(loss)
        return loss
    
class Loss5Trainer(Loss1Trainer):
    def _compute_loss(self, preds, targets):
        loss = get_criterion(preds, targets)
        loss = loss[:, -5:]
        loss = torch.mean(loss)  # scalar 형태가 아니면 loss.backward가 안된다.
        return loss
    
class Loss10Trainer(Loss1Trainer):
    def _compute_loss(self, preds, targets):
        loss = get_criterion(preds, targets)
        loss = loss[:, -10:]
        loss = torch.mean(loss)
        return loss
    
class LossAllTrainer(Loss1Trainer):
    def _compute_loss(self, preds, targets):
        loss = get_criterion(preds, targets)
        loss = torch.mean(loss)
        return loss
    
class Bert(nn.Module):
    def __init__(self, args):
        super(Bert, self).__init__()
        self.args = args
        self.args.numeric = False
        
        cate_size = len(self.args.cate_cols) + 1 # interaction
        cont_size = len(self.args.cont_cols) - 2


        #1 Embedding 
        # interaction은 현재 correct으로 구성되어있다. correct(1, 2) + padding(0)
        self.embedding_interaction = nn.Embedding(3, self.args.hidden_dim)
        self.embedding_test = nn.Embedding(self.args.n_cols['testId']+1, self.args.hidden_dim)
        self.embedding_question = nn.Embedding(self.args.n_cols['assessmentItemID']+1, self.args.hidden_dim)
        self.embedding_tag = nn.Embedding(self.args.n_cols['KnowledgeTag']+1, self.args.hidden_dim)
        self.embedding_time_diff = nn.Embedding(self.args.n_cols['time_diff']+1, self.args.hidden_dim)
        self.embedding_head = nn.Embedding(self.args.n_cols['head']+1, self.args.hidden_dim)
        self.embedding_mid = nn.Embedding(self.args.n_cols['mid']+1, self.args.hidden_dim)
        self.embedding_tail = nn.Embedding(self.args.n_cols['tail']+1, self.args.hidden_dim)
        self.embedding_mid_tail = nn.Embedding(self.args.n_cols['mid_tail']+1, self.args.hidden_dim)


        # embedding projection
        self.cate_proj = nn.Sequential(
            nn.Linear((self.args.hidden_dim) * (cate_size), self.args.hidden_dim),
            nn.LayerNorm(self.args.hidden_dim),
        )

        if self.args.numeric:
            self.cont_bn = nn.BatchNorm1d(cont_size)
            self.cont_proj = nn.Sequential(
                nn.Linear(cont_size, self.args.hidden_dim),
                nn.LayerNorm(self.args.hidden_dim),
            )

            self.comb_proj = nn.Sequential(
                nn.ReLU(),
                nn.Linear(self.args.hidden_dim * 2, self.args.hidden_dim),
                nn.LayerNorm(self.args.hidden_dim),
            )

        else:
            self.comb_proj = nn.Sequential(
                nn.ReLU(),
                nn.Linear(self.args.hidden_dim, self.args.hidden_dim),
                nn.LayerNorm(self.args.hidden_dim),
            )

        # Bert config
        self.config = BertConfig( 
            3, # not used
            hidden_size=self.args.hidden_dim,
            num_hidden_layers=self.args.n_layers,
            num_attention_heads=self.args.n_heads,
            intermediate_size=self.args.hidden_dim,
            hidden_drop_out_prob=self.args.drop_out,
            attention_probs_drop_out_prob=self.args.drop_out,
            max_position_embeddings=self.args.max_seq_len           
        )

        # Defining the layers
        # Bert Layer
        self.encoder = BertModel(self.config)  

        # Fully connected layer
        self.fc = nn.Linear(self.args.hidden_dim, 1)
       
        self.activation = nn.Sigmoid()

        def get_reg():
            return nn.Sequential(
            nn.Linear(self.args.hidden_dim, self.args.hidden_dim),
            nn.LayerNorm(self.args.hidden_dim),
            nn.Dropout(self.args.drop_out),
            nn.ReLU(),            
            nn.Linear(self.args.hidden_dim, self.args.hidden_dim),
        )     

        self.reg_layer = get_reg()


    def forward(self, inputs):
        #2 process_batch의 return 
        # tail_prob
        (test, question, tag, time_diff, 
        head, mid, tail, mid_tail, 
        head_answerProb, mid_answerProb, tail_answerProb,
        mask, interaction, gather_index, correct) = inputs

        batch_size = interaction.size(0)

        #3 신나는 embedding
        embed_interaction = self.embedding_interaction(interaction)
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)
        embed_time_diff = self.embedding_time_diff(time_diff)
        embed_head = self.embedding_head(head)
        embed_mid = self.embedding_mid(mid)
        embed_tail = self.embedding_tail(tail)
        embed_mid_tail = self.embedding_mid_tail(mid_tail)

        #4
        # (batch_size, max_seq_len, hidden_dim * n) : (64, 20, 64 * n) 
        cate_embed = torch.cat([
                    embed_interaction,
                    embed_question,
                    embed_time_diff,
                    embed_tag,
                    embed_head,
                    embed_mid,
                    embed_tail,
                    embed_test,
                    embed_mid_tail,
                ], 2)

        #5
        # (batch_size, max_seq_len * n) : (64, 20 * n) 
        if self.args.numeric:
            cont_cat = torch.cat([
                    # time_diff,  
                    # head_answerProb,   
                    # mid_answerProb,
                    tail_answerProb,
            ], 1)
            
            # (batch_size, max_seq_len, n) : (64, 20, n) 
            cont_cat = cont_cat.view(batch_size, self.args.max_seq_len, -1)

            # (batch_size * max_seq_len, n) : (1280, 1)
            cont_bn_x = self.cont_bn(cont_cat.view(-1, cont_cat.size(-1)))

            # (batch_size, max_seq_len, n) : (64, 20, n) 
            cont_bn_x = cont_bn_x.view(batch_size, self.args.max_seq_len, -1) 

        # (batch_size, max_seq_len, hidden_dim) : (64, 20, 64)
        cate_X = self.cate_proj(cate_embed)

        if self.args.numeric:
            cont_X = self.cont_proj(cont_cat)

            # (batch_size, max_seq_len, hidden_dim * 2) : [64, 20, 128]
            X = torch.cat([cate_X, cont_X], 2)
            

        else:
            X = cate_X

        # (batch_size, max_seq_len, hidden_dim) :[64, 20, 64]
        comb_X = self.comb_proj(X)

        # Bert
        encoded_layers = self.encoder(inputs_embeds=comb_X, attention_mask=mask)
        out = encoded_layers[0]
        
        
        # base
        # out = out.contiguous().view(batch_size, -1, self.args.hidden_dim)
        # out = self.fc(out)
        # preds = self.activation(out).view(batch_size, -1)

        # reg_layer
        out = self.reg_layer(out)
        out = self.fc(out)
        preds = self.activation(out).view(batch_size, -1)

        return preds

NameError: name 'nn' is not defined

In [3]:
class Preprocess:
    def __init__(self, args):
        self.args = args

        self.args.cate_cols = []
        self.args.cont_cols = []
        self.args.features = []
        self.args.n_cols = {}

        self.train_data = None
        self.test_data = None


    def get_train_data(self):
        return self.train_data

    def get_test_data(self):
        return self.test_data

    def split_data(self, data, ratio=0.8, shuffle=True, seed=42):
        """
        split data into two parts with a given ratio.
        """
        if shuffle:
            random.seed(seed) # fix to default seed 42
            random.shuffle(data)

        size = int(len(data) * ratio)

        data_1 = data[:size]
        data_2 = data[size:]

        return data_1, data_2

    def __save_labels(self, encoder, name):
        le_path = os.path.join(self.args.asset_dir, name + '_classes.npy')
        np.save(le_path, encoder.classes_)

    def __feature_engineering(self, df):
        #1-1 categorical feature

        df['head'] = df.assessmentItemID.apply(lambda x: x[:4])
        df['mid'] = df.assessmentItemID.apply(lambda x: x[4:7])
        df['tail'] = df.assessmentItemID.apply(lambda x: x[7:])

        df['head_tail'] = df.assessmentItemID.apply(lambda x: x[:4]+x[7:])
        df['mid_tail'] = df.assessmentItemID.apply(lambda x: x[4:])

        #1-2 continuous feature

        def percentile(s):
            return np.sum(s) / len(s)

        ## time to sec
        def convert_time(s):
            timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
            return int(timestamp)

        df['Timestamp'] = df['Timestamp'].apply(convert_time)

        ## find boundary
        # userID, testId 별 푼 문항의 누적 합
        df['UserTestCumtestnum'] = df.groupby(['userID','testId'])['answerCode'].cumcount()
        testId2maxlen = df[['assessmentItemID', 'testId']].drop_duplicates().groupby('testId').size()
        
        # test의 문항 수
        df['TestSize'] = df.testId.map(testId2maxlen)
        
        # user가 같은 test를 여러 번 푼 것인지 나타낸 변수 (처음 품 : 0, 두번 품 : 1, 세번 품 : 2)
        df['Retest'] = df['UserTestCumtestnum'] // df['TestSize']

        # boundary
        df['boundary'] = [u % t if t != 0 else 0.0 for t, u in zip(df['TestSize'], df['UserTestCumtestnum'])] 

        df = copy.deepcopy(df[df['Retest'] == 0])

        ## time diff
        time_diff = df.groupby(['userID', 'head', 'mid'])['Timestamp'].diff()
        df['time_diff'] = time_diff
        df.loc[df['boundary'] == 0, 'time_diff'] = np.NaN
        df['time_diff'].fillna(method='bfill', inplace=True)
        # df['time_diff'].fillna(0, inplace=True) -> 성능하락

        df['time_diff'] = df['time_diff'].map(lambda x: 600 if x>600 else x)

        # def cate_diff(x):
        #     if x > 600:
        #         return "A"
        #     if 500 < x <= 600:
        #         return "B"
        #     if 400 < x <= 500:
        #         return "C"
        #     if 300 < x <= 400:
        #         return "D"
        #     if 200 < x <= 300:
        #         return "E"
        #     if 100 < x <= 200:
        #         return "F"
        #     if 50 < x <= 100:
        #         return "G"
        #     if x <= 50:
        #         return "H"

        # load
        # with open('/opt/ml/code/dkt/max_bins_set.pkl', 'rb') as f:
        #     ser, bins = pickle.load(f)

        # df['time_diff'] = df['time_diff'].map(lambda x: cate_diff(x))
        # df['time_diff'] = pd.qcut(df['time_diff'], q=23).cat.rename_categories(list(range(23))).astype(str)
        # df['time_diff'] = pd.qcut(df['time_diff'], q=20).cat.rename_categories(list('abcdefghijklnmopqrst')).astype(str)
        # df['time_diff'] = pd.qcut(df['time_diff'], q=20).astype(str)

        # df['time_diff'] = pd.cut(df['time_diff'], bins=bins, labels=False, include_lowest=True)
        
        thr = 600
        df['time_diff'] = pd.cut(df['time_diff'], bins=thr).astype(str) #.cat.rename_categories(list(range(thr)))
        # df['time_diff'] = pd.cut(df['time_diff'], bins=600).astype(str)

        # head별 정답률
        answer_head_mean = df.groupby(['userID', 'head'])['answerCode'].mean()
        answer_head_mean = answer_head_mean.reset_index(level=['userID', 'head'])
        answer_head_mean.columns = ['userID', 'head', 'head_answerProb']

        df = pd.merge(df, answer_head_mean, on=['userID', 'head'], how='left')

        # mid별 정답률
        answer_mid_mean = df.groupby(['userID', 'head', 'mid'])['answerCode'].mean()
        answer_mid_mean = answer_mid_mean.reset_index(level=['userID', 'head', 'mid'])
        answer_mid_mean.columns = ['userID', 'head', 'mid', 'mid_answerProb']

        df = pd.merge(df, answer_mid_mean, on=['userID', 'head', 'mid'], how='left')
        
        # tail별 정답률
        answer_tail_mean = df.groupby(['head', 'mid', 'tail'])['answerCode'].mean()
        answer_tail_mean = answer_tail_mean.reset_index(level=['head', 'mid', 'tail'])
        answer_tail_mean.columns = ['head', 'mid', 'tail', 'tail_answerProb']

        df = pd.merge(df, answer_tail_mean, on=['head', 'mid', 'tail'], how='left')

        #2 self.args.features의 순서와 trainer #1의 순서를 맞춰주자!
        # correct, question, test, tag, time_diff, head, mid, tail, mid_tail, 
        # head_answerProb, mid_answerProb, tail_answerProb, mask = batch

        self.args.cate_cols.extend([
            'assessmentItemID', 
            'testId', 
            'KnowledgeTag',
            'time_diff',
            'head',
            'mid',
            'tail',
            'mid_tail',
            ])

        self.args.cont_cols.extend([
            # 'time_diff',
            # 'time_median',
            'head_answerProb',
            'mid_answerProb',
            'tail_answerProb',
            ])

        self.args.features.extend(
            ['answerCode'] + 
            self.args.cate_cols + 
            self.args.cont_cols
            )

        return df

    def __preprocessing(self, df, is_train = True):
        cate_cols = self.args.cate_cols

        if not os.path.exists(self.args.asset_dir):
            os.makedirs(self.args.asset_dir)
            
        for col in cate_cols:
            le = LabelEncoder()
            if is_train:
                #For UNKNOWN class
                a = df[col].unique().tolist() + ['unknown']
                le.fit(a)
                self.__save_labels(le, col)
            else:
                label_path = os.path.join(self.args.asset_dir,col+'_classes.npy')
                le.classes_ = np.load(label_path)
                
                df[col] = df[col].apply(lambda x: x if x in le.classes_ else 'unknown')

            #모든 컬럼이 범주형이라고 가정
            df[col]= df[col].astype(str)
            test = le.transform(df[col])
            df[col] = test


        cont_cols = self.args.cont_cols

        # standard scaler
        std_scaler = preprocessing.StandardScaler().fit(df[cont_cols] )
        df[cont_cols] = std_scaler.transform(df[cont_cols])
        

        # df.to_csv('peprocess_test.csv')

        return df


    def load_data_from_file(self, file_name, is_train=True):
        csv_file_path = os.path.join(self.args.data_dir, file_name)
        df = pd.read_csv(csv_file_path)
        df = self.__feature_engineering(df)
        df = self.__preprocessing(df, is_train)
        
        cols = df.columns.tolist()
        for col in cols:
            if col in self.args.cont_cols:
                self.args.n_cols[col] = len(df[col].unique())
            
            if col in self.args.cate_cols:
                self.args.n_cols[col] = len(np.load(os.path.join(self.args.asset_dir, f'{col}_classes.npy')))

        df = df.sort_values(by=['userID','Timestamp'], axis=0)

        feature_columns = self.args.features        
        
        def get_values(cols, r):
            result = []
            for col in cols:
                result.append(r[col].values)

            return result

        if is_train:
            group = df.groupby(['userID', 'head', 'mid']).apply(
                lambda r: (get_values(feature_columns, r)))
            
        else:
            group = df.groupby('userID').apply(
                lambda r: (get_values(feature_columns, r)))

        if is_train:
            # save
            mass = (self.args.cate_cols, self.args.cont_cols, self.args.features)
            with open('../dkt/code/dkt/pkl/mass.pkl', 'wb') as f:
                pickle.dump(mass, f, pickle.HIGHEST_PROTOCOL)

            save_name = f'{self.args.pkl_dir}/{self.args.pkl_name}'
            with open(save_name, 'wb') as f:
                pickle.dump((group.values, self.args.n_cols), f, pickle.HIGHEST_PROTOCOL)

        return group.values

    def load_train_data(self, file_name):
        if self.args.pkl:
            # load
            pkl_name = f'{self.args.pkl_dir}/{self.args.pkl_name}'
            with open(pkl_name, 'rb') as f:
                self.train_data, self.args.n_cols = pickle.load(f)
            
            with open('../dkt/code/dkt/pkl/mass.pkl', 'rb') as f:
                self.args.cate_cols, self.args.cont_cols, self.args.features = pickle.load(f)
        else:
            self.train_data = self.load_data_from_file(file_name)

    def load_test_data(self, file_name):
        self.test_data = self.load_data_from_file(file_name, is_train= False)

In [None]:
class Bert(nn.Module):
    
    def __init__(self, args):
        super(Bert, self).__init__()
        self.args = args
        self.args.numeric = False
        
        cate_size = len(self.args.cate_cols) + 1 # interaction
        cont_size = len(self.args.cont_cols) - 2


        #1 Embedding 
        # interaction은 현재 correct으로 구성되어있다. correct(1, 2) + padding(0)
        self.embedding_interaction = nn.Embedding(3, self.args.hidden_dim)
        self.embedding_test = nn.Embedding(self.args.n_cols['testId']+1, self.args.hidden_dim)
        self.embedding_question = nn.Embedding(self.args.n_cols['assessmentItemID']+1, self.args.hidden_dim)
        self.embedding_tag = nn.Embedding(self.args.n_cols['KnowledgeTag']+1, self.args.hidden_dim)
        self.embedding_time_diff = nn.Embedding(self.args.n_cols['time_diff']+1, self.args.hidden_dim)
        self.embedding_head = nn.Embedding(self.args.n_cols['head']+1, self.args.hidden_dim)
        self.embedding_mid = nn.Embedding(self.args.n_cols['mid']+1, self.args.hidden_dim)
        self.embedding_tail = nn.Embedding(self.args.n_cols['tail']+1, self.args.hidden_dim)
        self.embedding_mid_tail = nn.Embedding(self.args.n_cols['mid_tail']+1, self.args.hidden_dim)


        # embedding projection
        self.cate_proj = nn.Sequential(
            nn.Linear((self.args.hidden_dim) * (cate_size), self.args.hidden_dim),
            nn.LayerNorm(self.args.hidden_dim),
        )

        if self.args.numeric:
            self.cont_bn = nn.BatchNorm1d(cont_size)
            self.cont_proj = nn.Sequential(
                nn.Linear(cont_size, self.args.hidden_dim),
                nn.LayerNorm(self.args.hidden_dim),
            )

            self.comb_proj = nn.Sequential(
                nn.ReLU(),
                nn.Linear(self.args.hidden_dim * 2, self.args.hidden_dim),
                nn.LayerNorm(self.args.hidden_dim),
            )

        else:
            self.comb_proj = nn.Sequential(
                nn.ReLU(),
                nn.Linear(self.args.hidden_dim, self.args.hidden_dim),
                nn.LayerNorm(self.args.hidden_dim),
            )

        # Bert config
        self.config = BertConfig( 
            3, # not used
            hidden_size=self.args.hidden_dim,
            num_hidden_layers=self.args.n_layers,
            num_attention_heads=self.args.n_heads,
            intermediate_size=self.args.hidden_dim,
            hidden_drop_out_prob=self.args.drop_out,
            attention_probs_drop_out_prob=self.args.drop_out,
            max_position_embeddings=self.args.max_seq_len           
        )

        # Defining the layers
        # Bert Layer
        self.encoder = BertModel(self.config)  

        # Fully connected layer
        self.fc = nn.Linear(self.args.hidden_dim, 1)
       
        self.activation = nn.Sigmoid()

        def get_reg():
            return nn.Sequential(
            nn.Linear(self.args.hidden_dim, self.args.hidden_dim),
            nn.LayerNorm(self.args.hidden_dim),
            nn.Dropout(self.args.drop_out),
            nn.ReLU(),            
            nn.Linear(self.args.hidden_dim, self.args.hidden_dim),
        )     

        self.reg_layer = get_reg()


    def forward(self, inputs):
        #2 process_batch의 return 
        # tail_prob
        (test, question, tag, time_diff, 
        head, mid, tail, mid_tail, 
        head_answerProb, mid_answerProb, tail_answerProb,
        mask, interaction, gather_index, correct) = inputs

        batch_size = interaction.size(0)

        #3 신나는 embedding
        embed_interaction = self.embedding_interaction(interaction)
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)
        embed_time_diff = self.embedding_time_diff(time_diff)
        embed_head = self.embedding_head(head)
        embed_mid = self.embedding_mid(mid)
        embed_tail = self.embedding_tail(tail)
        embed_mid_tail = self.embedding_mid_tail(mid_tail)

        #4
        # (batch_size, max_seq_len, hidden_dim * n) : (64, 20, 64 * n) 
        cate_embed = torch.cat([
                    embed_interaction,
                    embed_question,
                    embed_time_diff,
                    embed_tag,
                    embed_head,
                    embed_mid,
                    embed_tail,
                    embed_test,
                    embed_mid_tail,
                ], 2)

        #5
        # (batch_size, max_seq_len * n) : (64, 20 * n) 
        if self.args.numeric:
            cont_cat = torch.cat([
                    # time_diff,  
                    # head_answerProb,   
                    # mid_answerProb,
                    tail_answerProb,
            ], 1)
            
            # (batch_size, max_seq_len, n) : (64, 20, n) 
            cont_cat = cont_cat.view(batch_size, self.args.max_seq_len, -1)

            # (batch_size * max_seq_len, n) : (1280, 1)
            cont_bn_x = self.cont_bn(cont_cat.view(-1, cont_cat.size(-1)))

            # (batch_size, max_seq_len, n) : (64, 20, n) 
            cont_bn_x = cont_bn_x.view(batch_size, self.args.max_seq_len, -1) 

        # (batch_size, max_seq_len, hidden_dim) : (64, 20, 64)
        cate_X = self.cate_proj(cate_embed)

        if self.args.numeric:
            cont_X = self.cont_proj(cont_cat)

            # (batch_size, max_seq_len, hidden_dim * 2) : [64, 20, 128]
            X = torch.cat([cate_X, cont_X], 2)
            

        else:
            X = cate_X

        # (batch_size, max_seq_len, hidden_dim) :[64, 20, 64]
        comb_X = self.comb_proj(X)

        # Bert
        encoded_layers = self.encoder(inputs_embeds=comb_X, attention_mask=mask)
        out = encoded_layers[0]
        
        
        # base
        # out = out.contiguous().view(batch_size, -1, self.args.hidden_dim)
        # out = self.fc(out)
        # preds = self.activation(out).view(batch_size, -1)

        # reg_layer
        out = self.reg_layer(out)
        out = self.fc(out)
        preds = self.activation(out).view(batch_size, -1)

        return preds


In [13]:
args.pkl_dir = "/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/dkt/code/dkt/pkl"
args.pkl_name = 'group.pkl'
args.pkl = True

In [20]:
import os
import copy
import time
import random
import pickle

import pandas as pd
import numpy as np

from datetime import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

from torch.nn.utils.rnn import pad_sequence
import torch

In [23]:
from dkt.code.args import parse_args

In [27]:
import argparse

In [28]:
def parse_args(mode='train'):
    parser = argparse.ArgumentParser()

    
    parser.add_argument('--seed', default=42, type=int, help='seed')
    
    parser.add_argument('--device', default='cpu', type=str, help='cpu or gpu')

    parser.add_argument('--data_dir', default='/opt/ml/dkt/dkt/input/data/train_dataset', type=str, help='data directory')
    parser.add_argument('--asset_dir', default='asset/', type=str, help='data directory')
    
    parser.add_argument('--file_name', default='train_data.csv', type=str, help='train file name')
    
    parser.add_argument('--pkl', default=True, type=str2bool, help='use pkl')
    parser.add_argument('--pkl_dir', default='/opt/ml/dkt/dkt/code/dkt/pkl', type=str, help='pkl directory')
    parser.add_argument('--pkl_name', default='group.pkl', type=str, help='pkl file name')
    
    parser.add_argument('--model_dir', default='/opt/ml/dkt/dkt/code/models/', type=str, help='model directory')
    parser.add_argument('--model_name', default='model.pt', type=str, help='model file name')

    parser.add_argument('--output_dir', default='/opt/ml/dkt/dkt/code/output/', type=str, help='output directory')
    parser.add_argument('--test_file_name', default='test_data.csv', type=str, help='test file name')
    
    parser.add_argument('--max_seq_len', default=13, type=int, help='max sequence length')
    parser.add_argument('--num_workers', default=4, type=int, help='number of workers')

    # 모델
    parser.add_argument('--hidden_dim', default=1024, type=int, help='hidden dimension size')
    parser.add_argument('--n_layers', default=2, type=int, help='number of layers')
    parser.add_argument('--n_heads', default=2, type=int, help='number of heads')
    parser.add_argument('--drop_out', default=0, type=float, help='drop out rate')
    
    # 훈련
    parser.add_argument('--n_epochs', default=20, type=int, help='number of epochs')
    parser.add_argument('--batch_size', default=64, type=int, help='batch size')
    parser.add_argument('--lr', default=0.0001, type=float, help='learning rate')
    parser.add_argument('--clip_grad', default=10, type=int, help='clip grad')
    parser.add_argument('--patience', default=10, type=int, help='for early stopping')
    

    parser.add_argument('--log_steps', default=100, type=int, help='print log per n steps')

    ### 중요 ###
    parser.add_argument('--model', default='bert', type=str, help='model type')
    parser.add_argument('--optimizer', default='adam', type=str, help='optimizer type')
    parser.add_argument('--scheduler', default='plateau', type=str, help='scheduler type')


    parser.add_argument('--info', default='test', type=str, help='file info')
    
    args = parser.parse_args([])

    return args

In [29]:
args = parse_args()

NameError: name 'str2bool' is not defined

In [21]:
preprocess = Preprocess(args)

In [22]:
train_dataset = preprocess.load_train_data("train_data.csv")
test_dataset = preprocess.load_test_data("test_data.csv")

AttributeError: 'EasyDict' object has no attribute 'asset_dir'

In [11]:
preprocess.load_train_data??

[0;31mSignature:[0m [0mpreprocess[0m[0;34m.[0m[0mload_train_data[0m[0;34m([0m[0mfile_name[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
    [0;32mdef[0m [0mload_train_data[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mfile_name[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32mif[0m [0mself[0m[0;34m.[0m[0margs[0m[0;34m.[0m[0mpkl[0m[0;34m:[0m[0;34m[0m
[0;34m[0m            [0;31m# load[0m[0;34m[0m
[0;34m[0m            [0mpkl_name[0m [0;34m=[0m [0;34mf'{self.args.pkl_dir}/{self.args.pkl_name}'[0m[0;34m[0m
[0;34m[0m            [0;32mwith[0m [0mopen[0m[0;34m([0m[0mpkl_name[0m[0;34m,[0m [0;34m'rb'[0m[0;34m)[0m [0;32mas[0m [0mf[0m[0;34m:[0m[0;34m[0m
[0;34m[0m                [0mself[0m[0;34m.[0m[0mtrain_data[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0margs[0m[0;34m.[0m[0mn_cols[0m [0;34m=[0m [0mpickle[0m[0;34m.[0m[0mload[0m[0;34m([0m[0m

In [40]:
from glob import glob

In [41]:
all_csv_file = []

folders = ['feature_test', 'hyper_test', 'split_test', 'refactoring']

In [50]:
for file_path in glob('../feature_test/*/*.csv'):
    folder_path = "/".join(file_path.split("/")[:-1])
    print(os.listdir(folder_path))
    break
    print(folder_path)

['run_config.json', 'cv_0.log', 'cv_1.log', 'cv_0_model.pth', 'cv_1_model.pth', 'cv_0_test_results.csv']


In [93]:
from shutil import copy2

In [94]:
copy2??

[0;31mSignature:[0m [0mcopy2[0m[0;34m([0m[0msrc[0m[0;34m,[0m [0mdst[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mfollow_symlinks[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mcopy2[0m[0;34m([0m[0msrc[0m[0;34m,[0m [0mdst[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mfollow_symlinks[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Copy data and metadata. Return the file's destination.[0m
[0;34m[0m
[0;34m    Metadata is copied with copystat(). Please see the copystat function[0m
[0;34m    for more information.[0m
[0;34m[0m
[0;34m    The destination may be a directory.[0m
[0;34m[0m
[0;34m    If follow_symlinks is false, symlinks won't be followed. This[0m
[0;34m    resembles GNU's "cp -P src dst".[0m
[0;34m    """[0m[0;34m[0m
[0;34m[0m    [0;32mif[0m [0mos[0m[0;34m.[0m[0mpath[0m[0;34m.[0m[0misdir[0m[0;34m([0m[0mdst[0m[0;34m)[0m

In [95]:
copy??

[0;31mSignature:[0m [0mcopy[0m[0;34m([0m[0msrc[0m[0;34m,[0m [0mdst[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mfollow_symlinks[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mcopy[0m[0;34m([0m[0msrc[0m[0;34m,[0m [0mdst[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mfollow_symlinks[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Copy data and mode bits ("cp src dst"). Return the file's destination.[0m
[0;34m[0m
[0;34m    The destination may be a directory.[0m
[0;34m[0m
[0;34m    If follow_symlinks is false, symlinks won't be followed. This[0m
[0;34m    resembles GNU's "cp -P src dst".[0m
[0;34m[0m
[0;34m    If source and destination are the same file, a SameFileError will be[0m
[0;34m    raised.[0m
[0;34m[0m
[0;34m    """[0m[0;34m[0m
[0;34m[0m    [0;32mif[0m [0mos[0m[0;34m.[0m[0mpath[0m[0;34m.[0m[0misdir[0m[0;34m([0m[0mdst[0m

In [121]:
count = 0

In [123]:
count

-77

In [125]:
import json

for folder in folders:
    for file_path in glob(f"../{folder}/*/*.csv"):
        log_folder_path = "/".join(file_path.split("/")[:-1])
        file_name = file_path.split("/")[-1]
        
        if "valid_cv_results.json" not in os.listdir(log_folder_path):
            continue
            
        valid_cv_result = json.load(open(f"{log_folder_path}/valid_cv_results.json", 'r'))
        
        if "ensemble" in file_name:
            auc, acc = 0, 0
            
            for i, (k, v) in enumerate(valid_cv_result.items()):
                auc += float(v.split(",")[0].split(":")[1])
                acc += float(v.split(",")[1].split(":")[1])
        
            auc /= (i + 1)
            acc /= (i + 1)  
        else:
            for k, v in valid_cv_result.items():
                if k in file_name:
                    auc = float(v.split(",")[0].split(":")[1])
                    acc = float(v.split(",")[1].split(":")[1])
        
        print(file_name)
        model_name = "bert" if "bert" in folder_path else "lstm"
        new_file_name = f"../all_csvs/{model_name}_{auc*100:.4}_{acc*100:.4}_{file_name}"
        
        line_num = [line for line in open(file_path, "r")]
        
        if len(line_num) != 745:
            count += 1
            continue
        count -= 1
        
        print(len(line_num))
        
        
        print(new_file_name)
        copy(file_path, new_file_name)
        print(f"file_path: {file_path}\nauc: {auc}\nacc: {acc}")
        print()

cv_ensemble_test_results.csv
745
../all_csvs/lstm_79.59_72.22_cv_ensemble_test_results.csv
file_path: ../feature_test/LOG_[06.11_15:05]/cv_ensemble_test_results.csv
auc: 0.7958974961259273
acc: 0.7222391084093213

cv_3_test_results.csv
745
../all_csvs/lstm_79.71_72.59_cv_3_test_results.csv
file_path: ../feature_test/LOG_[06.11_15:05]/cv_3_test_results.csv
auc: 0.7970689652959844
acc: 0.7259371833839919

cv_1_test_results.csv
745
../all_csvs/lstm_80.09_72.67_cv_1_test_results.csv
file_path: ../feature_test/LOG_[06.11_15:05]/cv_1_test_results.csv
auc: 0.8009134775801442
acc: 0.7266970618034447

cv_0_test_results.csv
745
../all_csvs/lstm_79.8_72.29_cv_0_test_results.csv
file_path: ../feature_test/LOG_[06.11_15:05]/cv_0_test_results.csv
auc: 0.7979912366365132
acc: 0.7228976697061803

cv_2_test_results.csv
745
../all_csvs/lstm_78.89_71.35_cv_2_test_results.csv
file_path: ../feature_test/LOG_[06.11_15:05]/cv_2_test_results.csv
auc: 0.7888690074378204
acc: 0.7135258358662614

cv_4_test_resul

In [67]:
json.load??

[0;31mSignature:[0m
[0mjson[0m[0;34m.[0m[0mload[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfp[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcls[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mobject_hook[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mparse_float[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mparse_int[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mparse_constant[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mobject_pairs_hook[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkw[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mload[0m[0;34m([0m[0mfp[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mcls[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mobject_hook[0m[0;34m=[0m[0;32