In [None]:
def run_utils():
    """Check for a GPU and return the torch device to run on.

    TensorFlow is queried first: unless it reports '/device:GPU:0' a
    SystemError is raised. After that PyTorch decides between CUDA and CPU.

    Returns:
        torch.device: ``torch.device("cuda")`` when ``torch.cuda.is_available()``
        is true, otherwise ``torch.device("cpu")``.

    Raises:
        SystemError: if TensorFlow does not report a GPU named '/device:GPU:0'.
    """
    # TensorFlow's view of the GPU; anything but the first GPU name is fatal.
    tf_gpu_name = tf.test.gpu_device_name()
    if tf_gpu_name != '/device:GPU:0':
        raise SystemError('GPU device not found')
    print('Found GPU at: {}'.format(tf_gpu_name))

    # PyTorch's view: fall back to the CPU when CUDA is not usable.
    if not torch.cuda.is_available():
        print('No GPU available, using the CPU instead.')
        return torch.device("cpu")

    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    return device

In [None]:
def get_random_seed():
    """Return a random integer in [0, 2**32) built from 4 bytes of ``os.urandom``."""
    entropy = os.urandom(4)
    # Interpret the four bytes as one big-endian unsigned integer.
    return int.from_bytes(entropy, byteorder="big")

In [None]:
def format_time(elapsed):
    '''
    Convert a duration in seconds into the string form of a
    ``datetime.timedelta`` (e.g. "1:01:01").

    The input is first rounded to a whole number of seconds with ``round``.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def load_dataset_emergent(path):
    """Load a comma-separated claim/headline stance file into a DataFrame.

    The first line of the file is treated as a header and replaced by the
    column names '', 'claimHeadline', 'articleHeadline', 'stance',
    'articleID' and 'claimID'. Both headline columns are lower-cased, the
    headline and ID columns are cast to ``str``, and the stance values
    "for"/"against"/"observing"/"not" are renamed to
    "Pro"/"Agst"/"Neut"/"Not-rel". The first row and the dtypes are printed.

    Args:
        path: Location of the CSV file, passed to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: the processed frame.
    """
    column_names = ['', 'claimHeadline', 'articleHeadline', 'stance', 'articleID', 'claimID']
    df = pd.read_csv(path, delimiter=',', header=0, names=column_names)

    # Lower-case the two text fields before the string cast.
    for text_column in ('claimHeadline', 'articleHeadline'):
        df[text_column] = df[text_column].str.lower()

    df = df.astype({
        'claimHeadline': 'str',
        'articleHeadline': 'str',
        'articleID': 'str',
        'claimID': 'str',
    })

    # Map this file's stance vocabulary onto the labels used elsewhere in the notebook.
    stance_names = {"for": "Pro", "against": "Agst", "observing": "Neut", "not": "Not-rel"}
    df["stance"] = df["stance"].replace(stance_names)

    print(df.iloc[0])

    print(df.dtypes)

    return df

In [None]:
def load_dataset_stance(path):
    """Load the tab-separated stance dataset and keep medium-length documents.

    The file's first line is treated as a header and replaced by the column
    names qID, docID, stance, ideology, docCont, Q and title. The three text
    columns are lower-cased and cast to ``str``. Only rows whose 'docCont'
    has more than 50 and fewer than 800 whitespace-separated tokens are kept.
    The surviving 'ideology' column, the first row and the dtypes are printed.

    Args:
        path: Location of the TSV file, passed to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: a deep copy of the filtered frame.
    """
    column_names = ['qID', 'docID', 'stance', 'ideology', 'docCont', 'Q', 'title']
    df = pd.read_csv(path, delimiter='\t', header=0, names=column_names)

    text_columns = ('docCont', 'Q', 'title')
    for text_column in text_columns:
        df[text_column] = df[text_column].str.lower()
    df = df.astype({text_column: 'str' for text_column in text_columns})

    # Length filter on the whitespace token count of the document body
    # (both bounds are exclusive).
    token_counts = df["docCont"].str.split().str.len()
    keep = (token_counts < 800) & (token_counts > 50)
    df = df.loc[keep]

    print(df['ideology'])
    print("**********")

    df = df.copy(deep=True)

    print(df.iloc[0])

    print(df.dtypes)

    return df

In [None]:
def load_dataset_ambigious(path):
    """Load the tab-separated ambiguity dataset as an all-string DataFrame.

    The file's first line is treated as a header and replaced by the column
    names qID, docID, Ambigious, stance, ideology, docCont, Q and title. The
    docCont, Q and title columns are lower-cased, then every column is cast
    to ``str``. The number of rows flagged "1" and "0" in 'Ambigious' and the
    dtypes are printed.

    Args:
        path: Location of the TSV file, passed to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: the processed frame.
    """
    column_names = ['qID', 'docID', 'Ambigious', 'stance', 'ideology', 'docCont', 'Q', 'title']
    df = pd.read_csv(path, delimiter='\t', header=0, names=column_names)

    for text_column in ('docCont', 'Q', 'title'):
        df[text_column] = df[text_column].str.lower()

    # After this cast the ambiguity flag is the string "1" or "0".
    df = df.astype(str)

    flagged = df[df['Ambigious'] == "1"]
    unflagged = df[df['Ambigious'] == "0"]
    print("Train")
    print ("Ambigious", flagged.shape[0])
    print ("Non-ambigious", unflagged.shape[0])

    print(df.dtypes)

    return df

In [None]:
def merge_datasets(df, dfVal):
    """Stack two DataFrames vertically into one frame with a fresh 0..n-1 index.

    Uses ``pd.concat`` because ``DataFrame.append`` was removed in pandas 2.0.

    Args:
        df: First DataFrame; its rows come first in the result.
        dfVal: Second DataFrame; its rows follow those of ``df``.

    Returns:
        pandas.DataFrame: a new frame; neither input is modified.
    """
    return pd.concat([df, dfVal], ignore_index=True)

In [None]:
def create_more_notrel_docs(df):
    """Add synthetic "Not-rel" rows until they match the related rows, then save.

    The function counts the rows per stance ("Pro", "Agst", "Neut",
    "Not-rel"). If the related classes together outnumber the "Not-rel" rows,
    the difference is the number of rows to create. Each new row is a copy of
    a randomly drawn existing row paired with a randomly drawn query whose
    qID differs from the row's own; its 'stance' is set to "Not-rel" and its
    'ideology' to 'No'. The original rows (ordered Pro, Agst, Neut, Not-rel)
    followed by the new rows are written to
    './dataset/batches_cleaned/stance/Final_Dataset_AddedNotRelated.tsv'.

    Queries are read from './dataset/batches_cleaned/stance/queries.tsv'.

    Args:
        df: DataFrame with at least the 'stance', 'qID' and 'Q' columns.

    Returns:
        None. The result is only written to disk; counts are printed.
    """
    stance_order = ["Pro", "Agst", "Neut", "Not-rel"]
    by_stance = {label: df[df['stance'] == label] for label in stance_order}

    pro_len = by_stance["Pro"].shape[0]
    agst_len = by_stance["Agst"].shape[0]
    neut_len = by_stance["Neut"].shape[0]
    not_rel_len = by_stance["Not-rel"].shape[0]

    print("Pro", pro_len)
    print("Agst", agst_len)
    print("Neut", neut_len)
    print("Not-rel", not_rel_len)

    print("before create more not-rel documents")

    rel_len = pro_len + agst_len + neut_len
    all_len = rel_len + not_rel_len
    rest = 0

    print("There are " + str(rel_len) + " rel instances in the original dataset!")
    print("There are " + str(not_rel_len) + " not-rel instances in the original dataset!")
    if rel_len > not_rel_len:
        rest = rel_len - not_rel_len
    else:
        print("not_rel is bigger!")

    print("There are " + str(rest) + " instances to be created!")

    path_queries = './dataset/batches_cleaned/stance/queries.tsv'
    df_queries = pd.read_csv(path_queries, delimiter='\t', header = 0, names=['qID', 'Q'])

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_new = pd.concat([by_stance[label] for label in stance_order], ignore_index=True)

    # NOTE(review): `randint` is defined outside this block. The `- 1` index
    # arithmetic suggests an inclusive upper bound (as in random.randint) - confirm.
    # NOTE(review): 57 is presumably the number of rows in queries.tsv - confirm.
    new_rows = []
    for _ in range(rest):
        valueRow = randint(1, all_len)
        valueQ = randint(1, 57)

        old_doc_inst = df.iloc[valueRow-1]

        # Redraw until the row is not Not-rel/Pro/Neut, i.e. in practice an "Agst" row.
        # NOTE(review): this never terminates if no such row is reachable.
        while old_doc_inst['stance'] == "Not-rel" or old_doc_inst['stance'] == "Pro" or old_doc_inst['stance'] == "Neut":
            valueRow = randint(1, all_len)
            valueQ = randint(1, 57)

            old_doc_inst = df.iloc[valueRow-1]

        new_q_row = df_queries.iloc[valueQ-1]
        new_qID = new_q_row['qID']
        # The new query must differ from the one the document was retrieved for.
        while new_qID == old_doc_inst['qID']:
            valueQ = randint(1, 57)
            new_q_row = df_queries.iloc[valueQ-1]
            new_qID = new_q_row['qID']

        # Work on a copy so the source frame is never altered through the row object.
        new_inst = old_doc_inst.copy()
        new_inst['qID'] = new_qID
        new_inst['Q'] = new_q_row['Q']
        new_inst['stance'] = "Not-rel"
        new_inst['ideology'] = 'No'

        new_rows.append(new_inst)

    # Append all synthetic rows at once instead of growing the frame row by row.
    if new_rows:
        df_new = pd.concat([df_new, pd.DataFrame(new_rows)], ignore_index=True)

    print(df_new.shape[0])

    print("Pro", df_new[df_new['stance'] == "Pro"].shape[0])
    print("Agst", df_new[df_new['stance'] == "Agst"].shape[0])
    print("Neut", df_new[df_new['stance'] == "Neut"].shape[0])
    print("Not-rel", df_new[df_new['stance'] == "Not-rel"].shape[0])

    print("after create more not-rel documents")


    df_new.to_csv('./dataset/batches_cleaned/stance/Final_Dataset_AddedNotRelated.tsv', sep='\t', index=False)

In [None]:
def load_tokenizer(model):
    """Return the Hugging Face tokenizer that matches a model name.

    The tokenizer class is chosen by substring match on ``model``, checked in
    this order: 'roberta', 'distilbert', 'albert', 'bert', 'xlnet', 'long'.
    The order matters because 'bert' is a substring of the earlier names.

    Args:
        model: Model name or path passed to ``from_pretrained``.

    Returns:
        The loaded tokenizer, or None when no substring matches.
    """
    from transformers import BertTokenizer, RobertaTokenizer, XLNetTokenizer, DistilBertTokenizer, AlbertTokenizer

    tokenizer = None
    if 'roberta' in model:
        tokenizer = RobertaTokenizer.from_pretrained(model)
    elif 'distilbert' in model:
        tokenizer = DistilBertTokenizer.from_pretrained(model, do_lower_case=True)
    elif 'albert' in model:
        # NOTE(review): always loads 'albert-xxlarge-v2' regardless of `model` - confirm this is intended.
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
    elif 'bert' in model:
        tokenizer = BertTokenizer.from_pretrained(model, do_lower_case=True)
    elif 'xlnet' in model:
        tokenizer = XLNetTokenizer.from_pretrained(model)
    elif 'long' in model:
        # Imported here because the original block never imported this class;
        # importing lazily keeps the other branches working without it.
        from transformers import LongformerTokenizer
        print("Long tokenizer")
        tokenizer = LongformerTokenizer.from_pretrained(model)

    return tokenizer

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def predict_classwise_stance_bert(P_related, P_unrelated, P_stance, stance_labels, P_ideology, ideology_labels):
    """Count correct stance predictions overall and per class.

    The gold class is the argmax over dim 1 of ``abs(stance_labels)`` and the
    predicted class is the argmax over dim 1 of ``P_stance``. Classes 0..3 are
    tallied as agree, disagree, discuss and unrelated. ``P_related``,
    ``P_unrelated``, ``P_ideology`` and ``ideology_labels`` are not used.

    Returns:
        tuple: (total correct, agree correct, disagree correct, discuss
        correct, unrelated correct, 0, 0, 0, 0, predicted labels tensor).
        The four zeros are ideology counters that this function never updates.
    """
    target_labels = torch.abs(stance_labels).argmax(dim=1)
    predict_labels = P_stance.argmax(dim=1)

    # correct_per_class[c]: samples whose gold class is c and were predicted as c.
    correct_per_class = [0, 0, 0, 0]
    for position, gold in enumerate(target_labels):
        guess = predict_labels[position]
        gold_class = int(gold)
        if gold_class not in (0, 1, 2, 3):
            print("Problem!!!!")
            continue
        if int(guess) == gold_class:
            correct_per_class[gold_class] += 1

    true_predict_count_stance = int((predict_labels == target_labels).sum().item())

    agree_true, disagree_true, discuss_true, unrelated_true = correct_per_class
    return (true_predict_count_stance, agree_true, disagree_true, discuss_true,
            unrelated_true, 0, 0, 0, 0, predict_labels)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def predict_classwise_stance_ideology_meta(P_relatedness, P_stance, stance_labels):
    """Combine relatedness and stance scores, then count correct predictions.

    The first three columns of ``P_stance`` are divided by their row sum and
    multiplied by column 0 of ``P_relatedness``; column 1 of
    ``P_relatedness`` is appended as the fourth (unrelated) score. The
    predicted class is the argmax of that combined row, the gold class the
    argmax over dim 1 of ``abs(stance_labels)``. Classes 0..3 are tallied as
    agree, disagree, discuss and unrelated.

    Returns:
        tuple: (total correct, agree correct, disagree correct, discuss
        correct, unrelated correct, 0, 0, 0, 0). The four zeros are ideology
        counters that this function never updates.
    """
    related = torch.reshape(P_relatedness[:, 0], (-1, 1))
    unrelated = torch.reshape(P_relatedness[:, 1], (-1, 1))

    # Renormalise the three "related" stance scores and scale them by P(related).
    stance_scores = P_stance[:, :3]
    row_totals = stance_scores.sum(dim=1, keepdim=True)
    prob_stance = torch.cat([(stance_scores / row_totals) * related, unrelated], dim=1)

    target_labels = torch.abs(stance_labels).argmax(dim=1)
    predict_labels = prob_stance.argmax(dim=1)

    # correct_per_class[c]: samples whose gold class is c and were predicted as c.
    correct_per_class = [0, 0, 0, 0]
    for position, gold in enumerate(target_labels):
        guess = predict_labels[position]
        gold_class = int(gold)
        if gold_class not in (0, 1, 2, 3):
            print("Problem!!!!")
            continue
        if int(guess) == gold_class:
            correct_per_class[gold_class] += 1

    true_predict_count_stance = int((predict_labels == target_labels).sum().item())

    agree_true, disagree_true, discuss_true, unrelated_true = correct_per_class
    return (true_predict_count_stance, agree_true, disagree_true, discuss_true,
            unrelated_true, 0, 0, 0, 0)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def predict_classwise_stance_ideology_bert(P_stance, stance_labels):
    """Count correct stance predictions from raw stance scores.

    The predicted class is the argmax over dim 1 of ``P_stance`` and the gold
    class the argmax over dim 1 of ``abs(stance_labels)``. Classes 0..3 are
    tallied as agree, disagree, discuss and unrelated.

    Returns:
        tuple: (total correct, agree correct, disagree correct, discuss
        correct, unrelated correct, 0, 0, 0, 0, P_stance). The four zeros are
        ideology counters that this function never updates; ``P_stance`` is
        returned unchanged.
    """
    target_labels = torch.abs(stance_labels).argmax(dim=1)
    predict_labels = P_stance.argmax(dim=1)

    # correct_per_class[c]: samples whose gold class is c and were predicted as c.
    correct_per_class = [0, 0, 0, 0]
    for position, gold in enumerate(target_labels):
        guess = predict_labels[position]
        gold_class = int(gold)
        if gold_class not in (0, 1, 2, 3):
            print("Problem!!!!")
            continue
        if int(guess) == gold_class:
            correct_per_class[gold_class] += 1

    true_predict_count_stance = int((predict_labels == target_labels).sum().item())

    agree_true, disagree_true, discuss_true, unrelated_true = correct_per_class
    return (true_predict_count_stance, agree_true, disagree_true, discuss_true,
            unrelated_true, 0, 0, 0, 0, P_stance)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def predict_classwise_stance_ideology(P_relatedness, P_stance, P_exisstance, stance_labels):
    """Combine relatedness and stance scores, then count correct predictions.

    The first three columns of ``P_stance`` are divided by their row sum and
    multiplied by column 0 of ``P_relatedness``; column 1 of
    ``P_relatedness`` is appended as the fourth (unrelated) score. The
    predicted class is the argmax of that combined row, the gold class the
    argmax over dim 1 of ``abs(stance_labels)``. ``P_exisstance`` is not used.

    Returns:
        tuple: (total correct, agree correct, disagree correct, discuss
        correct, unrelated correct, 0, 0, 0, 0, combined score tensor). The
        four zeros are ideology counters that this function never updates.
    """
    related = torch.reshape(P_relatedness[:, 0], (-1, 1))
    unrelated = torch.reshape(P_relatedness[:, 1], (-1, 1))

    # Renormalise the three "related" stance scores and scale them by P(related).
    stance_scores = P_stance[:, :3]
    row_totals = stance_scores.sum(dim=1, keepdim=True)
    prob_stance = torch.cat([(stance_scores / row_totals) * related, unrelated], dim=1)

    target_labels = torch.abs(stance_labels).argmax(dim=1)
    predict_labels = prob_stance.argmax(dim=1)

    # correct_per_class[c]: samples whose gold class is c and were predicted as c.
    correct_per_class = [0, 0, 0, 0]
    for position, gold in enumerate(target_labels):
        guess = predict_labels[position]
        gold_class = int(gold)
        if gold_class not in (0, 1, 2, 3):
            print("Problem!!!!")
            continue
        if int(guess) == gold_class:
            correct_per_class[gold_class] += 1

    true_predict_count_stance = int((predict_labels == target_labels).sum().item())

    agree_true, disagree_true, discuss_true, unrelated_true = correct_per_class
    return (true_predict_count_stance, agree_true, disagree_true, discuss_true,
            unrelated_true, 0, 0, 0, 0, prob_stance)

In [None]:
def count_class_num(df):
    """Count rows per stance class as [pro, agst, neut, notrel].

    A frame with exactly four columns is read as one-hot labels under the
    column labels 0, 1, 2, 3, and rows equal to 1 are counted per column.
    Any other frame is counted by the values "Pro", "Agst", "Neut" and
    "Not-rel" in its 'stance' column.
    """
    if len(df.columns) == 4:
        return [len(df[df[column] == 1]) for column in (0, 1, 2, 3)]

    stance_names = ("Pro", "Agst", "Neut", "Not-rel")
    return [df[df['stance'] == name].shape[0] for name in stance_names]

In [None]:
def count_class_num_ideology(df):
    """Return the row counts for 'ideology' equal to "Con", "Lib" and "No", in that order."""
    ideology_names = ("Con", "Lib", "No")
    return [df[df['ideology'] == name].shape[0] for name in ideology_names]

In [None]:
def count_class_num_ambigious(df):
    """Return [rows with 'Ambigious' == "1", rows with 'Ambigious' == "0"].

    The comparison is against the strings "1" and "0", not against integers.
    """
    flags = ("1", "0")
    return [df[df['Ambigious'] == flag].shape[0] for flag in flags]

In [None]:
def preprocess_dataset_query_wise(df):
    """Group the rows of ``df`` by their 'Q' value.

    Returns:
        dict: maps each distinct 'Q' value to the list of rows (as returned
        by ``df.iloc``) that carry it, in their original order.
    """
    rows_by_query = {}
    for position in range(df.shape[0]):
        row = df.iloc[position]
        # setdefault creates the bucket on the first occurrence of a query.
        rows_by_query.setdefault(row["Q"], []).append(row)
    return rows_by_query

In [None]:
def sample_dataset_ambigious(df, seedVal):
    """Split the ambiguity dataset into train/val/test frames and save them as TSV.

    Rows whose 'Ambigious' value is "1" are placed before those with "0",
    then the frame is split 80/20 into (train+val)/test and the first part
    75/25 into train/val, both with ``train_test_split(..., shuffle=True)``.
    The 'Ambigious' column is re-inserted at position 2 of each split and the
    splits are written to ./dataset/batches_cleaned/stance/ as
    train_serp_ambigious.tsv, val_serp_ambigious.tsv and
    test_serp_ambigious.tsv. Row counts are printed along the way.

    Args:
        df: DataFrame with an 'Ambigious' column holding the strings "1"/"0".
        seedVal: Not used; no random state is passed to the splits here.

    Returns:
        tuple: (X_train, X_val, X_test) DataFrames.
    """
    #create_determinism(seedVal)

    df_A = df[df['Ambigious'] == "1"]
    df_N = df[df['Ambigious'] == "0"]

    print(df.shape[0])

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_new = pd.concat([df_A, df_N], ignore_index=True)

    # Separate the label column from the features for train_test_split.
    y = df_new['Ambigious'].copy(deep=True)
    X = df_new.drop('Ambigious', axis=1).copy(deep=True)

    print(len(X))
    print(len(y))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    print(len(X_train))
    print(len(y_train))
    print(len(X_test))
    print(len(y_test))
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=True)

    # Put the label back as the third column of every split.
    X_train.insert(2, "Ambigious", y_train.values)
    X_val.insert(2, "Ambigious", y_val.values)
    X_test.insert(2, "Ambigious", y_test.values)

    print("****Train****")
    print("Ambigious", X_train[X_train['Ambigious'] == "1"].shape[0])
    print("Not Ambigious", X_train[X_train['Ambigious'] == "0"].shape[0])

    print("****Test****")
    print("Ambigious", X_test[X_test['Ambigious'] == "1"].shape[0])
    print("Not Ambigious", X_test[X_test['Ambigious'] == "0"].shape[0])

    X_train.to_csv('./dataset/batches_cleaned/stance/train_serp_ambigious.tsv', sep='\t', index=False)
    X_val.to_csv('./dataset/batches_cleaned/stance/val_serp_ambigious.tsv', sep='\t', index=False)
    X_test.to_csv('./dataset/batches_cleaned/stance/test_serp_ambigious.tsv', sep='\t', index=False)

    return X_train, X_val, X_test

In [None]:
def sample_dataset_stance(df, seedVal):
    """Split the stance dataset into train/val/test frames and save them as TSV.

    Rows are reordered as Pro, Agst, Neut, Not-rel (rows with any other
    stance are dropped), then split 80/20 into (train+val)/test and the first
    part 75/25 into train/val, both with
    ``train_test_split(..., shuffle=True)``. The 'stance' column is
    re-inserted at position 2 of each split and the splits are written to
    ./dataset/batches_cleaned/stance/ as train_serp.tsv, val_serp.tsv and
    test_serp.tsv. Class counts are printed before and after splitting.

    Args:
        df: DataFrame with 'stance' and 'ideology' columns.
        seedVal: Not used; no random state is passed to the splits here.

    Returns:
        tuple: (X_train, X_val, X_test) DataFrames.
    """
    #create_determinism(seedVal)

    df_pro = df[df['stance'] == "Pro"]
    df_agst = df[df['stance'] == "Agst"]
    df_neut = df[df['stance'] == "Neut"]
    df_na = df[df['stance'] == "Not-rel"]

    df_con = df[df['ideology'] == "Con"]
    df_lib = df[df['ideology'] == "Lib"]
    df_no = df[df['ideology'] == "No"]

    print("Pro", df_pro.shape[0])
    print("Agst", df_agst.shape[0])
    print("Neut", df_neut.shape[0])
    print("Not-rel", df_na.shape[0])

    print("Con", df_con.shape[0])
    print("Lib", df_lib.shape[0])
    print("NA", df_no.shape[0])

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_new = pd.concat([df_pro, df_agst, df_neut, df_na], ignore_index=True)

    # Separate the label column from the features for train_test_split.
    y = df_new['stance'].copy(deep=True)
    X = df_new.drop('stance', axis=1).copy(deep=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    print(len(X_train))
    print(len(y_train))
    print(len(X_test))
    print(len(y_test))
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=True)

    # Put the label back as the third column of every split.
    X_train.insert(2, "stance", y_train.values)
    X_val.insert(2, "stance", y_val.values)
    X_test.insert(2, "stance", y_test.values)

    print("****Train****")
    print("Pro", X_train[X_train['stance'] == "Pro"].shape[0])
    print("Agst", X_train[X_train['stance'] == "Agst"].shape[0])
    print("Neut", X_train[X_train['stance'] == "Neut"].shape[0])
    print("Not-rel", X_train[X_train['stance'] == "Not-rel"].shape[0])

    print("****Test****")
    print("Pro", X_test[X_test['stance'] == "Pro"].shape[0])
    print("Agst", X_test[X_test['stance'] == "Agst"].shape[0])
    print("Neut", X_test[X_test['stance'] == "Neut"].shape[0])
    print("Not-rel", X_test[X_test['stance'] == "Not-rel"].shape[0])

    X_train.to_csv('./dataset/batches_cleaned/stance/train_serp.tsv', sep='\t', index=False)
    X_val.to_csv('./dataset/batches_cleaned/stance/val_serp.tsv', sep='\t', index=False)
    X_test.to_csv('./dataset/batches_cleaned/stance/test_serp.tsv', sep='\t', index=False)

    return X_train, X_val, X_test

In [None]:
def preprocess_ideology_ambigious(ambigious_labels):
    """Encode ambiguity flags as an int32 tensor with one single-element row per label.

    The string "0" becomes [0]; every other value becomes [1].

    Labels are read by iteration only, so any iterable works, including a
    pandas Series whose index is not 0..n-1 (the previous positional
    re-lookup ``ambigious_labels[idx]`` failed on such inputs).

    Args:
        ambigious_labels: Iterable of labels, expected to be "0" or "1".

    Returns:
        torch.Tensor: int32 tensor built from the per-label [0]/[1] rows.
    """
    encoded = [[0] if a_label == "0" else [1] for a_label in ambigious_labels]
    return torch.as_tensor(encoded, dtype=torch.int32)

In [None]:
def preprocess_stance_ideology_new(s_labels, i_labels):
    """Turn stance and ideology label strings into the training target tensors.

    For each stance label ("Pro", "Agst", "Neut", "Not-rel") the function
    emits a relatedness pair, a 4-way stance one-hot, two complementary
    related/unrelated flags, and an "existing stance" pair ([1,0] for
    Pro/Agst, [0,1] otherwise). The ideology one-hot is [1,0,0] for "Con" and
    [0,1,0] for "Lib", but only when the stance is Pro or Agst; in every
    other case it is [0,0,1]. Unknown stance labels are reported with a print
    and contribute no row.

    Args:
        s_labels: Iterable of stance label strings.
        i_labels: Indexable ideology labels, read as ``i_labels[idx]`` per stance label.

    Returns:
        tuple: (t_relatedness, t_stance, t_mmd_symbol, t_mmd_symbol_,
        t_existedstance, t_ideology). The two mmd tensors are float32, the
        others int32.
    """
    # stance -> (relatedness, stance one-hot, mmd flag, inverse mmd flag, existed-stance)
    stance_table = {
        "Not-rel": ([0, 1], [0, 0, 0, 1], 0, 1, [0, 1]),
        "Pro":     ([1, 0], [1, 0, 0, 0], 1, 0, [1, 0]),
        "Agst":    ([1, 0], [0, 1, 0, 0], 1, 0, [1, 0]),
        "Neut":    ([1, 0], [0, 0, 1, 0], 1, 0, [0, 1]),
    }
    ideology_table = {"Con": [1, 0, 0], "Lib": [0, 1, 0]}
    no_ideology = [0, 0, 1]

    t_relatedness, t_stance = [], []
    t_existedstance, t_ideology = [], []
    t_mmd_symbol, t_mmd_symbol_ = [], []

    for idx, s_label in enumerate(s_labels):
        i_label = i_labels[idx]
        if s_label not in stance_table:
            print("Error-labels", s_label, i_label)
            continue

        relatedness, stance, mmd, mmd_inverse, existed = stance_table[s_label]
        t_relatedness.append(relatedness)
        t_stance.append(stance)
        t_mmd_symbol.append(mmd)
        t_mmd_symbol_.append(mmd_inverse)
        t_existedstance.append(existed)

        # Only documents that take a side carry an ideology label.
        if s_label in ("Pro", "Agst"):
            t_ideology.append(ideology_table.get(i_label, no_ideology))
        else:
            t_ideology.append(no_ideology)

    return (
        torch.as_tensor(t_relatedness, dtype=torch.int32),
        torch.as_tensor(t_stance, dtype=torch.int32),
        torch.as_tensor(t_mmd_symbol, dtype=torch.float32),
        torch.as_tensor(t_mmd_symbol_, dtype=torch.float32),
        torch.as_tensor(t_existedstance, dtype=torch.int32),
        torch.as_tensor(t_ideology, dtype=torch.int32),
    )

In [None]:
def preprocess_stance_ideology_meta(s_labels):
    """One-hot encode stance labels as plain Python lists.

    Column order is Pro, Agst, Neut, Not-rel. Unknown labels are reported
    with a print and skipped, so the result can be shorter than the input.

    Returns:
        list: one 4-element list per recognised label.
    """
    one_hot_by_label = {
        "Not-rel": [0, 0, 0, 1],
        "Pro": [1, 0, 0, 0],
        "Agst": [0, 1, 0, 0],
        "Neut": [0, 0, 1, 0],
    }

    t_stance = []
    for s_label in s_labels:
        if s_label in one_hot_by_label:
            # Copy so every row is an independent list.
            t_stance.append(list(one_hot_by_label[s_label]))
        else:
            print("Error-labels", s_label)

    return t_stance

In [None]:
def preprocess_stance_ideology_new_meta(s_labels):
    """Build relatedness/stance targets from a frame of per-class indicators.

    Each row is read through the labels 0, 1, 2 and 3. A row whose entry 3 is
    1 or -1 is unrelated; otherwise the first of entries 0 (agree),
    1 (disagree), 2 (discuss) that equals 1 decides the class. Rows matching
    nothing are reported with a print and contribute no output row.

    Args:
        s_labels: DataFrame accessed with ``.iloc`` and ``.shape``.

    Returns:
        tuple: (t_relatedness int32, t_stance int32, t_mmd_symbol float32,
        t_mmd_symbol_ float32).
    """
    t_relatedness, t_stance = [], []
    t_mmd_symbol, t_mmd_symbol_ = [], []

    for position in range(s_labels.shape[0]):
        row = s_labels.iloc[position]

        if row[3] == -1 or row[3] == 1:
            stance_class = 3
        else:
            # First related class flagged with 1, checked in the order 0, 1, 2.
            stance_class = next((c for c in (0, 1, 2) if row[c] == 1), None)

        if stance_class is None:
            print("Error-labels")
            continue

        is_related = stance_class != 3
        one_hot = [0, 0, 0, 0]
        one_hot[stance_class] = 1

        t_relatedness.append([1, 0] if is_related else [0, 1])
        t_stance.append(one_hot)
        t_mmd_symbol.append(1 if is_related else 0)
        t_mmd_symbol_.append(0 if is_related else 1)

    return (
        torch.as_tensor(t_relatedness, dtype=torch.int32),
        torch.as_tensor(t_stance, dtype=torch.int32),
        torch.as_tensor(t_mmd_symbol, dtype=torch.float32),
        torch.as_tensor(t_mmd_symbol_, dtype=torch.float32),
    )

In [None]:
def preprocess_stance_ideology(s_labels, i_labels):
    """Turn stance label strings into the training target tensors.

    ``i_labels`` is accepted but never read: the ideology label is fixed to
    "Con" for every sample, so Pro/Agst rows always get the ideology one-hot
    [1,0,0] and Neut/Not-rel rows get [0,0,1]. Unknown stance labels are
    reported with a print and contribute no row.

    Args:
        s_labels: Iterable of stance label strings.
        i_labels: Ignored.

    Returns:
        tuple: (t_relatedness, t_stance, t_mmd_symbol, t_mmd_symbol_,
        t_existedstance, t_ideology). The two mmd tensors are float32, the
        others int32.
    """
    fixed_ideology = "Con"
    # stance -> (relatedness, stance one-hot, mmd flag, inverse mmd flag,
    #            existed-stance, ideology one-hot under the fixed "Con" label)
    targets_by_stance = {
        "Not-rel": ([0, 1], [0, 0, 0, 1], 0, 1, [0, 1], [0, 0, 1]),
        "Pro":     ([1, 0], [1, 0, 0, 0], 1, 0, [1, 0], [1, 0, 0]),
        "Agst":    ([1, 0], [0, 1, 0, 0], 1, 0, [1, 0], [1, 0, 0]),
        "Neut":    ([1, 0], [0, 0, 1, 0], 1, 0, [0, 1], [0, 0, 1]),
    }

    t_relatedness, t_stance = [], []
    t_existedstance, t_ideology = [], []
    t_mmd_symbol, t_mmd_symbol_ = [], []

    for s_label in s_labels:
        if s_label not in targets_by_stance:
            print("Error-labels", s_label, fixed_ideology)
            continue

        relatedness, stance, mmd, mmd_inverse, existed, ideology = targets_by_stance[s_label]
        t_relatedness.append(relatedness)
        t_stance.append(stance)
        t_mmd_symbol.append(mmd)
        t_mmd_symbol_.append(mmd_inverse)
        t_existedstance.append(existed)
        t_ideology.append(ideology)

    return (
        torch.as_tensor(t_relatedness, dtype=torch.int32),
        torch.as_tensor(t_stance, dtype=torch.int32),
        torch.as_tensor(t_mmd_symbol, dtype=torch.float32),
        torch.as_tensor(t_mmd_symbol_, dtype=torch.float32),
        torch.as_tensor(t_existedstance, dtype=torch.int32),
        torch.as_tensor(t_ideology, dtype=torch.int32),
    )

In [None]:
def concanListStringsLonger(list1, list2):
    """Join ``list1[i]`` and ``list2[i]`` with a single space for every index of ``list1``.

    A length mismatch is only reported with a print; the loop still runs over
    ``len(list1)`` positions.
    """
    count = len(list1)
    if len(list2) != count:
        print("Length - error")
    return [list1[i] + " " + list2[i] for i in range(count)]

In [None]:
def concanListStringsLonger2(list1, list2):
    """Join ``list1[i]`` and ``list2[i]`` with the marker " GIZEM " for every index of ``list1``.

    A length mismatch is only reported with a print; the loop still runs over
    ``len(list1)`` positions.
    """
    count = len(list1)
    if len(list2) != count:
        print("Length - error")
    return [list1[i] + " GIZEM " + list2[i] for i in range(count)]

In [None]:
def concanListStrings(list1, list2):
    """Join ``list1[i]`` and ``list2[i]`` with a single space for every index of ``list1``.

    A length mismatch is only reported with a print; the loop still runs over
    ``len(list1)`` positions.
    """
    count = len(list1)
    if len(list2) != count:
        print("Length - error")

    joined = []
    for position in range(count):
        joined.append(list1[position] + " " + list2[position])
    return joined

In [None]:
def concanListStrings_sep(list1, list2):
    """Pair ``list1[i]`` with ``list2[i]`` as a tuple for every index of ``list1``.

    A length mismatch is only reported with a print; the loop still runs over
    ``len(list1)`` positions.
    """
    count = len(list1)
    if len(list2) != count:
        print("Length - error")
    return [(list1[i], list2[i]) for i in range(count)]

In [None]:
### Generate the datasets with the different fields.
def generate_datasets_ambigious(df):
    """Build the text inputs and label arrays for the ambiguity task.

    Returns:
        tuple: ("query title" strings, "query title content" strings,
        the 'Ambigious' values, the 'stance' values).
    """
    queries = df.Q.values
    titles = df.title.values
    contents = df.docCont.values

    ambiguity_labels = df.Ambigious.values
    stance_labels = df.stance.values

    # Short input: query + title. Long input: query + title + document body.
    query_title = concanListStrings(queries, titles)
    query_title_content = concanListStringsLonger(query_title, contents)

    return query_title, query_title_content, ambiguity_labels, stance_labels

In [None]:
#Generate the datasets with the different fields.
def generate_datasets_emergent(df):
    """Pair each claim headline with its article headline and return the labels.

    Reads the columns ``claimHeadline``, ``articleHeadline`` and ``stance``.

    Returns a 2-tuple: a list of ``(claimHeadline, articleHeadline)`` tuples
    and the ``stance`` column values.
    """
    claims = df.claimHeadline.values
    headlines = df.articleHeadline.values
    stance_labels = df.stance.values

    return concanListStrings_sep(claims, headlines), stance_labels

In [None]:
#Generate the datasets with the different fields.
def generate_datasets(df):
    """Assemble the text inputs plus stance and ideology labels from ``df``.

    Reads the columns ``Q``, ``title``, ``stance``, ``ideology`` and
    ``docCont``.

    Returns a 4-tuple:
        * list of "Q title" strings,
        * list of "Q title docCont" strings,
        * the ``stance`` column values,
        * the ``ideology`` column values.
    """
    queries = df.Q.values
    titles = df.title.values
    stance_labels = df.stance.values
    ideology_labels = df.ideology.values
    contents = df.docCont.values

    query_title = concanListStrings(queries, titles)
    # Title and content are joined first, then prefixed with the query.
    query_title_content = concanListStrings(queries, concanListStrings(titles, contents))

    return query_title, query_title_content, stance_labels, ideology_labels

In [None]:
def preprocessing_for_bert(tokenizer, docs, max_len, doc_stride, long = False):
    """Tokenize every entry of ``docs`` and concatenate the results into tensors.

    @param    tokenizer: Object exposing ``encode_plus`` (used when ``long`` is
                  false) and ``encode`` (used when ``long`` is true).
    @param    docs (iterable): Texts to be processed, one entry per document.
    @param    max_len (int): Length each sequence is padded/truncated to when
                  ``long`` is false.
    @param    doc_stride: Not used by this function; kept so existing calls
                  keep working.
    @param    long (bool): Selects the ``tokenizer.encode`` branch below.
    @return   input_ids (torch.Tensor): Per-document token ids concatenated
                  along dim 0.
    @return   attention_masks (torch.Tensor): Per-document attention masks
                  concatenated along dim 0.
    """
    input_ids = []
    attention_masks = []

    for sent in docs:
        if not long:
            # `encode_plus` tokenizes, adds the special tokens, pads/truncates
            # to `max_len`, maps tokens to ids and builds the attention mask.
            encoded_sent = tokenizer.encode_plus(
                sent,                           # Preprocess sentence
                add_special_tokens=True,        # Add the special tokens
                max_length=max_len,             # Honour the caller's length (was hard-coded to 64)
                padding='max_length',           # Replaces deprecated pad_to_max_length=True
                truncation=True,                # Make the truncation to max_length explicit
                return_tensors='pt',            # Return PyTorch tensor
                return_attention_mask=True      # Return attention mask
                )
            input_ids.append(encoded_sent['input_ids'])
            attention_masks.append(encoded_sent['attention_mask'])
        else:
            # NOTE(review): this branch is left exactly as it was. The ids are
            # 1-D tensors whose lengths can differ per document, and the mask
            # shape is taken from a DataFrame built from *all* ids collected
            # so far, so ids and masks do not obviously line up once more than
            # one document is processed. Intent needs confirming before changing.
            input_ids.append(torch.tensor(tokenizer.encode(sent, padding=True)))
            input_ids_pd = pd.DataFrame(input_ids)
            attention_masks.append(torch.ones(input_ids_pd.shape, dtype=torch.long))

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [None]:
import numpy as np
import torch

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per validation round. Whenever the loss improves
    the supplied state is written with ``torch.save`` and the patience counter
    is reset; otherwise the counter is incremented and ``early_stop`` becomes
    True once it reaches ``patience``.
    """
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.Inf` was removed in NumPy 2.0; the lowercase spelling works everywhere.
        self.val_loss_min = np.inf
        self.val_acc_max_stance = -1
        self.val_acc_max_ideology = -1
        self.delta = delta

    def __call__(self, val_loss, val_acc_stance, val_acc_ideology, model_save_state, model_save_path, model):
        """Record one validation result and update the stopping state.

        Args:
            val_loss (float): Validation loss of this round (lower is better).
            val_acc_stance (float): Stance accuracy, tracked for reporting only.
            val_acc_ideology (float): Ideology accuracy, tracked for reporting only.
            model_save_state: Object written to disk when the loss improves.
            model_save_path: Destination handed to ``torch.save``.
            model: Forwarded to ``save_checkpoint``, which does not use it.
        """
        score = -val_loss

        if self.best_score is None:
            # First call: always counts as an improvement.
            self._record_improvement(score, val_loss, val_acc_stance, val_acc_ideology,
                                     model_save_state, model_save_path, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self._record_improvement(score, val_loss, val_acc_stance, val_acc_ideology,
                                     model_save_state, model_save_path, model)
            self.counter = 0

            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            # With the counter just reset this only fires for patience <= 0.
            if self.counter >= self.patience:
                self.early_stop = True

    def _record_improvement(self, score, val_loss, val_acc_stance, val_acc_ideology,
                            model_save_state, model_save_path, model):
        """Store the new best score, checkpoint, then remember the accuracies."""
        self.best_score = score
        # Checkpoint first so the verbose output can show "old --> new" accuracies.
        self.save_checkpoint(val_loss, val_acc_stance, val_acc_ideology, model_save_state, model_save_path, model)
        self.val_acc_max_stance = val_acc_stance
        self.val_acc_max_ideology = val_acc_ideology

    def save_checkpoint(self, val_loss, val_acc_stance, val_acc_ideology, model_save_state, model_save_path, model):
        '''Saves ``model_save_state`` to ``model_save_path`` when validation loss decreases.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
            print(f'Validation acc stance : ({self.val_acc_max_stance:.6f} --> {val_acc_stance:.6f}).  Saving model ...')
            print(f'Validation acc ideology : ({self.val_acc_max_ideology:.6f} --> {val_acc_ideology:.6f}).  Saving model ...')

        torch.save(model_save_state, model_save_path)

        self.val_loss_min = val_loss

In [None]:
import torch
from transformers import BertModel, RobertaModel
class AmbigiousDetectionClass(torch.nn.Module):
    """Single-output classifier on top of a pretrained RoBERTa encoder.

    ``forward`` takes the encoder's first output, picks the vector at sequence
    position 0, maps it to one logit and returns its sigmoid, i.e. one value
    in (0, 1) per example.

    Several layers created in ``__init__`` (``l1``, ``bn1_hidden``,
    ``theta_d``, ``bn1_theta``, ``probability``, ``output_prob``) are not used
    by ``forward``; they are kept so the module's ``state_dict`` layout does
    not change.
    """
    def __init__(self, modelUsed):
        """
        Args:
            modelUsed: Name or path handed to ``RobertaModel.from_pretrained``.
        """
        super(AmbigiousDetectionClass, self).__init__()
        input_size = 768
        hidden_size = 768
        mmd_size = 10
        dropout_prob = 0.1
        relatedness_size = 2
        classes_size = 1

        self.input_pl = RobertaModel.from_pretrained(modelUsed) #input
        self.l1 = torch.nn.Linear(input_size, hidden_size)
        self.bn1_hidden = torch.nn.BatchNorm1d(hidden_size, momentum=0.05)
        self.dropout = torch.nn.Dropout(dropout_prob)

        self.theta_d = torch.nn.Linear(hidden_size, mmd_size)
        self.bn1_theta = torch.nn.BatchNorm1d(mmd_size, momentum=0.05)

        self.probability = torch.nn.Linear(hidden_size, relatedness_size)
        self.output_prob = torch.nn.Softmax(dim = 1)
        self.output_binary = torch.nn.Sigmoid()
        self.stance = torch.nn.Linear(hidden_size, classes_size)

    def forward(self, input_ids, attention_mask):
        """Return a (batch, 1) tensor of sigmoid scores.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
        """
        encoder_output = self.input_pl(input_ids=input_ids, attention_mask=attention_mask)
        # First encoder output, vector at sequence position 0.
        first_token_state = encoder_output[0][:, 0]

        stance_state = self.stance(first_token_state) #batch size x classes_size
        stance_flat = self.dropout(stance_state) #batch size x classes_size

        # The head has a single output unit. Softmax over a size-1 dimension is
        # identically 1.0 (constant output, zero gradient), so the sigmoid that
        # __init__ already defines is used instead.
        P_stance = self.output_binary(stance_flat)

        return P_stance

In [None]:
import torch
from transformers import BertModel, RobertaModel, DistilBertModel
class StanceMetaLearnerSimple(torch.nn.Module):
    """Two stacked linear layers followed by a row-wise softmax over 4 outputs.

    The input width is taken from the length of the first row of
    ``datasetUsed``; the intermediate width is 10.
    """
    def __init__(self, datasetUsed):
        """
        Args:
            datasetUsed: Indexable collection; ``len(datasetUsed[0])`` fixes the
                number of input features.
        """
        super().__init__()
        feature_count = len(datasetUsed[0])
        hidden_width = 10
        num_classes = 4

        # Layers are created in the same order as before (l1, softmax, stance)
        # so seeded initialisation and module registration are unchanged.
        self.l1 = torch.nn.Linear(feature_count, hidden_width)
        self.output_prob = torch.nn.Softmax(dim=1)
        self.stance = torch.nn.Linear(hidden_width, num_classes)

    def forward(self, input_ids, mmd_pl, mmd_pl_):
        """Return the (batch, 4) softmax output for ``input_ids``.

        ``mmd_pl`` and ``mmd_pl_`` are accepted but not used.
        """
        projected = self.l1(input_ids)
        class_scores = self.stance(projected)  # batch size x 4
        return self.output_prob(class_scores)

In [None]:
import torch
from transformers import BertModel, RobertaModel, DistilBertModel
class StanceDetectionUnigramClass(torch.nn.Module):
    """Feed-forward network over fixed-width feature rows with three outputs.

    ``forward`` produces a scalar computed from the two weight vectors
    ``mmd_pl`` / ``mmd_pl_``, a 2-way softmax ("relatedness") and a 4-way
    softmax ("stance") that also sees the first relatedness probability.

    ``l2``, ``l3``, ``bn2_hidden``, ``dropout`` and ``ideology`` are created
    but not used by ``forward``.
    """
    def __init__(self, datasetUsed):
        """
        Args:
            datasetUsed: Indexable collection; ``len(datasetUsed[0])`` fixes the
                number of input features.
        """
        super().__init__()
        nn = torch.nn
        feature_count = len(datasetUsed[0])
        first_width = 20
        width = 20
        mmd_width = 10
        drop_rate = 0.6
        drop_rate2 = 0.6
        relatedness_outputs = 2
        stance_outputs = 4
        ideology_outputs = 3

        # Creation order is unchanged so seeded initialisation and the
        # state_dict layout stay the same.
        self.l1 = nn.Linear(feature_count, first_width)
        self.l2 = nn.Linear(first_width, width)
        self.l3 = nn.Linear(width, width)
        self.bn1_hidden = nn.BatchNorm1d(first_width, momentum=0.05)
        self.bn2_hidden = nn.BatchNorm1d(width, momentum=0.05)
        self.dropout = nn.Dropout(drop_rate)
        self.dropout2 = nn.Dropout(drop_rate2)

        self.theta_d = nn.Linear(width, mmd_width)
        self.bn1_theta = nn.BatchNorm1d(mmd_width, momentum=0.05)

        self.probability = nn.Linear(width, relatedness_outputs)
        self.output_prob = nn.Softmax(dim=1)

        # The stance head sees the hidden features plus one relatedness probability.
        self.stance = nn.Linear(width + relatedness_outputs - 1, stance_outputs)
        self.ideology = nn.Linear(width + stance_outputs - 2, ideology_outputs)

    def forward(self, input_ids, mmd_pl, mmd_pl_):
        """Run the network.

        Args:
            input_ids: Feature rows fed to ``l1``.
            mmd_pl: Per-row weights for the first term of ``mmd_loss``.
            mmd_pl_: Per-row weights for the second term of ``mmd_loss``.

        Returns:
            ``(mmd_loss, P_relatedness, P_stance, P_related, P_unrelated)``.
        """
        # Hidden block: linear -> batch norm -> ReLU -> dropout.
        features = self.dropout2(torch.relu(self.bn1_hidden(self.l1(input_ids))))

        # Same block shape again for the representation used by mmd_loss.
        theta_features = self.dropout2(torch.relu(self.bn1_theta(self.theta_d(features))))

        # Small constant keeps the divisions finite when a weight vector sums to 0.
        weight_total = torch.sum(mmd_pl, dim=0) + 1e-10
        weight_total_ = torch.sum(mmd_pl_, dim=0) + 1e-10
        row_weights = mmd_pl.reshape(-1, 1)
        row_weights_ = mmd_pl_.reshape(-1, 1)

        weighted = (theta_features * row_weights).sum(dim=1) / weight_total
        weighted_ = (theta_features * row_weights_).sum(dim=1) / weight_total_
        mmd_loss = (weighted - weighted_).sum()

        # Relatedness head (dropout is applied to the raw outputs before softmax).
        relatedness_scores = self.dropout2(self.probability(features)).reshape(-1, 2)
        P_relatedness = self.output_prob(relatedness_scores)
        P_related = P_relatedness[:, 0].reshape(-1, 1)
        P_unrelated = P_relatedness[:, 1].reshape(-1, 1)

        # Stance head on the hidden features concatenated with P_related.
        stance_input = torch.cat([features, P_related], dim=1)
        stance_scores = self.dropout2(self.stance(stance_input)).reshape(-1, 4)
        P_stance = self.output_prob(stance_scores)

        return mmd_loss, P_relatedness, P_stance, P_related, P_unrelated

In [None]:
import torch
from transformers import BertModel, RobertaModel
class StanceDetectionClassBERTBiLSTM(torch.nn.Module):
    """Pretrained RoBERTa encoder followed by a bidirectional LSTM and a 4-way softmax.

    ``probability`` is created but not used by ``forward``.

    NOTE(review): the LSTM is built for 128-wide inputs, so the encoder loaded
    from ``modelUsed`` presumably has a hidden size of 128 - confirm.
    """
    def __init__(self, modelUsed):
        """
        Args:
            modelUsed: Name or path handed to ``RobertaModel.from_pretrained``.
        """
        super().__init__()
        nn = torch.nn
        encoder_width = 128
        lstm_width = 32
        linear_width = 32
        drop_rate = 0.6
        relatedness_outputs = 2
        stance_outputs = 4

        # Creation order is unchanged so seeded initialisation and the
        # state_dict layout stay the same.
        self.input_pl = RobertaModel.from_pretrained(modelUsed)

        self.lstm = nn.LSTM(encoder_width, lstm_width, bidirectional=True)
        # Both LSTM directions are concatenated, hence twice the LSTM width.
        self.bn1_hidden = nn.BatchNorm1d(lstm_width * 2, momentum=0.05)
        self.dropout = nn.Dropout(drop_rate)

        self.hidden = nn.Linear(lstm_width * 2, linear_width)

        self.probability = nn.Linear(encoder_width, relatedness_outputs)
        self.output_prob = nn.Softmax(dim=1)
        self.stance = nn.Linear(linear_width, stance_outputs)

    def forward(self, input_ids, attention_mask, mmd_pl, mmd_pl_):
        """Return the (batch, 4) softmax output.

        ``mmd_pl`` and ``mmd_pl_`` are accepted but not used.
        """
        row_count = len(input_ids)

        encoder_output = self.input_pl(input_ids=input_ids, attention_mask=attention_mask)
        # First encoder output, vector at sequence position 0.
        first_token_state = encoder_output[0][:, 0]

        # NOTE(review): the LSTM is created without batch_first, and it is fed
        # a (rows, 1, width) tensor, so the rows of the batch occupy the
        # LSTM's first axis - confirm this is intended.
        lstm_out, _ = self.lstm(first_token_state.view(row_count, 1, -1))
        lstm_flat = lstm_out.view(row_count, -1)

        # Batch norm -> ReLU -> dropout -> linear.
        activated = torch.relu(self.bn1_hidden(lstm_flat))
        projected = self.hidden(self.dropout(activated))

        # Dropout is applied to the raw stance outputs before the softmax.
        stance_scores = self.dropout(self.stance(projected))  # batch size x 4
        return self.output_prob(stance_scores.reshape(-1, 4))

In [None]:
import torch
from transformers import BertModel, RobertaModel, DistilBertModel, AlbertModel, XLNetModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
class StanceIdeologyDetectionClassPrev(torch.nn.Module):
    """Pretrained encoder followed by three hidden blocks and a 4-way softmax.

    ``forward`` only uses ``input_pl``, ``l1``-``l3``, ``bn1_hidden``,
    ``bn2_hidden``, ``relu``, ``dropout``, ``stance2`` and ``output_prob``.
    The remaining layers are kept so the ``state_dict`` layout is unchanged.

    NOTE(review): ``l1`` expects 1024-wide encoder outputs whichever encoder
    is selected - confirm the checkpoints used all have that hidden size.
    """
    def __init__(self, modelUsed):
        """
        Args:
            modelUsed (str): Name or path of the pretrained encoder. The
                encoder class is chosen by substring ('roberta', 'distilbert',
                'albert', 'bert', 'xlnet', tested in that order). If none
                matches, ``input_pl`` stays ``None``.
        """
        super(StanceIdeologyDetectionClassPrev, self).__init__()
        input_size = 1024
        hidden_size_initial = 100
        hidden_size = 50
        mmd_size = 10
        dropout_prob = 0.1
        relatedness_size = 2
        classes_size = 4
        exist_stance_size = 2
        ideology_class_size = 3

        # 'bert' must be tested last among the *bert names: it is a substring
        # of 'roberta', 'distilbert' and 'albert'.
        self.input_pl = None
        if 'roberta' in modelUsed:
            self.input_pl = RobertaModel.from_pretrained(modelUsed)
        elif 'distilbert' in modelUsed:
            self.input_pl = DistilBertModel.from_pretrained(modelUsed)
        elif 'albert' in modelUsed:
            self.input_pl = AlbertModel.from_pretrained(modelUsed)
        elif 'bert' in modelUsed:
            self.input_pl = BertModel.from_pretrained(modelUsed)
        elif 'xlnet' in modelUsed:
            self.input_pl = XLNetModel.from_pretrained(modelUsed)

        self.l1 = torch.nn.Linear(input_size, hidden_size_initial)
        self.l2 = torch.nn.Linear(hidden_size_initial, hidden_size)
        self.l3 = torch.nn.Linear(hidden_size, hidden_size)

        self.bn1_hidden = torch.nn.BatchNorm1d(hidden_size_initial, momentum=0.05)
        self.bn2_hidden = torch.nn.BatchNorm1d(hidden_size, momentum=0.05)
        self.dropout = torch.nn.Dropout(dropout_prob)
        # ReLU's only constructor argument is the `inplace` flag. The layer
        # width that used to be passed here was merely a truthy value, so the
        # effective setting is spelled out.
        self.relu = torch.nn.ReLU(inplace=True)

        self.theta_d = torch.nn.Linear(hidden_size, mmd_size)
        self.bn1_theta = torch.nn.BatchNorm1d(mmd_size, momentum=0.05)
        self.relu_theta = torch.nn.ReLU(inplace=True)

        self.probability = torch.nn.Linear(hidden_size, relatedness_size)
        self.probability_existstance = torch.nn.Linear(hidden_size, exist_stance_size)
        self.output_prob = torch.nn.Softmax(dim = 1)

        self.stance = torch.nn.Linear(hidden_size + relatedness_size + exist_stance_size - 2, classes_size)
        self.stance2 = torch.nn.Linear(hidden_size, classes_size)
        self.existstance = torch.nn.Linear(hidden_size + relatedness_size + exist_stance_size - 1, exist_stance_size)
        self.ideology = torch.nn.Linear(hidden_size + classes_size - 2, ideology_class_size)

    def forward(self, input_ids, attention_mask, mmd_pl, mmd_pl_):
        """Return the (batch, 4) softmax output.

        ``mmd_pl`` and ``mmd_pl_`` are accepted but not used.
        """
        encoder_output = self.input_pl(input_ids = input_ids, attention_mask = attention_mask)
        # First encoder output, vector at sequence position 0.
        last_hidden_state_cls = encoder_output[0][:, 0, :]

        # hidden block 1: linear -> batch norm -> ReLU -> dropout
        hidden_state = self.l1(last_hidden_state_cls)
        hidden_state_normalized = self.bn1_hidden(hidden_state)
        hidden_state_normalized = self.relu(hidden_state_normalized)
        hidden_layer = self.dropout(hidden_state_normalized)

        # hidden block 2
        hidden_state = self.l2(hidden_layer)
        hidden_state_normalized = self.bn2_hidden(hidden_state)
        hidden_state_normalized = self.relu(hidden_state_normalized)
        hidden_layer = self.dropout(hidden_state_normalized)

        # hidden block 3
        # NOTE(review): this block reuses bn2_hidden, so blocks 2 and 3 share
        # one set of batch-norm statistics - confirm that is intended.
        hidden_state = self.l3(hidden_layer)
        hidden_state_normalized = self.bn2_hidden(hidden_state)
        hidden_state_normalized = self.relu(hidden_state_normalized)
        hidden_layer = self.dropout(hidden_state_normalized)

        stance_state = self.stance2(hidden_layer) #batch size x classes_size
        P_stance = self.output_prob(stance_state)

        return P_stance

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, RobertaModel, DistilBertModel, AlbertModel, XLNetModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class StanceIdeologyDetectionClassMixout(nn.Module):
    """Pretrained encoder with one hidden block and three outputs.

    ``forward`` returns a scalar computed from the weight vectors ``mmd_pl`` /
    ``mmd_pl_``, a 2-way softmax ("relatedness") and a 4-way softmax
    ("stance") that also sees the first relatedness probability.

    Many layers created in ``__init__`` are not used by ``forward`` (e.g.
    ``l2``, ``l3``, ``bn2_hidden``, ``bn3_hidden``, ``leaky``, ``gelu``,
    ``theta_d2``, ``probability_existstance``, ``stance2``); they are kept so
    the ``state_dict`` layout is unchanged.

    NOTE(review): ``l1`` expects 1024-wide encoder outputs whichever encoder
    is selected - confirm the checkpoints used all have that hidden size.
    """
    def __init__(self, modelUsed):
        """
        Args:
            modelUsed (str): Name or path of the pretrained encoder. The
                encoder class is chosen by substring ('roberta', 'distilbert',
                'albert', 'bert', 'xlnet', 'long', tested in that order). If
                none matches, ``input_pl`` stays ``None``.
        """
        super(StanceIdeologyDetectionClassMixout, self).__init__()
        input_size = 1024
        hidden_size_initial = 100
        hidden_size = 100
        hidden_size2 = 100
        mmd_size = 10
        dropout_prob = 0.6
        dropout_prob2 = 0.1
        relatedness_size = 2
        classes_size = 4
        exist_stance_size = 2

        # Always define the attribute (as the sibling classes do) so an
        # unrecognised name leaves a well-defined None behind.
        # 'bert' must be tested last among the *bert names: it is a substring
        # of 'roberta', 'distilbert' and 'albert'.
        self.input_pl = None
        if 'roberta' in modelUsed:
            self.input_pl = RobertaModel.from_pretrained(modelUsed)
        elif 'distilbert' in modelUsed:
            self.input_pl = DistilBertModel.from_pretrained(modelUsed)
        elif 'albert' in modelUsed:
            self.input_pl = AlbertModel.from_pretrained(modelUsed)
        elif 'bert' in modelUsed:
            self.input_pl = BertModel.from_pretrained(modelUsed)
        elif 'xlnet' in modelUsed:
            self.input_pl = XLNetModel.from_pretrained(modelUsed)
        elif 'long' in modelUsed:
            # Imported here because this cell's import line does not bring
            # LongformerModel into scope.
            from transformers import LongformerModel
            self.input_pl = LongformerModel.from_pretrained(modelUsed, gradient_checkpointing=True)

        self.l1 = torch.nn.Linear(input_size, hidden_size_initial)
        self.l2 = torch.nn.Linear(hidden_size_initial, hidden_size)
        self.l3 = torch.nn.Linear(hidden_size, hidden_size2)

        self.bn1_hidden = torch.nn.BatchNorm1d(hidden_size_initial, momentum=0.05)
        self.bn2_hidden = torch.nn.BatchNorm1d(hidden_size, momentum=0.05)
        self.bn3_hidden = torch.nn.BatchNorm1d(hidden_size2, momentum=0.05)

        self.dropout = torch.nn.Dropout(dropout_prob)
        self.dropout2 = torch.nn.Dropout(dropout_prob2)

        # Activation constructors do not take a layer width. LeakyReLU's first
        # argument is `negative_slope` (a width of 100 meant a slope of 100),
        # ReLU's is the `inplace` flag (any width was just truthy). LeakyReLU
        # now uses its default slope; ReLU spells out the effective setting.
        self.leaky = torch.nn.LeakyReLU()
        self.relu = torch.nn.ReLU(inplace=True)
        self.relu2 = torch.nn.ReLU(inplace=True)
        self.relu3 = torch.nn.ReLU(inplace=True)
        self.gelu = torch.nn.GELU()

        self.theta_d = torch.nn.Linear(hidden_size_initial, mmd_size)
        self.theta_d2 = torch.nn.Linear(mmd_size, mmd_size)
        self.bn1_theta = torch.nn.BatchNorm1d(mmd_size, momentum=0.05)
        self.relu_theta = torch.nn.ReLU(inplace=True)
        self.leaky_theta = torch.nn.LeakyReLU()
        self.gelu_theta = torch.nn.GELU()

        self.probability = torch.nn.Linear(hidden_size_initial, relatedness_size)
        self.probability_existstance = torch.nn.Linear(hidden_size2, exist_stance_size)
        self.output_prob = torch.nn.Softmax(dim = 1)

        # The stance head sees the hidden features plus one relatedness probability.
        self.stance = torch.nn.Linear(hidden_size_initial + relatedness_size - 1, classes_size)
        self.stance2 = torch.nn.Linear(input_size, classes_size)

    def forward(self, input_ids, attention_mask, mmd_pl, mmd_pl_, epoch_num):
        """Run the network.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            mmd_pl: Per-row weights for the first term of ``mmd_loss``.
            mmd_pl_: Per-row weights for the second term of ``mmd_loss``.
            epoch_num: Accepted but not used.

        Returns:
            ``(mmd_loss, P_relatedness, P_stance, P_exist_stance)`` where
            ``P_exist_stance`` is the first relatedness probability as a
            (batch, 1) column.
        """
        relatedness_size = 2

        encoder_output = self.input_pl(input_ids = input_ids, attention_mask = attention_mask)
        # First encoder output, vector at sequence position 0.
        last_hidden_state_cls = encoder_output[0][:, 0, :]

        # hidden block: linear -> batch norm -> ReLU -> dropout
        hidden_state = self.l1(last_hidden_state_cls)
        hidden_state_normalized = self.bn1_hidden(hidden_state)
        hidden_state_normalized = self.relu(hidden_state_normalized)
        hidden_layer = self.dropout(hidden_state_normalized)

        # mmd block: same shape of block on top of the hidden features
        theta_d = self.theta_d(hidden_layer)
        theta_d_normalized = self.bn1_theta(theta_d)
        theta_d_normalized = self.relu_theta(theta_d_normalized)
        theta_d_layer = self.dropout(theta_d_normalized)

        # Small constant keeps the divisions finite when a weight vector sums to 0.
        n1 = torch.sum(mmd_pl, dim = 0) + 1e-3
        n2 = torch.sum(mmd_pl_, dim = 0) + 1e-3
        aa = torch.reshape(mmd_pl, (-1,1))
        bb = torch.reshape(mmd_pl_, (-1,1))

        # calculate mmd_loss
        d1 = torch.div(torch.sum(torch.mul(theta_d_layer, aa), dim=1), n1)
        d2 = torch.div(torch.sum(torch.mul(theta_d_layer, bb), dim=1), n2)

        mmd_loss = torch.sum(d1 - d2)

        # relatedness head (dropout is applied to the raw outputs before softmax)
        relatedness_state = self.probability(hidden_layer)
        relatedness_flat = self.dropout2(relatedness_state)

        relatedness_flat_reshaped = torch.reshape(relatedness_flat, (-1, relatedness_size))
        P_relatedness = self.output_prob(relatedness_flat_reshaped)

        P_related = torch.reshape(P_relatedness[:, 0], (-1, 1))

        P_exist_stance = P_related

        # stance head on the hidden features concatenated with P_related
        concat_fea = torch.cat([hidden_layer, P_related], dim = 1)
        stance_state = self.stance(concat_fea) #batch size x classes_size
        stance_flat = self.dropout2(stance_state) #batch size x classes_size
        P_stance = self.output_prob(stance_flat)

        return mmd_loss, P_relatedness, P_stance, P_exist_stance

In [None]:
import torch
from transformers import BertModel, RobertaModel, DistilBertModel, AlbertModel, XLNetModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
class StanceIdeologyDetectionClass(torch.nn.Module):
    """Pretrained transformer encoder followed by small dense heads.

    ``forward`` produces four things from one encoder pass: an MMD-style
    discrepancy term computed on a 10-unit projection, a 2-way relatedness
    distribution, a 2-way "stance exists" distribution and a 4-way stance
    distribution. The stance head is fed the shared hidden representation
    concatenated with the first column of each of the two 2-way distributions.

    Several layers are created in ``__init__`` but never used by ``forward``
    (``l2``, ``l3``, ``bn2_hidden``, ``bn3_hidden``, ``relu2``, ``relu3``,
    ``gelu``, ``theta_d2``, ``gelu_theta``, ``existstance``, ``ideology``).
    They still contribute parameters to ``state_dict()`` and to any optimizer
    built from ``model.parameters()``.
    """
    def __init__(self, modelUsed):
        """Load the encoder named by ``modelUsed`` and create the dense heads.

        Args:
            modelUsed: Name or path passed to ``from_pretrained``. The encoder
                class is chosen by substring match on this string. If none of
                the substrings match, ``self.input_pl`` stays ``None`` and
                ``forward`` will fail when it tries to call it.
        """
        super(StanceIdeologyDetectionClass, self).__init__()
        # Width expected from the encoder output; l1 below is Linear(1024, 100).
        input_size = 1024
        hidden_size_initial = 100
        hidden_size = 100
        hidden_size2 = 50
        mmd_size = 10
        dropout_prob = 0.4
        dropout_prob2 = 0.4
        relatedness_size = 2
        classes_size = 4
        exist_stance_size = 2
        ideology_class_size = 3
        
        # Order matters: 'bert' is a substring of 'roberta', 'distilbert' and
        # 'albert', so the more specific names have to be tested first.
        self.input_pl = None
        if 'roberta' in modelUsed:
            self.input_pl = RobertaModel.from_pretrained(modelUsed)
        elif 'distilbert' in modelUsed:
            self.input_pl = DistilBertModel.from_pretrained(modelUsed)
        elif 'albert' in modelUsed:
            self.input_pl = AlbertModel.from_pretrained(modelUsed)
        elif 'bert' in modelUsed:
            self.input_pl = BertModel.from_pretrained(modelUsed)
        elif 'xlnet' in modelUsed:
            self.input_pl = XLNetModel.from_pretrained(modelUsed)

        # Shared hidden stack. Only l1 / bn1_hidden / relu / dropout are used by forward().
        self.l1 = torch.nn.Linear(input_size, hidden_size_initial)
        self.l2 = torch.nn.Linear(hidden_size_initial, hidden_size)
        self.l3 = torch.nn.Linear(hidden_size, hidden_size2)
        self.bn1_hidden = torch.nn.BatchNorm1d(hidden_size_initial, momentum=0.05)
        self.bn2_hidden = torch.nn.BatchNorm1d(hidden_size, momentum=0.05)
        self.bn3_hidden = torch.nn.BatchNorm1d(hidden_size2, momentum=0.05)
        self.dropout = torch.nn.Dropout(dropout_prob)
        # NOTE(review): torch.nn.ReLU's only constructor argument is `inplace`,
        # so the sizes passed here are interpreted as a truthy `inplace` flag
        # rather than as a layer width.
        self.relu = torch.nn.ReLU(hidden_size_initial)
        self.relu2 = torch.nn.ReLU(hidden_size)
        self.relu3 = torch.nn.ReLU(hidden_size2)
        self.gelu = torch.nn.GELU()

        # Projection used for the MMD-style term. theta_d2 / gelu_theta are unused by forward().
        self.theta_d = torch.nn.Linear(hidden_size, mmd_size)
        self.theta_d2 = torch.nn.Linear(mmd_size, mmd_size)
        self.bn1_theta = torch.nn.BatchNorm1d(mmd_size, momentum=0.05)
        self.relu_theta = torch.nn.ReLU(mmd_size)
        self.gelu_theta = torch.nn.GELU()
    
        # Two-way heads; output_prob (softmax over dim 1) is shared by every head.
        self.probability = torch.nn.Linear(hidden_size, relatedness_size)
        self.probability_existstance = torch.nn.Linear(hidden_size, exist_stance_size)
        self.output_prob = torch.nn.Softmax(dim = 1)
        
        # stance input width is 100 + 2 + 2 - 2 = 102, i.e. the hidden vector plus
        # the two single probability columns concatenated in forward().
        # existstance and ideology are created but not used by forward().
        self.stance = torch.nn.Linear(hidden_size + relatedness_size + exist_stance_size - 2, classes_size)
        self.existstance = torch.nn.Linear(hidden_size + relatedness_size + exist_stance_size - 1, exist_stance_size)
        self.ideology = torch.nn.Linear(hidden_size + classes_size - 2, ideology_class_size)

    def forward(self, input_ids, attention_mask, mmd_pl, mmd_pl_, epoch_num):
        """Run the encoder and all heads for one batch.

        Args:
            input_ids: Passed to the encoder as ``input_ids``.
            attention_mask: Passed to the encoder as ``attention_mask``.
            mmd_pl: Tensor reshaped to a column and used to weight the rows of
                the MMD projection; presumably a per-sample membership
                indicator for the first group - TODO confirm against caller.
            mmd_pl_: Same role as ``mmd_pl`` for the second group - TODO confirm.
            epoch_num: Integer; ``epoch_num % 12`` selects which encoder
                parameters get ``requires_grad = True`` (see notes below).

        Returns:
            Tuple ``(mmd_loss, P_relatedness, P_stance, P_exist_stance)``:
            ``mmd_loss`` is the sum of ``d1 - d2``; the three ``P_*`` tensors
            are softmax outputs with 2, 4 and 2 columns respectively.
        """
        relatedness_size = 2
        classes_size = 4
        exist_stance_size = 2
        ideology_class_size = 3
        
        input_1 = self.input_pl(input_ids = input_ids, attention_mask = attention_mask)
        # Position 0 along the second axis of the encoder's first output
        # (assumes that output is (batch, seq, hidden) - TODO confirm).
        last_hidden_state_cls = input_1[0][:, 0, :]
    
                        
        # Disable gradients for every encoder parameter, then re-enable a subset below.
        # NOTE(review): this happens after the encoder call above, so the flags
        # presumably only influence later forward passes - confirm.
        for param in self.input_pl.parameters():
            param.requires_grad = False
        
        
        # Despite the "frozen" naming, the entries of new_frozen_layers are the
        # layer numbers whose parameters get requires_grad = True.
        #   epoch_num % 12 == 0      -> the whole list
        #   epoch_num % 12 == 1      -> slice [-0:], which is also the whole list
        #   epoch_num % 12 == k >= 2 -> the last k - 1 entries
        # NOTE(review): the k == 1 case looks like it was meant to select
        # nothing; verify the intended schedule.
        frozen_layer_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        new_frozen_layers = frozen_layer_list
        if (epoch_num  % 12) != 0:
            new_frozen_layers = frozen_layer_list[-((epoch_num%12)-1):]
            
        params = list(self.input_pl.named_parameters())
        for name, param in params:
            # NOTE(review): substring test on the parameter name, so e.g. '2'
            # also matches names containing '12' or '20'. With encoders that
            # have more than 12 layers this re-enables more layers than listed.
            for layer_num in new_frozen_layers:
                if str(layer_num) in name:
                    param.requires_grad = True
        
        #hidden layer: Linear -> BatchNorm -> ReLU -> Dropout
        hidden_state = self.l1(last_hidden_state_cls)
        hidden_state_normalized = self.bn1_hidden(hidden_state)
        hidden_state_normalized = self.relu(hidden_state_normalized)
        hidden_layer= self.dropout(hidden_state_normalized)
        
        #mmd layer: Linear -> BatchNorm -> ReLU -> Dropout
        theta_d = self.theta_d(hidden_layer)
        theta_d_normalized = self.bn1_theta(theta_d)
        theta_d_normalized = self.relu_theta(theta_d_normalized)
        theta_d_layer= self.dropout(theta_d_normalized)
        
        # Group sizes; the 1e-3 keeps the divisions below finite when a group is empty.
        n1 = torch.sum(mmd_pl, dim = 0) + 1e-3
        n2 = torch.sum(mmd_pl_, dim = 0)  + 1e-3
        aa = torch.reshape(mmd_pl, (-1,1))
        bb = torch.reshape(mmd_pl_, (-1,1))
        
        #calculate mmd_loss: weighted row sums of the projection, scaled by group size
        d1 = torch.div(torch.sum(torch.mul(theta_d_layer, aa), dim=1), n1)
        d2 = torch.div(torch.sum(torch.mul(theta_d_layer, bb), dim=1), n2)
                             
        mmd_loss = torch.sum(d1 - d2)
        
        #probability layer (relatedness); dropout is applied to the logits before softmax
        relatedness_state = self.probability(hidden_layer)
        relatedness_flat = self.dropout(relatedness_state)
        
        relatedness_flat_reshaped = torch.reshape(relatedness_flat, (-1, relatedness_size))
        P_relatedness = self.output_prob(relatedness_flat_reshaped)
        
        # Column 0 feeds the stance head; P_unrelated is computed but not used afterwards.
        P_related = torch.reshape(P_relatedness[:, 0], (-1, 1))
        P_unrelated = torch.reshape(P_relatedness[:, 1], (-1, 1))
        
        #********************#
        
        #probability layer - exist stance; same logits -> dropout -> softmax pattern
        exist_stance = self.probability_existstance(hidden_layer)
        exist_stance_flat = self.dropout(exist_stance)
        
        exist_stance_flat_reshaped = torch.reshape(exist_stance_flat, (-1, exist_stance_size))
        P_exist_stance = self.output_prob(exist_stance_flat_reshaped)
        
        #existed-stance layer: column 0 feeds the stance head; P_notexistedstance is unused
        P_existedstance = torch.reshape(P_exist_stance[:, 0], (-1, 1))
        P_notexistedstance = torch.reshape(P_exist_stance[:, 1], (-1, 1)) #discuss
        
        #********************#
        
        #stance layer: hidden vector + the two probability columns -> 4-way softmax
        concat_fea = torch.cat([hidden_layer, P_related, P_existedstance], dim = 1)        
        stance_state = self.stance(concat_fea) #batch size x classes_size
        stance_flat = self.dropout(stance_state) #batch size x classes_size
        
        stance_flat_reshaped = torch.reshape(stance_flat, (-1, classes_size))
        P_stance = self.output_prob(stance_flat_reshaped) 

        return mmd_loss, P_relatedness, P_stance, P_exist_stance

In [None]:
#!pip install adabelief-pytorch==0.2.0

In [None]:
#from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
def prepare_for_training_ambigious(input_idsTrain, attention_masksTrain, ideology_labels_Train, input_idsVal, attention_masksVal, 
                                   ideology_labels_Val, modelUsed, batch_size=16, epochs = 50, num_warmup_steps=0, learning_rate=5e-5):
    """Build model, datasets, optimizer and LR scheduler for the ambiguity detector.

    Args:
        input_idsTrain, attention_masksTrain: Tensors for the training split,
            stored as the first two fields of the training ``TensorDataset``.
        ideology_labels_Train: Training labels, converted with
            ``preprocess_ideology_ambigious``.
        input_idsVal, attention_masksVal, ideology_labels_Val: Same for the
            validation split.
        modelUsed: Passed to ``AmbigiousDetectionClass``.
        batch_size: Used only to count optimisation steps per epoch.
        epochs: Number of epochs the linear schedule is stretched over.
        num_warmup_steps: Warm-up steps of the linear schedule.
        learning_rate: Peak learning rate for AdamW.

    Returns:
        ``(model, datasetTrain, datasetVal, optimizer, scheduler)`` where
        ``scheduler`` is the linear warm-up / linear decay schedule.
    """
    # Only the names that are actually used are imported; the previous import
    # list pulled in unused (partly deprecated) classes that can fail to import.
    # AdamW is the Hugging Face implementation because `correct_bias` is passed.
    from transformers import AdamW, get_linear_schedule_with_warmup
    from torch.utils.data import DataLoader

    t_train_stance = preprocess_ideology_ambigious(ideology_labels_Train)
    datasetTrain = TensorDataset(input_idsTrain, attention_masksTrain, t_train_stance)

    t_val_stance = preprocess_ideology_ambigious(ideology_labels_Val)
    datasetVal = TensorDataset(input_idsVal, attention_masksVal, t_val_stance)

    model = AmbigiousDetectionClass(modelUsed)

    # Move to the GPU before the optimizer is created, but do not crash on
    # CPU-only machines.
    if torch.cuda.is_available():
        model.cuda()

    optimizer = AdamW(model.parameters(),
                      lr=learning_rate,
                      betas=(0.9, 0.999),
                      eps=1e-08,
                      weight_decay=1e-3,
                      correct_bias=True)

    # The loader is never iterated here; it only tells us how many batches one
    # epoch has (partial last batch included).
    steps_per_epoch = len(DataLoader(datasetTrain, batch_size=batch_size))
    total_steps = steps_per_epoch * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=num_warmup_steps,
                                                num_training_steps=total_steps)

    return model, datasetTrain, datasetVal, optimizer, scheduler

In [None]:
#from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
def prepare_for_training_stance_ideology_metalearner(instancesTrain, labelsTrain, instancesVal, labelsVal, batch_size=16, epochs = 50, num_warmup_steps=0, learning_rate=5e-5):
    """Build model, datasets, optimizer and LR scheduler for the stance meta-learner.

    Args:
        instancesTrain: Object with ``to_numpy()`` (e.g. a DataFrame) holding
            the training features; converted to a float32 tensor.
        labelsTrain: Training labels, converted with
            ``preprocess_stance_ideology_new_meta``.
        instancesVal, labelsVal: Same for the validation split.
        batch_size: Used only to count optimisation steps per epoch.
        epochs: Accepted for interface compatibility but ignored: the schedule
            is always computed for 1000 epochs (see below).
        num_warmup_steps: Warm-up steps of the linear schedule.
        learning_rate: Peak learning rate for AdamW.

    Returns:
        ``(model, datasetTrain, datasetVal, optimizer, scheduler)`` where
        ``scheduler`` is the linear warm-up / linear decay schedule.
    """
    # Only the names that are actually used are imported; AdamW is the Hugging
    # Face implementation because `correct_bias` is passed.
    from transformers import AdamW, get_linear_schedule_with_warmup
    from torch.utils.data import DataLoader

    # NOTE(review): the caller's `epochs` is deliberately overridden. The
    # matching training routine hard-codes the same value, so both must be
    # changed together if this is ever made configurable.
    epochs = 1000

    t_instancesTrain = torch.as_tensor(instancesTrain.to_numpy(), dtype=torch.float32)
    (t_train_relatedness, t_train_stance,
     t_train_mmd_symbol, t_train_mmd_symbol_) = preprocess_stance_ideology_new_meta(labelsTrain)
    datasetTrain = TensorDataset(t_instancesTrain, t_train_relatedness, t_train_stance,
                                 t_train_mmd_symbol, t_train_mmd_symbol_)

    t_instancesVal = torch.as_tensor(instancesVal.to_numpy(), dtype=torch.float32)
    (t_val_relatedness, t_val_stance,
     t_val_mmd_symbol, t_val_mmd_symbol_) = preprocess_stance_ideology_new_meta(labelsVal)
    datasetVal = TensorDataset(t_instancesVal, t_val_relatedness, t_val_stance,
                               t_val_mmd_symbol, t_val_mmd_symbol_)

    model = StanceMetaLearner(t_instancesTrain)
    optimizer = AdamW(model.parameters(),
                      lr=learning_rate,
                      betas=(0.9, 0.999),
                      eps=1e-08,
                      weight_decay=1e-4,
                      correct_bias=True)

    # The loader is never iterated here; it only tells us how many batches one
    # epoch has (partial last batch included).
    steps_per_epoch = len(DataLoader(datasetTrain, batch_size=batch_size))
    total_steps = steps_per_epoch * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=num_warmup_steps,
                                                num_training_steps=total_steps)

    return model, datasetTrain, datasetVal, optimizer, scheduler

In [None]:
#from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
def prepare_for_training_stance_ideology_paper(input_idsTrain, attention_masksTrain, labelsTrain, labelsTrain_ideology, input_idsVal, attention_masksVal, labelsVal, labelsVal_ideology, modelUsed, batch_size, epochs, num_warmup_steps, learning_rate):
    """Build the mixout variant of the stance/ideology model plus its training objects.

    After the model is created, its top-level dropout modules are replaced by
    ``Dropout(0)`` and selected top-level ``Linear`` layers are replaced by
    ``MixLinear`` layers initialised from (and anchored to) their current weights.

    Args:
        input_idsTrain, attention_masksTrain: Tensors for the training split.
        labelsTrain, labelsTrain_ideology: Training labels, converted with
            ``preprocess_stance_ideology``.
        input_idsVal, attention_masksVal, labelsVal, labelsVal_ideology: Same
            for the validation split.
        modelUsed: Passed to ``StanceIdeologyDetectionClassMixout``.
        batch_size: Enters the step count only through ``batch_size // 2``
            (see the note at the step computation).
        epochs: Number of epochs the linear schedule is stretched over.
        num_warmup_steps: Warm-up steps of the linear schedule.
        learning_rate: Peak learning rate for AdamW.

    Returns:
        ``(model, datasetTrain, datasetVal, optimizer, scheduler)`` where
        ``scheduler`` is the linear warm-up / linear decay schedule. The model
        is left on whatever device it was created on.
    """
    # Only the names that are actually used are imported; AdamW is the Hugging
    # Face implementation because `correct_bias` is passed.
    from transformers import AdamW, get_linear_schedule_with_warmup
    from torch.utils.data import DataLoader

    (t_train_relatedness, t_train_stance, t_train_mmd_symbol, t_train_mmd_symbol_,
     t_train_existedstance, t_train_ideology) = preprocess_stance_ideology(labelsTrain, labelsTrain_ideology)
    datasetTrain = TensorDataset(input_idsTrain, attention_masksTrain, t_train_relatedness, t_train_stance,
                                 t_train_mmd_symbol, t_train_mmd_symbol_, t_train_existedstance, t_train_ideology)

    (t_val_relatedness, t_val_stance, t_val_mmd_symbol, t_val_mmd_symbol_,
     t_val_existedstance, t_val_ideology) = preprocess_stance_ideology(labelsVal, labelsVal_ideology)
    datasetVal = TensorDataset(input_idsVal, attention_masksVal, t_val_relatedness, t_val_stance,
                               t_val_mmd_symbol, t_val_mmd_symbol_, t_val_existedstance, t_val_ideology)

    model = StanceIdeologyDetectionClassMixout(modelUsed)

    # Mixout probability per top-level module name. Only modules that are
    # nn.Linear instances are converted, so a non-Linear entry such as
    # 'input_pl' is simply skipped.
    mixout_prob_by_name = {'l1': 0.4}
    mixout_prob_by_name.update(
        dict.fromkeys(['input_pl', 'theta_d', 'probability', 'stance', 'l2', 'l3'], 0.9))

    for name, module in model.named_modules():
        if name in ('dropout', 'dropout2') and isinstance(module, nn.Dropout):
            # Mixout takes over the regularisation role, so plain dropout is disabled.
            setattr(model, name, nn.Dropout(0))
        elif name in mixout_prob_by_name and isinstance(module, nn.Linear):
            target_state_dict = module.state_dict()
            new_module = MixLinear(module.in_features, module.out_features,
                                   module.bias is not None, target_state_dict['weight'],
                                   mixout_prob_by_name[name])
            new_module.load_state_dict(target_state_dict)
            setattr(model, name, new_module)
    print("After applying mixout")
    print(model)

    optimizer = AdamW(model.parameters(),
                      lr=learning_rate,
                      betas=(0.9, 0.999),
                      eps=1e-06,
                      weight_decay=0.1,
                      correct_bias=True)

    # NOTE(review): batches are counted with a fixed size of 16 and then scaled
    # by batch_size // 2. This is kept exactly as it was; it presumably matches
    # how the training loop splits/accumulates batches - confirm before changing.
    micro_batches_per_epoch = len(DataLoader(datasetTrain, batch_size=16))
    total_steps = micro_batches_per_epoch * (batch_size // 2) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=num_warmup_steps,
                                                num_training_steps=total_steps)

    return model, datasetTrain, datasetVal, optimizer, scheduler

In [None]:
def prepare_for_training_stance_ideology(input_idsTrain, attention_masksTrain, labelsTrain, labelsTrain_ideology, input_idsVal, attention_masksVal, labelsVal, labelsVal_ideology, modelUsed, batch_size, epochs, num_warmup_steps, learning_rate):
    """Build model, datasets, optimizer and LR scheduler for stance/ideology training.

    Args:
        input_idsTrain, attention_masksTrain: Tensors for the training split.
        labelsTrain, labelsTrain_ideology: Training labels, converted with
            ``preprocess_stance_ideology``.
        input_idsVal, attention_masksVal, labelsVal, labelsVal_ideology: Same
            for the validation split.
        modelUsed: Passed to the model constructor.
        batch_size: Used to count batches per epoch and to rescale that count
            to units of 16 samples.
        epochs: Number of epochs the linear schedule is stretched over.
        num_warmup_steps: Warm-up steps of the linear schedule.
        learning_rate: Peak learning rate for AdamW.

    Returns:
        ``(model, datasetTrain, datasetVal, optimizer, scheduler)`` where
        ``scheduler`` is the linear warm-up / linear decay schedule. The model
        is left on whatever device it was created on.
    """
    # Only the names that are actually used are imported; AdamW is the Hugging
    # Face implementation because `correct_bias` is passed.
    from transformers import AdamW, get_linear_schedule_with_warmup
    from torch.utils.data import DataLoader

    (t_train_relatedness, t_train_stance, t_train_mmd_symbol, t_train_mmd_symbol_,
     t_train_existedstance, t_train_ideology) = preprocess_stance_ideology(labelsTrain, labelsTrain_ideology)
    datasetTrain = TensorDataset(input_idsTrain, attention_masksTrain, t_train_relatedness, t_train_stance,
                                 t_train_mmd_symbol, t_train_mmd_symbol_, t_train_existedstance, t_train_ideology)

    (t_val_relatedness, t_val_stance, t_val_mmd_symbol, t_val_mmd_symbol_,
     t_val_existedstance, t_val_ideology) = preprocess_stance_ideology(labelsVal, labelsVal_ideology)
    datasetVal = TensorDataset(input_idsVal, attention_masksVal, t_val_relatedness, t_val_stance,
                               t_val_mmd_symbol, t_val_mmd_symbol_, t_val_existedstance, t_val_ideology)

    # NOTE(review): this instantiates AmbigiousDetectionClass although the
    # datasets carry stance/ideology targets - confirm this is the intended model.
    model = AmbigiousDetectionClass(modelUsed)

    optimizer = AdamW(model.parameters(),
                      lr=learning_rate,
                      betas=(0.9, 0.98),
                      eps=1e-08,
                      weight_decay=0.1,
                      correct_bias=True)

    # The loader is never iterated here; it only tells us how many batches one
    # epoch has (partial last batch included).
    batches_per_epoch = len(DataLoader(datasetTrain, batch_size=batch_size))

    # Same arithmetic as before, with the left-to-right evaluation order made
    # explicit: batches are converted to units of 16 samples (floor division)
    # and only then multiplied by the number of epochs.
    total_steps = ((batches_per_epoch * batch_size) // 16) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=num_warmup_steps,
                                                num_training_steps=total_steps)

    return model, datasetTrain, datasetVal, optimizer, scheduler

In [None]:
def optimizer_to(optim, device):
    """Move every tensor held in an optimizer's state to ``device`` in place.

    Handles tensors stored directly in ``optim.state`` as well as tensors
    stored one level down inside per-parameter state dicts. Non-tensor values
    are left untouched. Returns ``None``.
    """
    def _relocate(tensor):
        # Re-point the storage (and the gradient, if one exists) at the target device.
        tensor.data = tensor.data.to(device)
        if tensor._grad is not None:
            tensor._grad.data = tensor._grad.data.to(device)

    for entry in optim.state.values():
        if isinstance(entry, torch.Tensor):
            _relocate(entry)
        elif isinstance(entry, dict):
            for value in entry.values():
                if isinstance(value, torch.Tensor):
                    _relocate(value)

In [None]:
def return_batches_datasets(datasetTrain, datasetVal, batch_size = 16):
    """Wrap the training and validation datasets in DataLoaders.

    The training loader draws samples in random order and the validation
    loader reads them sequentially. Both use ``batch_size``, 8 worker
    processes and drop the final incomplete batch.

    Returns:
        ``(train_dataloader, validation_dataloader)``
    """
    from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

    # Settings common to both loaders.
    shared_options = dict(batch_size=batch_size, num_workers=8, drop_last=True)

    train_dataloader = DataLoader(datasetTrain,
                                  sampler=RandomSampler(datasetTrain),
                                  **shared_options)
    validation_dataloader = DataLoader(datasetVal,
                                       sampler=SequentialSampler(datasetVal),
                                       **shared_options)

    return train_dataloader, validation_dataloader

In [None]:
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler
#from tensorboardX import SummaryWriter
from sklearn.metrics import confusion_matrix
#import EarlyStopping
def train_stance_ideology_metalearner(train_nums, val_nums, train_nums_ideology, val_nums_ideology, model_save_path, model, datasetTrain, datasetVal, epochs, batch_size, optimizer, scheduler, patience, verbose, delta, seedVal, continue_train = False):
    """Train the stance/relatedness model with gradient accumulation and early stopping.

    Every epoch performs one pass over the training loader, one pass over the
    validation loader, prints the validation accuracies and hands the result
    to ``EarlyStopping``.

    Args:
        train_nums, train_nums_ideology, val_nums_ideology, seedVal: Kept for
            signature compatibility; this function does not read them.
        val_nums: Indexable; entries 0-3 are used as the validation counts of
            the pro / against / neutral / not-related classes, i.e. as the
            denominators of the reported accuracies.
        model_save_path: Forwarded to ``EarlyStopping``; also the checkpoint
            loaded when ``continue_train`` is true.
        model: Called as ``model(input_ids=..., mmd_pl=..., mmd_pl_=...)`` and
            unpacked as ``(mmd_loss, P_relatedness, P_stance)``.
        datasetTrain, datasetVal: Datasets whose batches provide, by position,
            input ids, relatedness targets, stance targets and the two tensors
            passed to the model as ``mmd_pl`` / ``mmd_pl_``.
        epochs: Currently overridden inside the body (see NOTE(review) below).
        batch_size: Effective batch size. Micro-batches hold at most 64
            samples; larger values are reached through gradient accumulation.
        optimizer, scheduler: Both are stepped once per accumulated batch.
        patience, verbose, delta: Forwarded to ``EarlyStopping``.
        continue_train: When true, restore model/optimizer state and the start
            epoch from ``model_save_path`` before training.

    Returns:
        ``(training_stats, last_epoch, min_val_loss, max_val_acc)``:
        the list of per-epoch stat dicts, ``epoch_i + 1`` of the last epoch
        that ran (``epoch_start`` if none ran), and ``es.val_loss_min`` /
        ``es.val_acc_max_stance`` read from the early-stopping helper.
    """
    # NOTE(review): this discards the caller's `epochs` argument, so training
    # runs for up to 1000 epochs and early stopping is the practical exit.
    # Left unchanged to avoid silently shortening existing runs -- confirm
    # whether the override is still wanted.
    epochs = 1000

    pro_val_num = val_nums[0]
    agst_val_num = val_nums[1]
    neut_val_num = val_nums[2]
    notrel_val_num = val_nums[3]
    stance_all_num = pro_val_num + agst_val_num + neut_val_num + notrel_val_num

    # The same unweighted BCE-with-logits criterion scores both heads.
    loss_fct_relatedness = torch.nn.BCEWithLogitsLoss()

    # Weights of the stance and MMD terms in the combined loss.
    alpha = 1.3
    beta = 1e-3

    # Largest micro-batch processed in one forward/backward pass.
    batch_size_max_once = 64
    if batch_size < batch_size_max_once:
        batch_size_max_once = batch_size

    # Whole number of micro-batches per optimizer step. A float ratio would
    # make the `%` test below fire at the wrong cadence whenever batch_size
    # is not a multiple of the micro-batch size.
    accumulation_steps = max(1, int(batch_size / batch_size_max_once))

    es = EarlyStopping(patience,verbose, delta)
    # Nothing is logged through this writer in this function; a single
    # instance is kept (and closed before returning) instead of two leaked ones.
    writer = SummaryWriter()

    # Per-epoch losses, accuracies and timings.
    training_stats = []

    # Wall-clock start of the whole run.
    total_t0 = time.time()
    train_dataloader, validation_dataloader = return_batches_datasets(datasetTrain, datasetVal, batch_size_max_once)

    epoch_start = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        # Multi-GPU: DataParallel splits each batch along dim 0.
        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            model = torch.nn.DataParallel(model)

    print(device)

    if continue_train:
        # map_location lets a checkpoint written on a GPU be resumed on
        # whichever device is available now.
        checkpoint = torch.load(model_save_path, map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        epoch_start = checkpoint['epoch']

    torch.cuda.empty_cache()
    model.to(device)
    optimizer_to(optimizer,device)

    # Defined up front so the return statement is valid even if no epoch runs.
    last_epoch = epoch_start
    batch_epoch_count = 1
    for epoch_i in range(epoch_start, epoch_start + epochs):

        print("---------Epoch----------" + str(epoch_i))

        # ========================================
        #               Training
        # ========================================

        t0 = time.time()

        # Reset the running loss and any stale gradients for this epoch.
        total_train_loss = 0
        model.train()
        model.zero_grad()
        optimizer.zero_grad()

        # Every 500th epoch of this call doubles the effective batch size.
        if batch_epoch_count % 500 == 0:
            batch_size = batch_size*2
            accumulation_steps = max(1, int(batch_size/batch_size_max_once))
        batch_epoch_count = batch_epoch_count + 1

        print("Batch Size: " + str(batch_size))
        print(float(accumulation_steps))

        num_train_batches = len(train_dataloader)
        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_relatedness = batch[1].to(device)
            b_labels = batch[2].to(device)
            b_mmd_symbol = batch[3].to(device)
            b_mmd_symbol_ = batch[4].to(device)

            # Forward pass on this micro-batch.
            mmd_loss, P_relatedness, P_stance = model(input_ids = b_input_ids, mmd_pl = b_mmd_symbol, mmd_pl_ = b_mmd_symbol_)

            relatedness_loss = loss_fct_relatedness(P_relatedness, b_relatedness.float())
            stance_loss = loss_fct_relatedness(P_stance, b_labels.float())

            loss = alpha * stance_loss + beta * mmd_loss + relatedness_loss
            # Scale so the accumulated gradient matches one large batch.
            loss = loss / accumulation_steps
            total_train_loss += loss.item()

            loss.backward()

            # Step after a full accumulation window, and also on the last
            # micro-batch so trailing gradients are applied rather than being
            # zeroed at the start of the next epoch.
            if (step + 1) % accumulation_steps == 0 or (step + 1) == num_train_batches:
                # Clip the gradient norm to 1.0 against exploding gradients.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()
                # Update the learning rate.
                scheduler.step()

                # Clear gradients before the next accumulation window.
                model.zero_grad()
                optimizer.zero_grad()

        print("Learning rate: ", scheduler.get_last_lr())
        # Undo the per-step scaling to report the mean micro-batch loss.
        avg_train_loss = total_train_loss / len(train_dataloader) * accumulation_steps

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("  Average training loss: {0:.6f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================

        t1 = time.time()

        # Evaluation mode: dropout layers behave differently from training.
        model.eval()

        total_true_eval_stance = 0
        total_eval_loss = 0

        agree_val_true = 0
        disagree_val_true = 0
        discuss_val_true = 0
        unrelated_val_true = 0

        # Never updated below; still printed so the log layout is unchanged.
        total_true = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_relatedness = batch[1].to(device)
            b_labels = batch[2].to(device)
            b_mmd_symbol = batch[3].to(device)
            b_mmd_symbol_ = batch[4].to(device)

            # No autograd graph is needed for evaluation.
            with torch.no_grad():
                mmd_loss, P_relatedness, P_stance = model(input_ids = b_input_ids, mmd_pl = b_mmd_symbol, mmd_pl_ = b_mmd_symbol_)

                relatedness_loss = loss_fct_relatedness(P_relatedness, b_relatedness.float())
                stance_loss = loss_fct_relatedness(P_stance, b_labels.float())

                loss_val = alpha * stance_loss + beta * mmd_loss + relatedness_loss
                total_eval_loss += loss_val.item()

                # Move logits and labels to CPU for the metric helper.
                P_relatedness = P_relatedness.to('cpu')
                b_relatedness = b_relatedness.to('cpu')
                P_stance = P_stance.to('cpu')
                b_labels = b_labels.to('cpu')

                # acc_list[0] is accumulated as the overall count and [1..4]
                # as per-class counts -- presumably numbers of correct
                # predictions; verify against the helper.
                acc_list = predict_classwise_stance_ideology_meta(P_relatedness, P_stance, b_labels)
                total_true_eval_stance += acc_list[0]
                agree_val_true += acc_list[1]
                disagree_val_true += acc_list[2]
                discuss_val_true += acc_list[3]
                unrelated_val_true += acc_list[4]

        # Report the accuracies for this validation run.
        print(total_true_eval_stance)
        avg_val_accuracy_stance = total_true_eval_stance / stance_all_num
        print("Avg Val Accuracy Stance: {0:.6f}".format(avg_val_accuracy_stance))
        print("Total True")
        print(total_true)
        print("*************")
        avg_val_agree_accuracy = agree_val_true / pro_val_num
        print("Avg Val Agree Accuracy: {0:.6f}".format(avg_val_agree_accuracy))
        avg_val_disagree_accuracy = disagree_val_true / agst_val_num
        print("Avg Val Disagree Accuracy: {0:.6f}".format(avg_val_disagree_accuracy))
        avg_val_discuss_accuracy = discuss_val_true / neut_val_num
        print("Avg Val Discuss Accuracy: {0:.6f}".format(avg_val_discuss_accuracy))
        avg_val_unrelated_accuracy = unrelated_val_true / notrel_val_num
        print("Avg Val Unrelated Accuracy: {0:.6f}".format(avg_val_unrelated_accuracy))

        # 25% weight on the unrelated class, 75% on the mean of the other three.
        relative_score = 0.25*avg_val_unrelated_accuracy + 0.75*(avg_val_agree_accuracy + avg_val_disagree_accuracy + avg_val_discuss_accuracy)/3

        print("*****************")
        print("Relative score: {0:.6f}".format(relative_score))
        print("*****************")

        # Mean validation loss over the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t1)

        print("Avg Validation Loss: {0:.6f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Ideology is not evaluated in this variant; a constant 0 is recorded.
        avg_val_accuracy_ideology = 0

        # Record all statistics from this epoch.
        training_stats.append(
            {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Stance Accur.': avg_val_accuracy_stance,
            'Valid. Ideology Accur.': avg_val_accuracy_ideology,
            'Training Time': training_time,
            'Validation Time': validation_time
            }
        )

        model_save_state = {
            'epoch': epoch_i + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
            }

        es(avg_val_loss, avg_val_accuracy_stance, avg_val_accuracy_ideology, model_save_state, model_save_path, model)
        last_epoch = epoch_i + 1
        if es.early_stop:
            break  # early stop criterion is met, we can stop now

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    writer.close()

    min_val_loss = es.val_loss_min
    max_val_acc = es.val_acc_max_stance

    return training_stats, last_epoch, min_val_loss, max_val_acc

In [None]:
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler
#from tensorboardX import SummaryWriter
from sklearn.metrics import confusion_matrix
#import EarlyStopping
def train_stance_ideology_paper(train_nums, val_nums, train_nums_ideology, val_nums_ideology, model_save_path, model, datasetTrain, datasetVal, epochs, batch_size, optimizer, scheduler, patience, verbose, delta, seedVal, continue_train = False):
    """Train the attention-masked stance/relatedness model with early stopping.

    Every epoch performs one pass over the training loader with gradient
    accumulation, one pass over the validation loader, prints stance and
    ideology accuracies and hands the result to ``EarlyStopping``.

    Args:
        train_nums, train_nums_ideology, val_nums_ideology, seedVal: Kept for
            signature compatibility; this function does not read them.
        val_nums: Indexable; entries 0-3 are used as the validation counts of
            the pro / against / neutral / not-related classes, i.e. as the
            denominators of the reported stance accuracies.
        model_save_path: Forwarded to ``EarlyStopping``; also the checkpoint
            loaded when ``continue_train`` is true.
        model: Called with ``input_ids``, ``attention_mask``, ``mmd_pl``,
            ``mmd_pl_`` and ``epoch_num``; its result is unpacked as
            ``(mmd_loss, P_relatedness, P_stance, P_existedstance)``.
        datasetTrain, datasetVal: Datasets whose batches provide, by position,
            input ids, attention mask, relatedness targets, stance targets and
            the two tensors passed as ``mmd_pl`` / ``mmd_pl_``. Positions 6
            and 7 are not read here.
        epochs: Maximum number of epochs to run from the start epoch.
        batch_size: Effective batch size. Micro-batches hold at most 2
            samples; larger values are reached through gradient accumulation.
        optimizer, scheduler: Both are stepped once per accumulated batch.
        patience, verbose, delta: Forwarded to ``EarlyStopping``.
        continue_train: When true, restore model/optimizer state and the start
            epoch from ``model_save_path`` before training.

    Returns:
        ``(training_stats, last_epoch, min_val_loss, max_val_acc)``:
        the list of per-epoch stat dicts, ``epoch_i + 1`` of the last epoch
        that ran (``epoch_start`` if none ran), and ``es.val_loss_min`` /
        ``es.val_acc_max_stance`` read from the early-stopping helper.
    """
    pro_val_num = val_nums[0]
    agst_val_num = val_nums[1]
    neut_val_num = val_nums[2]
    notrel_val_num = val_nums[3]
    stance_all_num = pro_val_num + agst_val_num + neut_val_num + notrel_val_num

    # NOTE(review): the ideology class counts are fixed placeholders (the
    # lines reading val_nums_ideology were commented out), so the ideology
    # accuracies printed below are not normalised by real counts.
    con_val_num = 0.1
    lib_val_num = 0.1
    na_val_num = 0.1
    ideology_all_num = con_val_num + lib_val_num + na_val_num

    # The same unweighted BCE-with-logits criterion scores both heads.
    loss_fct_relatedness = torch.nn.BCEWithLogitsLoss()

    # Weights of the stance and MMD terms in the combined loss.
    alpha = 1.3
    beta =  1e-3

    # Largest micro-batch processed in one forward/backward pass.
    batch_size_max_once = 2
    if batch_size < batch_size_max_once:
        batch_size_max_once = batch_size

    # Whole number of micro-batches per optimizer step. A float ratio (any
    # odd batch_size here) would make the `%` test below fire at the wrong
    # cadence.
    accumulation_steps = max(1, int(batch_size / batch_size_max_once))

    es = EarlyStopping(patience,verbose, delta)
    # Nothing is logged through this writer in this function; a single
    # instance is kept (and closed before returning) instead of two leaked ones.
    writer = SummaryWriter()

    # Per-epoch losses, accuracies and timings.
    training_stats = []

    # Wall-clock start of the whole run.
    total_t0 = time.time()
    train_dataloader, validation_dataloader = return_batches_datasets(datasetTrain, datasetVal, batch_size_max_once)

    epoch_start = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        # Multi-GPU: DataParallel splits each batch along dim 0.
        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            model = torch.nn.DataParallel(model)

    print(device)

    if continue_train:
        # map_location lets a checkpoint written on a GPU be resumed on
        # whichever device is available now.
        checkpoint = torch.load(model_save_path, map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        epoch_start = checkpoint['epoch']

    torch.cuda.empty_cache()
    model.to(device)
    optimizer_to(optimizer,device)

    # Defined up front so the return statement is valid even when the loop
    # body never runs (e.g. epochs == 0).
    last_epoch = epoch_start
    batch_epoch_count = 1
    for epoch_i in range(epoch_start, epoch_start + epochs):

        print("---------Epoch----------" + str(epoch_i))

        # ========================================
        #               Training
        # ========================================

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the running loss and any stale gradients for this epoch.
        total_train_loss = 0
        # `train()` only switches the mode (dropout/batchnorm behaviour).
        model.train()
        model.zero_grad()
        optimizer.zero_grad()

        # Every 500th epoch of this call doubles the effective batch size.
        if batch_epoch_count % 500 == 0:
            batch_size = batch_size*2
            accumulation_steps = max(1, int(batch_size/batch_size_max_once))
        batch_epoch_count = batch_epoch_count + 1

        print("Batch Size: " + str(batch_size))
        print(float(accumulation_steps))

        num_train_batches = len(train_dataloader)
        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_relatedness = batch[2].to(device)
            b_labels = batch[3].to(device)
            b_mmd_symbol = batch[4].to(device)
            b_mmd_symbol_ = batch[5].to(device)

            # Forward pass on this micro-batch; the current epoch index is
            # passed through to the model.
            mmd_loss, P_relatedness, P_stance, P_existedstance = model(input_ids = b_input_ids, attention_mask = b_input_mask, mmd_pl = b_mmd_symbol, mmd_pl_ = b_mmd_symbol_, epoch_num = epoch_i)

            relatedness_loss = loss_fct_relatedness(P_relatedness, b_relatedness.float())
            stance_loss = loss_fct_relatedness(P_stance, b_labels.float())

            loss = alpha * stance_loss + beta * mmd_loss + relatedness_loss
            # Scale so the accumulated gradient matches one large batch.
            loss = loss / accumulation_steps
            total_train_loss += loss.item()

            loss.backward()

            # Step after a full accumulation window, and on the last
            # micro-batch so trailing gradients are not discarded.
            if (step + 1) % accumulation_steps == 0 or (step + 1) == num_train_batches:
                # Clip the gradient norm to 1.0 against exploding gradients.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()
                # Update the learning rate.
                scheduler.step()

                # Clear gradients before the next accumulation window.
                model.zero_grad()
                optimizer.zero_grad()

        print("Learning rate: ", scheduler.get_last_lr())
        # Undo the per-step scaling to report the mean micro-batch loss.
        avg_train_loss = total_train_loss / len(train_dataloader) * accumulation_steps

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("  Average training loss: {0:.6f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================

        t1 = time.time()

        # Evaluation mode: dropout layers behave differently from training.
        model.eval()

        total_true_eval_stance = 0
        total_true_eval_ideology = 0
        total_eval_loss = 0

        agree_val_true = 0
        disagree_val_true = 0
        discuss_val_true = 0
        unrelated_val_true = 0

        con_val_true = 0
        lib_val_true = 0
        na_val_true = 0

        # Never updated below; still printed so the log layout is unchanged.
        total_true = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_relatedness = batch[2].to(device)
            b_labels = batch[3].to(device)
            b_mmd_symbol = batch[4].to(device)
            b_mmd_symbol_ = batch[5].to(device)

            # No autograd graph is needed for evaluation.
            with torch.no_grad():
                # NOTE(review): validation always passes epoch_num = 12
                # rather than the current epoch -- presumably to select a
                # fixed model behaviour; confirm against the model code.
                mmd_loss, P_relatedness, P_stance, P_existedstance = model(input_ids = b_input_ids, attention_mask = b_input_mask, mmd_pl = b_mmd_symbol, mmd_pl_ = b_mmd_symbol_, epoch_num = 12)

                relatedness_loss = loss_fct_relatedness(P_relatedness, b_relatedness.float())
                stance_loss = loss_fct_relatedness(P_stance, b_labels.float())

                loss_val = alpha * stance_loss + beta * mmd_loss + relatedness_loss
                total_eval_loss += loss_val.item()

                # Move logits and labels to CPU for the metric helper
                # (P_existedstance is passed on without being moved).
                P_relatedness = P_relatedness.to('cpu')
                b_relatedness = b_relatedness.to('cpu')
                P_stance = P_stance.to('cpu')
                b_labels = b_labels.to('cpu')

                # acc_list[0..4] feed the stance counters and [5..8] the
                # ideology counters -- presumably numbers of correct
                # predictions; verify against the helper.
                acc_list = predict_classwise_stance_ideology(P_relatedness, P_stance, P_existedstance, b_labels)
                total_true_eval_stance += acc_list[0]
                agree_val_true += acc_list[1]
                disagree_val_true += acc_list[2]
                discuss_val_true += acc_list[3]
                unrelated_val_true += acc_list[4]

                total_true_eval_ideology += acc_list[5]
                con_val_true += acc_list[6]
                lib_val_true += acc_list[7]
                na_val_true += acc_list[8]

        # Report the accuracies for this validation run.
        avg_val_accuracy_stance = total_true_eval_stance / stance_all_num
        avg_val_accuracy_ideology = total_true_eval_ideology / ideology_all_num
        print("Avg Val Accuracy Stance: {0:.6f}".format(avg_val_accuracy_stance))
        print("Avg Val Accuracy Ideology: {0:.6f}".format(avg_val_accuracy_ideology))
        print("Total True")
        print(total_true)
        print("*************")
        avg_val_agree_accuracy = agree_val_true / pro_val_num
        print("Avg Val Agree Accuracy: {0:.6f}".format(avg_val_agree_accuracy))
        avg_val_disagree_accuracy = disagree_val_true / agst_val_num
        print("Avg Val Disagree Accuracy: {0:.6f}".format(avg_val_disagree_accuracy))
        avg_val_discuss_accuracy = discuss_val_true / neut_val_num
        print("Avg Val Discuss Accuracy: {0:.6f}".format(avg_val_discuss_accuracy))
        avg_val_unrelated_accuracy = unrelated_val_true / notrel_val_num
        print("Avg Val Unrelated Accuracy: {0:.6f}".format(avg_val_unrelated_accuracy))

        # 25% weight on the unrelated class, 75% on the mean of the other three.
        relative_score = 0.25*avg_val_unrelated_accuracy + 0.75*(avg_val_agree_accuracy + avg_val_disagree_accuracy + avg_val_discuss_accuracy)/3

        print("*****************")
        print("Relative score: {0:.6f}".format(relative_score))
        print("*****************")
        print("-------------")
        avg_val_con_accuracy = con_val_true / con_val_num
        print("Avg Val Con Accuracy: {0:.6f}".format(avg_val_con_accuracy))
        avg_lib_accuracy = lib_val_true / lib_val_num
        print("Avg Val Lib Accuracy: {0:.6f}".format(avg_lib_accuracy))
        avg_na_discuss_accuracy = na_val_true / na_val_num
        print("Avg Val NA Accuracy: {0:.6f}".format(avg_na_discuss_accuracy))

        # Mean validation loss over the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        print("Total Validation loss", total_eval_loss)
        print("Len-validation loader", len(validation_dataloader))

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t1)

        print("Avg Validation Loss: {0:.6f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Stance Accur.': avg_val_accuracy_stance,
            'Valid. Ideology Accur.': avg_val_accuracy_ideology,
            'Training Time': training_time,
            'Validation Time': validation_time
            }
        )

        model_save_state = {
            'epoch': epoch_i + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
            }

        es(avg_val_loss, avg_val_accuracy_stance, avg_val_accuracy_ideology, model_save_state, model_save_path, model)
        last_epoch = epoch_i + 1
        if es.early_stop:
            break  # early stop criterion is met, we can stop now

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    writer.close()

    min_val_loss = es.val_loss_min
    max_val_acc = es.val_acc_max_stance

    return training_stats, last_epoch, min_val_loss, max_val_acc

In [None]:
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler
#from tensorboardX import SummaryWriter
from sklearn.metrics import confusion_matrix
#import EarlyStopping
def _train_stance_epoch(model, train_dataloader, loss_fct, optimizer, scheduler, device, accumulation_steps):
    """Run one training pass with gradient accumulation; return the summed, scaled batch losses."""
    # `train()` only switches the mode (dropout/batchnorm behave differently);
    # it does not perform any training by itself.
    model.train()
    # Start every epoch from clean gradients. Gradients left over from an
    # incomplete accumulation window of the previous epoch are discarded here.
    model.zero_grad()
    optimizer.zero_grad()

    total_train_loss = 0
    for step, batch in enumerate(train_dataloader):
        # Batch layout: [0] input ids, [1] attention masks, [2] labels.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        P_stance = model(input_ids = b_input_ids, attention_mask = b_input_mask)

        # Scale each micro-batch loss so the accumulated gradient matches the
        # gradient of one full-size batch.
        loss = loss_fct(P_stance, b_labels.float()) / accumulation_steps
        total_train_loss += loss.item()
        loss.backward()

        # Only update the weights once a full accumulation window has been seen.
        if (step+1) % accumulation_steps == 0:
            # Clip the gradient norm to 1.0 to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            optimizer.zero_grad()

    return total_train_loss


def _evaluate_stance_epoch(model, validation_dataloader, loss_fct, device):
    """Run one validation pass; return (summed loss, list of the nine accumulated counters)."""
    model.eval()

    total_eval_loss = 0
    # Slots 0-8 mirror the first nine entries returned by
    # predict_classwise_stance_ideology_bert for every batch.
    counters = [0] * 9

    for batch in validation_dataloader:
        # Batch layout: [0] input ids, [1] attention masks, [2] labels.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No graph is needed for evaluation.
        with torch.no_grad():
            P_stance = model(input_ids = b_input_ids, attention_mask = b_input_mask)
            loss_val = loss_fct(P_stance, b_labels.float())
            total_eval_loss += loss_val.item()

            # The class-wise scoring helper is fed CPU tensors.
            P_stance = P_stance.to('cpu')
            b_labels = b_labels.to('cpu')

            acc_list = predict_classwise_stance_ideology_bert(P_stance, b_labels)
            for slot in range(9):
                counters[slot] += acc_list[slot]

    return total_eval_loss, counters


def train_stance_ideology(train_nums, val_nums, train_nums_ideology, val_nums_ideology, model_save_path, model, datasetTrain, datasetVal, epochs, batch_size, optimizer, scheduler, patience, verbose, delta, seedVal, continue_train = False):
    """Train ``model`` on stance labels with gradient accumulation and early stopping.

    Every epoch runs one training pass and one validation pass, prints the
    resulting metrics, appends them to the statistics list and hands the
    validation results to an ``EarlyStopping`` instance together with a
    checkpoint dictionary (epoch, model state, optimizer state).

    Args:
        train_nums, val_nums, train_nums_ideology, val_nums_ideology: Currently
            unused. Every per-class count is fixed to the placeholder 0.1
            below, so the printed "accuracies" are counts divided by that
            placeholder, not fractions of the real class sizes.
        model_save_path: Forwarded to the ``EarlyStopping`` call; also read
            with ``torch.load`` when ``continue_train`` is true.
        model: Network called as ``model(input_ids=..., attention_mask=...)``.
        datasetTrain, datasetVal: Passed to ``return_batches_datasets``.
        epochs: Maximum number of epochs to run in this call.
        batch_size: Effective batch size. Micro-batches hold at most 16
            samples; gradients are accumulated to make up the difference.
        optimizer, scheduler: Stepped once per accumulation window.
        patience, verbose, delta: Forwarded to ``EarlyStopping``.
        seedVal: Currently unused.
        continue_train: When true, restore model/optimizer state and the epoch
            counter from ``model_save_path`` before training.

    Returns:
        ``(training_stats, last_epoch, min_val_loss, max_val_acc)`` where
        ``training_stats`` is a list with one dict per epoch, ``last_epoch`` is
        the 1-based number of the last epoch that ran (``epoch_start`` when no
        epoch ran), and the last two values are read from
        ``es.val_loss_min`` and ``es.val_acc_max_stance``.
    """
    # NOTE(review): the real class counts (val_nums / val_nums_ideology) were
    # disabled in favour of these placeholders - confirm this is intended.
    pro_val_num = agst_val_num = neut_val_num = notrel_val_num = 0.1
    stance_all_num = pro_val_num + agst_val_num + neut_val_num + notrel_val_num

    con_val_num = lib_val_num = na_val_num = 0.1
    ideology_all_num = con_val_num + lib_val_num + na_val_num

    loss_fct_relatedness = torch.nn.BCEWithLogitsLoss()

    # Largest micro-batch that is pushed through the model at once.
    batch_size_max_once = 16
    if batch_size < batch_size_max_once:
        batch_size_max_once = batch_size

    # Kept as an int (like the re-computation inside the epoch loop) so the
    # `(step+1) % accumulation_steps` window test is exact.
    accumulation_steps = int(batch_size/batch_size_max_once)

    es = EarlyStopping(patience,verbose, delta)

    # One dict per epoch: losses, validation accuracies and timings.
    training_stats = []

    # Measure the total training time for the whole run.
    total_t0 = time.time()
    train_dataloader, validation_dataloader = return_batches_datasets(datasetTrain, datasetVal, batch_size_max_once)

    epoch_start = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        # Multi-GPU: DataParallel splits each batch along dim 0.
        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            model = torch.nn.DataParallel(model)

    print(device)

    if continue_train:
        # map_location lets a checkpoint written on a GPU be resumed on any device.
        checkpoint = torch.load(model_save_path, map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        epoch_start = checkpoint['epoch']

    torch.cuda.empty_cache()
    model.to(device)
    optimizer_to(optimizer,device)

    # Defined up front so the return value is valid even when `epochs` is 0.
    last_epoch = epoch_start

    # This is never updated; it is printed only to keep the log layout unchanged.
    total_true = 0

    # A single writer for the run (nothing is logged to it yet); it is closed
    # in the `finally` below so the event file is always released.
    writer = SummaryWriter()
    try:
        batch_epoch_count = 1
        for epoch_i in range(epoch_start, epoch_start + epochs):

            print("---------Epoch----------" + str(epoch_i))

            # ========================================
            #               Training
            # ========================================
            t0 = time.time()

            # Every 500th epoch of this call the effective batch size doubles.
            # The dataloader is not rebuilt, so only the accumulation window grows.
            if batch_epoch_count % 500 == 0:
                batch_size = batch_size*2
                accumulation_steps = int(batch_size/batch_size_max_once)
            batch_epoch_count = batch_epoch_count + 1

            print("Batch Size: " + str(batch_size))
            print(float(accumulation_steps))

            total_train_loss = _train_stance_epoch(model, train_dataloader, loss_fct_relatedness, optimizer, scheduler, device, accumulation_steps)

            print("Learning rate: ", scheduler.get_last_lr())
            # Undo the per-batch scaling so the average is in un-scaled loss units.
            avg_train_loss = total_train_loss / len(train_dataloader) * accumulation_steps

            # Measure how long this epoch took.
            training_time = format_time(time.time() - t0)

            print("  Average training loss: {0:.6f}".format(avg_train_loss))
            print("  Training epoch took: {:}".format(training_time))

            # ========================================
            #               Validation
            # ========================================
            t1 = time.time()

            total_eval_loss, counters = _evaluate_stance_epoch(model, validation_dataloader, loss_fct_relatedness, device)
            (total_true_eval_stance,
             agree_val_true, disagree_val_true, discuss_val_true, unrelated_val_true,
             total_true_eval_ideology,
             con_val_true, lib_val_true, na_val_true) = counters

            # Report the final accuracy for this validation run.
            avg_val_accuracy_stance = total_true_eval_stance / stance_all_num
            avg_val_accuracy_ideology = total_true_eval_ideology / ideology_all_num
            print("Avg Val Accuracy Stance: {0:.6f}".format(avg_val_accuracy_stance))
            print("Avg Val Accuracy Ideology: {0:.6f}".format(avg_val_accuracy_ideology))
            print("Total True")
            print(total_true)
            print("*************")
            avg_val_agree_accuracy = agree_val_true / pro_val_num
            print("Avg Val Agree Accuracy: {0:.6f}".format(avg_val_agree_accuracy))
            avg_val_disagree_accuracy = disagree_val_true / agst_val_num
            print("Avg Val Disagree Accuracy: {0:.6f}".format(avg_val_disagree_accuracy))
            avg_val_discuss_accuracy = discuss_val_true / neut_val_num
            print("Avg Val Discuss Accuracy: {0:.6f}".format(avg_val_discuss_accuracy))
            avg_val_unrelated_accuracy = unrelated_val_true / notrel_val_num
            print("Avg Val Unrelated Accuracy: {0:.6f}".format(avg_val_unrelated_accuracy))

            # Weighted score: 25% unrelated accuracy, 75% mean of the other three.
            relative_score = 0.25*avg_val_unrelated_accuracy + 0.75*(avg_val_agree_accuracy + avg_val_disagree_accuracy + avg_val_discuss_accuracy)/3

            print("*****************")
            print("Relative score: {0:.6f}".format(relative_score))
            print("*****************")
            print("-------------")
            avg_val_con_accuracy = con_val_true / con_val_num
            print("Avg Val Con Accuracy: {0:.6f}".format(avg_val_con_accuracy))
            avg_lib_accuracy = lib_val_true / lib_val_num
            print("Avg Val Lib Accuracy: {0:.6f}".format(avg_lib_accuracy))
            avg_na_discuss_accuracy = na_val_true / na_val_num
            print("Avg Val NA Accuracy: {0:.6f}".format(avg_na_discuss_accuracy))

            # Calculate the average loss over all of the batches.
            avg_val_loss = total_eval_loss / len(validation_dataloader)

            print("Total Validation loss", total_eval_loss)
            print("Len-validation loader", len(validation_dataloader))

            # Measure how long the validation run took.
            validation_time = format_time(time.time() - t1)

            print("Avg Validation Loss: {0:.6f}".format(avg_val_loss))
            print("  Validation took: {:}".format(validation_time))

            # Record all statistics from this epoch.
            training_stats.append(
                {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Stance Accur.': avg_val_accuracy_stance,
                'Valid. Ideology Accur.': avg_val_accuracy_ideology,
                'Training Time': training_time,
                'Validation Time': validation_time
                }
            )

            model_save_state = {
                'epoch': epoch_i + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
                }

            es(avg_val_loss, avg_val_accuracy_stance, avg_val_accuracy_ideology, model_save_state, model_save_path, model)
            last_epoch = epoch_i + 1
            if es.early_stop:
                break  # early stop criterion is met, we can stop now
    finally:
        writer.close()

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    # The best values are the ones tracked by the early-stopping helper.
    min_val_loss = es.val_loss_min
    max_val_acc = es.val_acc_max_stance

    return training_stats, last_epoch, min_val_loss, max_val_acc

In [None]:
def print_summary(training_stats):
    """Print the per-epoch training statistics as a table and return it.

    Args:
        training_stats: List of per-epoch dicts; each dict is expected to carry
            an ``'epoch'`` key, which becomes the row index.

    Returns:
        The ``pandas.DataFrame`` built from ``training_stats``, indexed by
        ``'epoch'`` when that column exists. An empty list yields an empty
        frame instead of raising ``KeyError``.

    Side effects:
        Changes the global pandas display options (precision, max rows and
        max columns).
    """
    # Display floats with four decimal places. The fully qualified key is
    # required: the bare 'precision' pattern is ambiguous in recent pandas
    # releases and raises OptionError.
    pd.set_option('display.precision', 4)

    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)

    # Create a DataFrame from our training statistics.
    df_stats = pd.DataFrame(data=training_stats)

    # Use the 'epoch' as the row index (absent when no epoch was recorded).
    if 'epoch' in df_stats.columns:
        df_stats = df_stats.set_index('epoch')

    # Display the table.
    print(df_stats)
    return df_stats

In [None]:
def plot_results(df_stats, last_epoch):
    """Plot the loss curves (figure 1) and the validation accuracy curves (figure 2).

    Args:
        df_stats: Table holding the columns 'Training Loss', 'Valid. Loss',
            'Valid. Stance Accur.' and 'Valid. Ideology Accur.'; its index is
            used as the x axis.
        last_epoch: Number of the last epoch; only used to build a tick list
            that is currently not applied to the axis.
    """
    # Seaborn styling (dark grid, larger fonts) and a wider default figure.
    sns.set(style='darkgrid', font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12,6)

    # ---- Figure 1: training vs. validation loss ----
    plt.figure(1)
    loss_curves = (
        ('Training Loss', 'b-o', "Training_Loss"),
        ('Valid. Loss', 'g-o', "Val_Loss"),
    )
    for column, line_format, legend_name in loss_curves:
        plt.plot(df_stats[column], line_format, label=legend_name)

    plt.title("Training & Val Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    # ---- Figure 2: validation accuracies ----
    plt.figure(2)

    # Explicit tick positions 1..last_epoch; placing them is disabled, only the
    # tick labels are rotated.
    x_ticks = list(range(1, last_epoch+1))
    plt.xticks(rotation=90)

    accuracy_curves = (
        ('Valid. Stance Accur.', 'b-o'),
        ('Valid. Ideology Accur.', 'g-o'),
    )
    for column, line_format in accuracy_curves:
        # The legend entry is the column name itself.
        plt.plot(df_stats[column], line_format, label=column)

    plt.title("Val Stance & Ideology Acc")
    plt.xlabel("Epoch")
    plt.ylabel("Acc")
    plt.legend()

    plt.show()

In [None]:
from torch.utils.data import DataLoader, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig

def run_test_stance_ideology_meta(test_nums, test_nums_ideology, model_current, model_savepath, instancesTest, stance_labels_Test, ideology_labels_Test, batch_size = 16):
    """Evaluate a saved ``StanceMetaLearnerSimple`` checkpoint on a test set.

    Args:
        test_nums: Indexable with four per-class sample counts (pro, against,
            neutral, not-related) used as divisors for the class accuracies.
        test_nums_ideology: Currently unused.
        model_current: Passed to ``load_tokenizer``.
        model_savepath: Checkpoint file containing a ``'state_dict'`` entry.
        instancesTest: Object exposing ``to_numpy()``; converted to a float32
            tensor that is both the model input and the constructor argument
            of ``StanceMetaLearnerSimple``.
        stance_labels_Test: Passed to ``preprocess_stance_ideology_new_meta``.
        ideology_labels_Test: Currently unused.
        batch_size: Number of samples per evaluation batch.

    Returns:
        ``(avg_test_loss, avg_test_accuracy_stance, avg_agree_test_acc,
        avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc)``.

    Raises:
        ZeroDivisionError: If the dataloader is empty (and, for plain Python
            numbers, if one of the ``test_nums`` counts is zero).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_fct_relatedness = torch.nn.BCEWithLogitsLoss()

    t_instancesTest = torch.as_tensor(instancesTest.to_numpy(), dtype=torch.float32)
    t_test_relatedness, t_test_stance, t_test_mmd_symbol, t_test_mmd_symbol_ = preprocess_stance_ideology_new_meta(stance_labels_Test)
    prediction_data = TensorDataset(t_instancesTest, t_test_relatedness, t_test_stance, t_test_mmd_symbol, t_test_mmd_symbol_)

    # Sequential sampling keeps the evaluation order deterministic.
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size, num_workers=8, drop_last=False)

    pro_val_num = test_nums[0]
    agst_val_num = test_nums[1]
    neut_val_num = test_nums[2]
    notrel_val_num = test_nums[3]
    total_num = pro_val_num + agst_val_num + neut_val_num + notrel_val_num

    # The result is not used below; the call is kept because any side effects
    # of load_tokenizer are not visible from this function.
    load_tokenizer(model_current)

    # map_location lets a checkpoint written on a GPU be evaluated on any device.
    # Only the model weights are restored: the optimizer state is irrelevant
    # for inference.
    checkpoint = torch.load(model_savepath, map_location=device)
    model = StanceMetaLearnerSimple(t_instancesTest)
    model.load_state_dict(checkpoint['state_dict'])

    torch.cuda.empty_cache()
    model.to(device)

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    total_test_loss = 0.0
    total_test_accuracy_stance = 0.0
    agree_test_accuracy = 0.0
    disagree_test_accuracy = 0.0
    discuss_test_accuracy = 0.0
    unrelated_test_accuracy = 0.0

    # Weights of the stance loss and the MMD term in the combined loss.
    alpha = 1.3
    beta = 1e-3

    for batch in prediction_dataloader:
        # Batch layout follows the TensorDataset above.
        b_input_ids = batch[0].to(device)
        b_relatedness = batch[1].to(device)
        b_labels = batch[2].to(device)
        b_mmd_symbol = batch[3].to(device)
        b_mmd_symbol_ = batch[4].to(device)

        # No gradients are needed for prediction.
        with torch.no_grad():
            mmd_loss, P_relatedness, P_stance = model(input_ids = b_input_ids, mmd_pl = b_mmd_symbol, mmd_pl_ = b_mmd_symbol_)
            relatedness_loss = loss_fct_relatedness(P_relatedness, b_relatedness.float())
            stance_loss = loss_fct_relatedness(P_stance, b_labels.float())

            # Accumulate the un-scaled test loss (no gradient accumulation
            # happens at test time).
            loss_test = alpha * stance_loss + beta * mmd_loss + relatedness_loss
            total_test_loss += loss_test.item()

            # The class-wise scoring helper is fed CPU tensors.
            P_stance = P_stance.to('cpu')
            b_labels = b_labels.to('cpu')

            acc_list = predict_classwise_stance_ideology_meta(P_stance, b_labels)
            total_test_accuracy_stance += acc_list[0]
            agree_test_accuracy += acc_list[1]
            disagree_test_accuracy += acc_list[2]
            discuss_test_accuracy += acc_list[3]
            unrelated_test_accuracy += acc_list[4]

    # Report the final loss and accuracies for this test run.
    avg_test_loss = total_test_loss / len(prediction_dataloader)
    avg_test_accuracy_stance = total_test_accuracy_stance / total_num

    avg_agree_test_acc = agree_test_accuracy / pro_val_num
    avg_disagree_test_acc = disagree_test_accuracy / agst_val_num
    avg_discuss_test_acc = discuss_test_accuracy / neut_val_num
    avg_unrelated_test_acc = unrelated_test_accuracy / notrel_val_num

    return avg_test_loss, avg_test_accuracy_stance, avg_agree_test_acc, avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc

In [None]:
from torch.utils.data import DataLoader, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig

def run_test_stance_ideology2(test_nums, test_nums_ideology, model_current, model_savepath, all_input_ids_Test, all_input_masks_Test, stance_labels_Test, ideology_labels_Test, batch_size = 16):
    """Evaluate a saved ``StanceIdeologyDetectionClass`` checkpoint and collect its predictions.

    Args:
        test_nums: Indexable with four per-class sample counts (pro, against,
            neutral, not-related) used as divisors for the class accuracies.
        test_nums_ideology: Currently unused.
        model_current: Passed to ``load_tokenizer`` and to the
            ``StanceIdeologyDetectionClass`` constructor.
        model_savepath: Checkpoint file containing a ``'state_dict'`` entry.
        all_input_ids_Test, all_input_masks_Test: Input-id and attention-mask
            tensors for the test set.
        stance_labels_Test, ideology_labels_Test: Passed to
            ``preprocess_stance_ideology``.
        batch_size: Number of samples per evaluation batch.

    Returns:
        ``(avg_test_loss, avg_test_accuracy_stance, avg_agree_test_acc,
        avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc,
        arr)`` where ``arr`` is a NumPy array with one predicted label per
        test sample, in dataset order.

    Raises:
        ZeroDivisionError: If the dataloader is empty (and, for plain Python
            numbers, if one of the ``test_nums`` counts is zero).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_fct_relatedness = torch.nn.BCEWithLogitsLoss()

    t_test_relatedness, t_test_stance, t_test_mmd_symbol, t_test_mmd_symbol_, t_test_existedstance, t_test_ideology = preprocess_stance_ideology(stance_labels_Test, ideology_labels_Test)

    # Create the DataLoader. Sequential sampling keeps predictions in dataset order.
    prediction_data = TensorDataset(all_input_ids_Test, all_input_masks_Test, t_test_relatedness, t_test_stance, t_test_mmd_symbol, t_test_mmd_symbol_, t_test_existedstance, t_test_ideology)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size, num_workers=8, drop_last=False)

    pro_val_num = test_nums[0]
    agst_val_num = test_nums[1]
    neut_val_num = test_nums[2]
    notrel_val_num = test_nums[3]
    total_num = pro_val_num + agst_val_num + neut_val_num + notrel_val_num

    # The result is not used below; the call is kept because any side effects
    # of load_tokenizer are not visible from this function.
    load_tokenizer(model_current)

    model = StanceIdeologyDetectionClass(model_current)
    # map_location lets a checkpoint written on a GPU be evaluated on any device.
    # Only the model weights are restored: the optimizer state is irrelevant
    # for inference.
    checkpoint = torch.load(model_savepath, map_location=device)
    model.load_state_dict(checkpoint['state_dict'])

    torch.cuda.empty_cache()
    model.to(device)

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    total_test_loss = 0.0
    total_test_accuracy_stance = 0.0
    agree_test_accuracy = 0.0
    disagree_test_accuracy = 0.0
    discuss_test_accuracy = 0.0
    unrelated_test_accuracy = 0.0

    # Weights of the stance loss and the MMD term in the combined loss.
    alpha = 1.3
    beta = 1e-3

    my_all_predictions = []
    for batch in prediction_dataloader:
        # Batch layout follows the TensorDataset above; slots 6 and 7
        # (existed-stance and ideology labels) are not needed here.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_relatedness = batch[2].to(device)
        b_labels = batch[3].to(device)
        b_mmd_symbol = batch[4].to(device)
        b_mmd_symbol_ = batch[5].to(device)

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            mmd_loss, P_relatedness, P_stance, P_existedstance = model(input_ids = b_input_ids, attention_mask = b_input_mask, mmd_pl = b_mmd_symbol, mmd_pl_ = b_mmd_symbol_)

            relatedness_loss = loss_fct_relatedness(P_relatedness, b_relatedness.float())
            stance_loss = loss_fct_relatedness(P_stance, b_labels.float())

            loss_test = alpha * stance_loss + beta * mmd_loss + relatedness_loss
            total_test_loss += loss_test.item()

            # Move logits and labels to CPU
            P_relatedness = P_relatedness.to('cpu')
            b_relatedness = b_relatedness.to('cpu')
            P_stance = P_stance.to('cpu')
            b_labels = b_labels.to('cpu')

            acc_list = predict_classwise_stance_ideology(P_relatedness, P_stance, b_labels)
            total_test_accuracy_stance += acc_list[0]
            agree_test_accuracy += acc_list[1]
            disagree_test_accuracy += acc_list[2]
            discuss_test_accuracy += acc_list[3]
            unrelated_test_accuracy += acc_list[4]

            # NOTE(review): acc_list[9] is assumed to be the per-sample stance
            # output of the scoring helper - confirm against its definition.
            # A 2-D value is treated as per-class scores and reduced with
            # argmax; a 1-D value is taken as the predicted labels already.
            stance_output = torch.as_tensor(acc_list[9])
            if stance_output.dim() > 1:
                predict_labels = torch.argmax(stance_output, 1)
            else:
                predict_labels = stance_output
            my_all_predictions.extend(predict_labels.reshape(-1).tolist())

    arr = np.array(my_all_predictions)

    # Report the final loss and accuracies for this test run.
    avg_test_loss = total_test_loss / len(prediction_dataloader)
    avg_test_accuracy_stance = total_test_accuracy_stance / total_num

    avg_agree_test_acc = agree_test_accuracy / pro_val_num
    avg_disagree_test_acc = disagree_test_accuracy / agst_val_num
    avg_discuss_test_acc = discuss_test_accuracy / neut_val_num
    avg_unrelated_test_acc = unrelated_test_accuracy / notrel_val_num

    return avg_test_loss, avg_test_accuracy_stance, avg_agree_test_acc, avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc, arr

In [None]:
from torch.utils.data import DataLoader, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig

def run_test_stance_ideology(test_nums, test_nums_ideology, model_current, model_savepath, all_input_ids_Test, 
                             all_input_masks_Test, stance_labels_Test, ideology_labels_Test, batch_size = 16):
    """Evaluate a saved stance checkpoint on one data split.

    The checkpoint at ``model_savepath`` is loaded into a fresh
    ``StanceIdeologyDetectionClassMixout(model_current)`` and run, without
    gradients, over the given inputs in their original order.

    Args:
        test_nums: Indexable with four per-class example counts, read as
            ``[pro, against, neutral, not-related]``; they are the denominators
            of the returned accuracies.
        test_nums_ideology: Accepted for interface compatibility; not used.
        model_current: Model identifier forwarded to ``load_tokenizer`` and to
            the model constructor.
        model_savepath: Path of a ``torch.save``-d dict that has a
            ``'state_dict'`` entry.
        all_input_ids_Test: Tensor of token ids (first ``TensorDataset`` field).
        all_input_masks_Test: Tensor of attention masks.
        stance_labels_Test: Stance labels, passed to
            ``preprocess_stance_ideology``.
        ideology_labels_Test: Ideology labels, passed to
            ``preprocess_stance_ideology``.
        batch_size: Evaluation batch size.

    Returns:
        A 7-tuple ``(avg_loss, stance_acc, agree_acc, disagree_acc,
        discuss_acc, unrelated_acc, predictions)``. ``avg_loss`` is the sum of
        per-batch losses divided by the number of batches. Each accuracy is the
        summed per-batch value reported by ``predict_classwise_stance_ideology``
        divided by the matching count from ``test_nums`` (presumably those
        per-batch values are correct-prediction counts; verify against that
        helper). ``predictions`` is a list with one argmax class index (a 0-d
        tensor) per example.

    Side effects:
        Overwrites ``myfile.txt`` in the working directory with one line per
        example holding ``str()`` of that example's stance-probability row.

    Raises:
        ZeroDivisionError: Presumably, if a class count in ``test_nums`` is an
            integer zero or the dataloader yields no batches.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_fct_relatedness = torch.nn.BCEWithLogitsLoss()

    (t_test_relatedness, t_test_stance, t_test_mmd_symbol, t_test_mmd_symbol_,
     t_test_existedstance, t_test_ideology) = preprocess_stance_ideology(stance_labels_Test, ideology_labels_Test)

    # Sequential sampling keeps predictions aligned with the input order.
    prediction_data = TensorDataset(all_input_ids_Test, all_input_masks_Test, t_test_relatedness, t_test_stance,
                                    t_test_mmd_symbol, t_test_mmd_symbol_, t_test_existedstance, t_test_ideology)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size, num_workers=8, drop_last=False)

    pro_val_num = test_nums[0]
    agst_val_num = test_nums[1]
    neut_val_num = test_nums[2]
    notrel_val_num = test_nums[3]
    total_num = pro_val_num + agst_val_num + neut_val_num + notrel_val_num

    # NOTE(review): the tokenizer is not used below; the call is retained only
    # because load_tokenizer is defined elsewhere and may have side effects
    # (e.g. populating a cache) -- confirm and remove.
    load_tokenizer(model_current)

    model = StanceIdeologyDetectionClassMixout(model_current)
    # map_location='cpu' lets a checkpoint written on a GPU machine be restored
    # on a CPU-only host; the model is moved to `device` right after.
    checkpoint = torch.load(model_savepath, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    # No optimizer is rebuilt here: evaluation never steps it, and rebuilding it
    # required a module-level `learning_rate` plus extra device memory.

    torch.cuda.empty_cache()
    model.to(device)
    model.eval()

    # Running totals over all batches.
    total_test_loss = 0.0
    total_test_accuracy_stance = 0.0
    agree_test_accuracy = 0.0
    disagree_test_accuracy = 0.0
    discuss_test_accuracy = 0.0
    unrelated_test_accuracy = 0.0

    # Weights of the stance and MMD terms in the reported loss.
    alpha = 1.3
    beta = 1e-3

    my_all_batch_preds = []   # argmax class index per example
    my_all_predictions = []   # stance probability row per example

    for batch in prediction_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_relatedness = batch[2].to(device)
        b_labels = batch[3].to(device)
        b_mmd_symbol = batch[4].to(device)
        b_mmd_symbol_ = batch[5].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            mmd_loss, P_relatedness, P_stance, P_existedstance = model(input_ids = b_input_ids, attention_mask = b_input_mask, mmd_pl = b_mmd_symbol, mmd_pl_ = b_mmd_symbol_, epoch_num = 12)

            relatedness_loss = loss_fct_relatedness(P_relatedness, b_relatedness.float())
            stance_loss = loss_fct_relatedness(P_stance, b_labels.float())

            loss_test = alpha * stance_loss + beta * mmd_loss + relatedness_loss
            total_test_loss += loss_test.item()

            # Scoring happens on the CPU.
            P_relatedness = P_relatedness.to('cpu')
            P_stance = P_stance.to('cpu')
            b_labels = b_labels.to('cpu')

            # P_existedstance is handed over on its original device, as before.
            acc_list = predict_classwise_stance_ideology(P_relatedness, P_stance, P_existedstance, b_labels)
            total_test_accuracy_stance += acc_list[0]
            agree_test_accuracy += acc_list[1]
            disagree_test_accuracy += acc_list[2]
            discuss_test_accuracy += acc_list[3]
            unrelated_test_accuracy += acc_list[4]

            prob_stance = acc_list[9]
            predict_labels = torch.argmax(prob_stance, 1)
            my_all_batch_preds.extend(predict_labels)
            my_all_predictions.extend(prob_stance.numpy())

    # The context manager closes the file even if a write fails.
    with open('myfile.txt', 'w') as file1:
        for elt in my_all_predictions:
            file1.write(str(elt))
            file1.write("\n")

    # Report the final metrics for this evaluation run.
    avg_test_loss = total_test_loss / len(prediction_dataloader)
    avg_test_accuracy_stance = total_test_accuracy_stance / total_num

    avg_agree_test_acc = agree_test_accuracy / pro_val_num
    avg_disagree_test_acc = disagree_test_accuracy / agst_val_num
    avg_discuss_test_acc = discuss_test_accuracy / neut_val_num
    avg_unrelated_test_acc = unrelated_test_accuracy / notrel_val_num

    return avg_test_loss, avg_test_accuracy_stance, avg_agree_test_acc, avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc, my_all_batch_preds

In [None]:
from numpy import nan
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.datasets import make_classification
#from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import CalibratedClassifierCV
from random import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import make_column_transformer



def run_wholeprocess_stance_serp_ambigious(train_path, val_path, max_len, doc_stride, batch_size, num_warmup_steps, learning_rate, epochs, seedVal):
    """Train and evaluate the stance model on the "ambigious" SERP splits.

    Steps: load the three fixed TSV splits, build text/label lists, tokenize,
    train with ``train_stance_ideology``, then evaluate the saved checkpoint on
    the test split and print the loss and accuracies.

    Args:
        train_path: Accepted for interface compatibility; not used -- the
            split paths are hard-coded below.
        val_path: Accepted for interface compatibility; not used.
        max_len: Forwarded to ``preprocessing_for_bert``.
        doc_stride: Forwarded to ``preprocessing_for_bert``.
        batch_size: Forwarded to ``prepare_for_training_ambigious`` and
            ``train_stance_ideology``.
        num_warmup_steps: Forwarded to ``prepare_for_training_ambigious``.
        learning_rate: Forwarded to ``prepare_for_training_ambigious``.
        epochs: Forwarded to training preparation and the training loop.
        seedVal: Forwarded to ``train_stance_ideology``.

    Returns:
        None. Results are printed.

    Note:
        ``patience``, ``verbose`` and ``delta`` are not parameters; they are
        read from the enclosing scope and must be defined before this is called.
    """
    # Called for its GPU checks and console output; the returned device is not
    # needed in this function.
    run_utils()

    model_save_path = './models/BERT_SERP/bert_titleonly_finetuned'
    model_current = 'roberta-base'
    tokenizer = load_tokenizer(model_current)

    # -------------- load datasets --------------
    df = load_dataset_ambigious('./dataset/batches_cleaned/stance/train_serp_ambigious.tsv')
    dfVal = load_dataset_ambigious('./dataset/batches_cleaned/stance/val_serp_ambigious.tsv')
    dfTest = load_dataset_ambigious('./dataset/batches_cleaned/stance/test_serp_ambigious.tsv')

    # generate_datasets_ambigious returns four values; only the second (the
    # texts tokenized below) and the labels are consumed here.
    _, sentencesQueryTitleCont_Train, labelsTrain, _ = generate_datasets_ambigious (df)
    _, sentencesQueryTitleCont_Val, labelsVal, _ = generate_datasets_ambigious (dfVal)
    _, sentencesQueryTitleCont_Test, labelsTest, labelsTest_ideology = generate_datasets_ambigious (dfTest)

    train_nums = count_class_num_ambigious (df)
    val_nums = count_class_num_ambigious (dfVal)
    test_nums = count_class_num_ambigious (dfTest)

    print(train_nums)

    # Report the number of sentences.
    print('Number of training sentences: {:,}'.format(df.shape[0]))
    print('Number of val sentences: {:,}'.format(dfVal.shape[0]))
    print('Number of test sentences: {:,}'.format(dfTest.shape[0]))

    print("Starting neural net")

    # -------------- tokenize each split --------------
    all_input_ids_Train, all_input_masks_Train  = preprocessing_for_bert(tokenizer, sentencesQueryTitleCont_Train, max_len, doc_stride)  # train
    all_input_ids_Val, all_input_masks_Val = preprocessing_for_bert(tokenizer, sentencesQueryTitleCont_Val, max_len, doc_stride)  # validation
    all_input_ids_Test, all_input_masks_Test = preprocessing_for_bert(tokenizer, sentencesQueryTitleCont_Test, max_len, doc_stride)  # test

    # Ideology class counts are not computed in this pipeline; empty lists are
    # passed where the training/evaluation signatures expect them.
    train_nums_ideology = []
    val_nums_ideology = []
    test_nums_ideology = []

    # -------------- train --------------
    model, train_dataloader, validation_dataloader, optimizer, scheduler = prepare_for_training_ambigious(all_input_ids_Train, all_input_masks_Train, labelsTrain, all_input_ids_Val,
                                                                                                              all_input_masks_Val, labelsVal, model_current, batch_size, epochs, num_warmup_steps, learning_rate)
    training_stats, last_epoch, min_val_loss, max_val_acc = train_stance_ideology (train_nums, val_nums, train_nums_ideology, val_nums_ideology, model_save_path, model, train_dataloader, validation_dataloader, epochs, batch_size, optimizer,
                                                                     scheduler, patience, verbose, delta, seedVal, False)

    # -------------- evaluate on the test split --------------
    # NOTE(review): batch_size is not forwarded, so evaluation runs with
    # run_test_stance_ideology's default batch size -- confirm this is intended.
    test_loss, test_acc, avg_agree_test_acc, avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc, arr_test = run_test_stance_ideology(test_nums, test_nums_ideology, model_current, model_save_path, all_input_ids_Test, 
                                                                                                                                            all_input_masks_Test, labelsTest, labelsTest_ideology)

    print("****************")
    print('Test Loss: ' + str(test_loss))
    print('Test Stance Acc: ' + str(test_acc))

    print('Agree Class Acc: ' + str(avg_agree_test_acc))
    print('Disagree Class Acc: ' + str(avg_disagree_test_acc))
    print('Discuss Class Acc: ' + str(avg_discuss_test_acc))
    print('Unrelated Class Acc: ' + str(avg_unrelated_test_acc))

In [None]:
from numpy import nan
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import CalibratedClassifierCV
from random import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import make_column_transformer
#from transformers import LongformerModel



def run_wholeprocess_stance_serp_DT(train_path, val_path, max_len, doc_stride, batch_size, num_warmup_steps, learning_rate, epochs, seedVal):
    """Fit bag-of-words baselines (SVM, random forest, XGBoost) for stance.

    The function rebuilds the SERP dataset files, loads the train/val/test
    splits, fits the three classifiers on count-vector features of train+val,
    prints a classification report for each, then prints reports for a stacked
    SVM trained on the concatenated class probabilities and for hard/soft
    voting ensembles. The neural training/evaluation stage is currently
    disabled, so no neural predictions take part in the stacking.

    Args:
        train_path: Accepted for interface compatibility; not used -- dataset
            locations are hard-coded below.
        val_path: Accepted for interface compatibility; not used.
        max_len: Forwarded to ``preprocessing_for_bert``.
        doc_stride: Forwarded to ``preprocessing_for_bert``.
        batch_size: Not used while the neural stage is disabled.
        num_warmup_steps: Not used while the neural stage is disabled.
        learning_rate: Not used while the neural stage is disabled.
        epochs: Not used while the neural stage is disabled.
        seedVal: Forwarded to ``sample_dataset_stance``.

    Returns:
        None. Reports are printed.

    Side effects:
        Writes ``svm_preds.tsv``, ``rf_preds.tsv``, ``xgboost_preds.tsv``,
        ``test_labels.tsv`` and ``all_preds.tsv`` to the working directory.
    """
    # Called for its GPU checks and console output; the returned device is not
    # needed in this function.
    run_utils()

    model_current = 'roberta-large'
    tokenizer = load_tokenizer(model_current)

    # -------------- load datasets --------------
    # The frames returned by the next four calls are superseded by the TSV
    # loads just below. The calls are kept in their original order because the
    # helpers are defined elsewhere and presumably (re)write the dataset files
    # that are read afterwards -- TODO confirm.
    df_all = load_dataset_stance('./dataset/batches_cleaned/stance/MergedDataset_20.06.2021.tsv')
    create_more_notrel_docs(df_all)
    df_all_added = load_dataset_stance('./dataset/batches_cleaned/stance/Final_Dataset_AddedNotRelated.tsv')
    sample_dataset_stance(df_all_added, seedVal)

    df = load_dataset_stance('./dataset/batches_cleaned/stance/train_serp.tsv')
    dfVal = load_dataset_stance('./dataset/batches_cleaned/stance/val_serp.tsv')
    dfTest = load_dataset_stance('./dataset/batches_cleaned/stance/test_serp.tsv')

    # generate_datasets returns four values; the second holds the texts that
    # are tokenized/vectorized below, the third the stance labels.
    _, sentencesQueryTitleCont_Train, _, _ = generate_datasets (df)
    _, sentencesQueryTitleCont_Val, _, _ = generate_datasets (dfVal)
    _, sentencesQueryTitleCont_Test, labelsTest, _ = generate_datasets (dfTest)

    train_nums = count_class_num (df)
    count_class_num (dfVal)
    count_class_num (dfTest)

    print(train_nums)

    # Report the number of sentences.
    print('Number of training sentences: {:,}'.format(df.shape[0]))
    print('Number of val sentences: {:,}'.format(dfVal.shape[0]))
    print('Number of test sentences: {:,}'.format(dfTest.shape[0]))

    print("Starting neural net")

    # NOTE(review): the tokenized tensors feed only the disabled neural stage;
    # the calls are retained because preprocessing_for_bert is defined
    # elsewhere -- drop them if it has no side effects.
    preprocessing_for_bert(tokenizer, sentencesQueryTitleCont_Train, max_len, doc_stride)  # train
    preprocessing_for_bert(tokenizer, sentencesQueryTitleCont_Val, max_len, doc_stride)  # validation
    preprocessing_for_bert(tokenizer, sentencesQueryTitleCont_Test, max_len, doc_stride)  # test

    print("****************")
    print("*********")

    # -------------- classical baselines on train + val --------------
    df = pd.concat([df, dfVal], ignore_index=True)
    _, sentencesQueryTitleCont_Train, labelsTrain, _ = generate_datasets (df)

    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(sentencesQueryTitleCont_Train)
    X_test = vectorizer.transform(sentencesQueryTitleCont_Test)

    # with_mean=False keeps the count matrix sparse while scaling.
    sc = StandardScaler(with_mean=False)
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # --- SVM ---
    clf1 = SVC(kernel= "linear", class_weight= 'balanced', max_iter = 10000, probability=True, C=1)
    clf1 = clf1.fit(X_train, labelsTrain)
    y_pred1 = clf1.predict(X_test)
    print(classification_report(labelsTest, y_pred1))

    train_predictions_prob_svm = clf1.predict_proba(X_train)
    test_predictions_prob_svm = clf1.predict_proba(X_test)

    print("*****************")

    svm_test_df = pd.DataFrame(test_predictions_prob_svm)
    svm_test_df.to_csv('svm_preds' + ".tsv", sep='\t', index=False)

    # --- random forest ---
    clf2 = RandomForestClassifier(max_depth=2, class_weight= 'balanced', n_estimators = 1000)
    clf2.fit(X_train, labelsTrain)
    y_pred2 = clf2.predict(X_test)

    print("*****************")

    print(classification_report(labelsTest, y_pred2))
    train_predictions_prob_rf = clf2.predict_proba(X_train)
    # Fixed: the test probabilities must come from the random forest (clf2),
    # not from the SVM (clf1).
    test_predictions_prob_rf = clf2.predict_proba(X_test)

    rf_test_df = pd.DataFrame(test_predictions_prob_rf)
    rf_test_df.to_csv('rf_preds' + ".tsv", sep='\t', index=False)

    test_labels_df = pd.DataFrame(labelsTest)
    test_labels_df.to_csv('test_labels' + ".tsv", sep='\t', index=False)

    # --- XGBoost ---
    clf3 = XGBClassifier(use_label_encoder=True, class_weight= 'balanced', eval_metric='auc')
    clf3.fit(X_train, labelsTrain)
    y_pred3 = clf3.predict(X_test)

    print(classification_report(labelsTest, y_pred3))
    train_predictions_prob_xgboost = clf3.predict_proba(X_train)
    test_predictions_prob_xgboost = clf3.predict_proba(X_test)

    xgb_test_df = pd.DataFrame(test_predictions_prob_xgboost)
    xgb_test_df.to_csv('xgboost_preds' + ".tsv", sep='\t', index=False)

    # -------------- stacking on the class probabilities --------------
    print(train_predictions_prob_svm[0])
    train_predictions_prob_svm = pd.DataFrame(train_predictions_prob_svm)
    train_predictions_prob_rf = pd.DataFrame(train_predictions_prob_rf)
    train_predictions_prob_xgboost = pd.DataFrame(train_predictions_prob_xgboost)

    test_predictions_prob_svm = pd.DataFrame(test_predictions_prob_svm)
    test_predictions_prob_rf = pd.DataFrame(test_predictions_prob_rf)
    test_predictions_prob_xgboost = pd.DataFrame(test_predictions_prob_xgboost)

    # Fixed: neural predictions (`arr`, `arr_test`) are not defined in this
    # function because the neural evaluation is disabled, so the NaN checks and
    # row-count print that referenced them were removed.
    print(train_predictions_prob_svm.shape[0])
    print(len(sentencesQueryTitleCont_Train), X_train.shape[0], len(labelsTrain))
    all_pred_prob = pd.concat([train_predictions_prob_svm, train_predictions_prob_rf, train_predictions_prob_xgboost], axis = 1, ignore_index=True)
    all_pred_prob_test = pd.concat([test_predictions_prob_svm, test_predictions_prob_rf, test_predictions_prob_xgboost], axis = 1, ignore_index=True)

    # A separate estimator with the same hyper-parameters is the meta-learner,
    # so the fitted base SVM is not overwritten.
    meta_clf = SVC(kernel= "linear", class_weight= 'balanced', max_iter = 10000, probability=True, C=1)
    meta_clf.fit(all_pred_prob, labelsTrain)
    y_pred_new = meta_clf.predict(all_pred_prob_test)

    print(classification_report(labelsTest, y_pred_new))

    all_preds = pd.concat([svm_test_df, rf_test_df, test_labels_df, xgb_test_df], axis = 1, ignore_index=True)
    all_preds.to_csv('all_preds' + ".tsv", sep='\t', index=False)

    # -------------- voting ensembles on the raw features --------------
    eclf1 = VotingClassifier(estimators=[('svm', clf1), ('rf', clf2), ('xg', clf3)], voting='hard')
    eclf1 = eclf1.fit(X_train, labelsTrain)
    y_pred4 = eclf1.predict(X_test)
    print(classification_report(labelsTest, y_pred4))

    eclf2 = VotingClassifier(estimators=[('svm', clf1), ('rf', clf2), ('xg', clf3)], voting='soft')
    eclf2 = eclf2.fit(X_train, labelsTrain)
    y_pred5 = eclf2.predict(X_test)
    print(classification_report(labelsTest, y_pred5))

In [None]:
from numpy import nan
from transformers import AutoTokenizer, AutoModelForQuestionAnswering


def run_wholeprocess_stance_serp(train_path, val_path, max_len, doc_stride, batch_size, num_warmup_steps, learning_rate, epochs, seedVal):
    """Fine-tune a single transformer checkpoint on the SERP stance splits and report test metrics.

    Visible steps, in order:
      1. ``run_utils()`` is called (GPU check / device selection).
      2. A tokenizer is loaded for the hard-coded checkpoint ``albert-base-v2``.
      3. The merged dataset is loaded, passed to ``create_more_notrel_docs`` and
         ``sample_dataset_stance``; the train/val/test frames are then (re)loaded
         from ``train_serp.tsv`` / ``val_serp.tsv`` / ``test_serp.tsv``.
      4. ``generate_datasets`` turns each frame into sentence lists and labels.
      5. The ``sentencesQueryTitleCont_*`` lists are tokenized with
         ``preprocessing_for_bert``.
      6. The model is prepared and trained, then evaluated on the test split
         with ``run_test_stance_ideology``; the metrics are printed.

    Args:
        train_path: Overwritten inside the function before any use; the value
            passed by the caller has no effect.
        val_path: Same as ``train_path`` - overwritten, caller value unused.
        max_len: Forwarded to ``preprocessing_for_bert``.
        doc_stride: Forwarded to ``preprocessing_for_bert``.
        batch_size: Forwarded to the prepare/train helpers.
        num_warmup_steps: Forwarded to ``prepare_for_training_stance_ideology_paper``.
        learning_rate: Forwarded to ``prepare_for_training_stance_ideology_paper``.
        epochs: Forwarded to the prepare/train helpers.
        seedVal: Forwarded to ``sample_dataset_stance`` and
            ``train_stance_ideology_paper``.

    Returns:
        None. Results are only printed.

    Note:
        ``patience``, ``verbose`` and ``delta`` are not parameters; they are read
        from module-level globals and must be defined before this is called.
    """
    # The returned device is not referenced again in this function.
    device = run_utils()
    #model_save_path = './model_save/fnc/model_emergentbert_epoch90_withoutsep_serp.t7'  
    # Local save path; shadows any module-level `model_save_path`.
    model_save_path = './models/BERT_SERP/bert_titleonly_finetuned'  
    #model_fnc = './model_save/fnc/model_fnc.t7'   
    #model_current = "ponmari/Question-Answering"
        
    # Checkpoint selection: exactly one of the alternatives below is active.
    #model_current = 'distilbert-base-uncased'
    #model_current = './models/BERT_SERP/bert_titleonly_finetuned'
    #model_current = 'roberta-base'
    #model_current = 'bert-base-uncased'
    #model_current = 'xlnet-base-cased'
    #model_current = 'bert-base-multilingual-cased'
    model_current = 'albert-base-v2'
    #model_current = './models/mnli_model/'
    #model_current = './model_save/fnc/model_emergentbert_epoch90.t7'
    tokenizer = load_tokenizer(model_current)

#--------------LOAD DATASETS--------------#

    # These assignments clobber the `train_path` / `val_path` arguments; none of
    # the three paths is read by the active code below.
    train_path = 'emergent_train.csv'
    val_path = 'emergent_val.csv'
    test_path = 'emergent_test.csv'
    
    ##df = load_dataset_emergent(train_path)
    #dfVal = load_dataset_emergent(val_path)
    #dfTest = load_dataset_emergent(test_path)
    
    #df = pd.concat([df, dfVal], ignore_index=True)
    

    #train_path = './dataset/batches_cleaned/stance/Train_latest.tsv'
    #val_path = './dataset/batches_cleaned/stance/Val_latest.tsv'
    #test_path = './dataset/batches_cleaned/stance/Test_latest.tsv'

    # Not referenced anywhere else in this function.
    trainPer = 1.2
    valPer = 0.2
    testPer = 0.2
    
    # NOTE(review): the frames returned by `sample_dataset_stance` are replaced by
    # the three `load_dataset_stance` calls right after it. Presumably
    # `create_more_notrel_docs` / `sample_dataset_stance` write the TSV files that
    # are read below - confirm before removing these calls.
    df_all = load_dataset_stance('./dataset/batches_cleaned/stance/MergedDataset_20.06.2021.tsv')
    create_more_notrel_docs(df_all)
    df_all_added = load_dataset_stance('./dataset/batches_cleaned/stance/Final_Dataset_AddedNotRelated.tsv')
    df, dfVal, dfTest = sample_dataset_stance(df_all_added, seedVal)
    
    
    # Splits actually used for training / validation / testing.
    df = load_dataset_stance('./dataset/batches_cleaned/stance/train_serp.tsv')
    dfVal = load_dataset_stance('./dataset/batches_cleaned/stance/val_serp.tsv')
    dfTest = load_dataset_stance('./dataset/batches_cleaned/stance/test_serp.tsv')
    #dfTest = load_dataset_stance('./dataset/batches_cleaned/stance/test_serp.tsv')
    
    #dfTest = dfTest.append(df_all_neut, ignore_index = True)
    #dfTest.to_csv('./dataset/batches_cleaned/stance/test_serp_allneut.tsv', sep='\t', index=False)
    
    #df, dfTest = merge_datasets(df, dfVal, dfTest)
    
    #df = pd.concat([df, dfVal], ignore_index=True)
    
    # Per-split class counts; passed on to the train / test helpers.
    train_nums = count_class_num (df)
    val_nums = count_class_num (dfVal)
    test_nums = count_class_num (dfTest)
    
    # Ideology counts are not computed here (the calls are commented out below);
    # empty lists are passed through to the train / test helpers instead.
    train_nums_ideology = []
    val_nums_ideology = []
    test_nums_ideology = []
    
    #train_nums_ideology = count_class_num_ideology (df)
    #val_nums_ideology = count_class_num_ideology (dfVal)
    #test_nums_ideology = count_class_num_ideology (dfTest)

    
    # Placeholders: the label lists are reassigned by `generate_datasets` below;
    # the `sentencesQueryCont_*` lists are not used again in this function.
    sentencesQueryCont_Train = []
    labelsTrain = []
    
    labelsTrain_ideology = []
    labelsVal_ideology = []
    labelsTest_ideology = []

    #sentencesQueryTitle_Train, labelsTrain = generate_datasets_emergent (df)
    sentencesQueryTitle_Train, sentencesQueryTitleCont_Train, labelsTrain, labelsTrain_ideology = generate_datasets (df)

    
    #print(labelsTrain)

    sentencesQueryCont_Val = []
    labelsVal = []

    #--------------DATASETS-------------#
    
    #sentencesQueryTitle_Val, labelsVal = generate_datasets_emergent (dfVal)
    sentencesQueryTitle_Val, sentencesQueryTitleCont_Val, labelsVal, labelsVal_ideology = generate_datasets (dfVal)
    
    
    sentencesQueryCont_Test = []
    labelsTest = []
    
    #sentencesQueryTitle_Test, labelsTest = generate_datasets_emergent (dfTest)
    sentencesQueryTitle_Test, sentencesQueryTitleCont_Test, labelsTest, labelsTest_ideology = generate_datasets (dfTest)
    
        
    # Report the number of sentences.
    print('Number of training sentences: {:,}'.format(df.shape[0]))
    print('Number of val sentences: {:,}'.format(dfVal.shape[0]))
    print('Number of test sentences: {:,}'.format(dfTest.shape[0]))

    # Tokenize the `...TitleCont` sentence lists; the `...Title` lists returned by
    # `generate_datasets` are only used by the commented-out alternative below.
    all_input_ids_Train, all_input_masks_Train  = preprocessing_for_bert(tokenizer, sentencesQueryTitleCont_Train, max_len, doc_stride) #train
    all_input_ids_Val, all_input_masks_Val = preprocessing_for_bert(tokenizer, sentencesQueryTitleCont_Val, max_len, doc_stride) #val
    all_input_ids_Test, all_input_masks_Test = preprocessing_for_bert(tokenizer, sentencesQueryTitleCont_Test, max_len, doc_stride) #test
    
    
    #all_input_ids_Train, all_input_masks_Train, a, b  = transform_sequences_longer(tokenizer, sentencesQueryTitle_Train, labelsTrain, labelsTrain_ideology, max_len, doc_stride) #train
    #all_input_ids_Val, all_input_masks_Val, c, d = transform_sequences_longer(tokenizer, sentencesQueryTitle_Val, labelsVal, labelsVal_ideology, max_len, doc_stride) #train
    #all_input_ids_Test, all_input_masks_Test, e, f = transform_sequences_longer(tokenizer, sentencesQueryTitle_Test, labelsTest, labelsTest_ideology, max_len, doc_stride) #train

    #--------------TRAINING-------------#
    

    
    model, train_dataloader, validation_dataloader, optimizer, scheduler = prepare_for_training_stance_ideology_paper(all_input_ids_Train, all_input_masks_Train, labelsTrain, labelsTrain_ideology, all_input_ids_Val,
                                                                                                              all_input_masks_Val, labelsVal, labelsVal_ideology, model_current, batch_size, epochs, num_warmup_steps, learning_rate)    
    # `patience`, `verbose` and `delta` come from module-level globals.
    training_stats, last_epoch, min_val_loss, max_val_acc = train_stance_ideology_paper (train_nums, val_nums, train_nums_ideology, val_nums_ideology, model_save_path, model, train_dataloader, validation_dataloader, epochs, batch_size, optimizer,
                                                                      scheduler, patience, verbose, delta, seedVal, False)
    #
    #model, train_dataloader, validation_dataloader, optimizer, scheduler = prepare_for_training_stance_ideology(all_input_ids_Train, all_input_masks_Train, labelsTrain, labelsTrain_ideology, all_input_masks_Train,
    #                                                                                                           all_input_ids_Train, labelsTrain, labelsTrain_ideology, model_current, batch_size, epochs, num_warmup_steps, learning_rate)    
    #training_stats, last_epoch, min_val_loss, max_val_acc = train_stance_ideology_novel (train_nums, train_nums, train_nums_ideology, train_nums_ideology, model_save_path, model, tokenizer, train_dataloader, validation_dataloader, epochs, batch_size, optimizer,
    #                                                                      scheduler, patience, verbose, delta, seedVal, False)

        
    #df_stats = print_summary(training_stats)
    #plot_results(df_stats, last_epoch)

    #print('Min Val Loss: ' + str(min_val_loss))
    #print('Max Val Acc: ' + str(max_val_acc))
    #test_loss, test_acc, avg_agree_test_acc, avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc = run_test_stance_ideology(train_nums, train_nums_ideology, model_save_path, all_input_ids_Train, all_input_masks_Train, labelsTrain, labelsTrain_ideology)
    
    
    #print('Test Loss: ' + str(test_loss))
    #print('Test Stance Acc: ' + str(test_acc))
    
    #print('Agree Class Acc: ' + str(avg_agree_test_acc))
    #print('Disagree Class Acc: ' + str(avg_disagree_test_acc))
    #print('Discuss Class Acc: ' + str(avg_discuss_test_acc))
    #print('Unrelated Class Acc: ' + str(avg_unrelated_test_acc))
    
    #test_loss, test_acc, avg_agree_test_acc, avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc = run_test_stance_ideology(val_nums, val_nums_ideology, model_save_path, all_input_ids_Val, all_input_masks_Val, labelsVal, labelsVal_ideology)
    
    
    #print('Test Loss: ' + str(test_loss))
    #print('Test Stance Acc: ' + str(test_acc))
    
    #print('Agree Class Acc: ' + str(avg_agree_test_acc))
    #print('Disagree Class Acc: ' + str(avg_disagree_test_acc))
    #print('Discuss Class Acc: ' + str(avg_discuss_test_acc))
    #print('Unrelated Class Acc: ' + str(avg_unrelated_test_acc))

    # Evaluate on the held-out test split. The last returned value (`arr`) is
    # not used afterwards in this function.
    test_loss, test_acc, avg_agree_test_acc, avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc, arr = run_test_stance_ideology(test_nums, test_nums_ideology, model_current, model_save_path, all_input_ids_Test, 
                                                                                                                                            all_input_masks_Test, labelsTest, labelsTest_ideology)
    
    print('Test Loss: ' + str(test_loss))
    print('Test Stance Acc: ' + str(test_acc))
    
    print('Agree Class Acc: ' + str(avg_agree_test_acc))
    print('Disagree Class Acc: ' + str(avg_disagree_test_acc))
    print('Discuss Class Acc: ' + str(avg_discuss_test_acc))
    print('Unrelated Class Acc: ' + str(avg_unrelated_test_acc))

In [None]:
from numpy import nan
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.utils import resample


def _serp2_sentences_for_model(idxModel, title_sentences, title_cont_sentences):
    """Return the sentence list a base model is fed: `...TitleCont` for odd indices, `...Title` for even ones."""
    return title_cont_sentences if idxModel % 2 == 1 else title_sentences


def run_wholeprocess_stance_serp2(train_path, val_path, max_len, doc_stride, batch_size, num_warmup_steps, learning_rate, epochs, seedVal):
    """Train a stacked ensemble for SERP stance detection and report test metrics.

    Pipeline:
      1. Load ``Final_Dataset_AddedNotRelated.tsv`` and split it 80/20 into a
         training and a test frame (written to ``train_serp.tsv`` /
         ``test_serp.tsv``).
      2. For each of ``k_fold`` rounds, draw a random ``1/k_fold`` validation
         hold-out from the training frame, fine-tune every base model in
         ``model_list`` on the rest, and store the validation predictions and
         encoded targets of that round as TSV files.
      3. Read the per-round files back, concatenate them and train the
         meta-learner on a 75/25 split of these stacked features.
      4. Run every base model (checkpoints of the last round) on the test
         frame, feed the concatenated predictions to the meta-learner and print
         the resulting metrics.

    Even-indexed base models receive the ``sentencesQueryTitle_*`` inputs,
    odd-indexed ones the ``sentencesQueryTitleCont_*`` inputs, during training,
    validation and testing alike.

    Args:
        train_path: Unused; kept for signature compatibility with the other
            ``run_wholeprocess_*`` entry points.
        val_path: Unused; kept for signature compatibility.
        max_len: Forwarded to ``preprocessing_for_bert``.
        doc_stride: Forwarded to ``preprocessing_for_bert``.
        batch_size: Forwarded to the prepare/train helpers.
        num_warmup_steps: Forwarded to the prepare helpers.
        learning_rate: Forwarded to the prepare helpers.
        epochs: Forwarded to the prepare/train helpers.
        seedVal: Random state of the initial train/test split. It is replaced
            by a fresh ``get_random_seed()`` value in every CV round, and the
            last such value seeds the meta-learner split and training.

    Returns:
        None. Results are printed and intermediate artefacts written to disk.

    Note:
        ``patience``, ``verbose`` and ``delta`` are read from module-level
        globals and must be defined before this function is called.
    """
    # Called for its GPU check and log output; the device itself is not needed here.
    run_utils()

    # Ideology class counts are not computed in this pipeline; zeros are passed through.
    train_nums_ideology = 0
    val_nums_ideology = 0
    test_nums_ideology = 0

    #--------------LOAD DATASETS--------------#

    df_all_added = load_dataset_stance('./dataset/batches_cleaned/stance/Final_Dataset_AddedNotRelated.tsv')
    y = df_all_added['stance'].copy(deep=True)
    X = df_all_added.drop('stance', axis=1).copy(deep=True)

    print(seedVal)
    # Split train and test datasets once.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=seedVal)

    train_nums = count_class_num(df_all_added)
    print('Number of all sentences: ', train_nums)

    dfTrain = X_train.copy(deep=True)
    dfTest = X_test.copy(deep=True)

    # Re-insert the label as the third column of each split.
    dfTrain.insert(2, "stance", y_train.values)
    dfTest.insert(2, "stance", y_test.values)

    dfTrain.to_csv('./dataset/batches_cleaned/stance/train_serp.tsv', sep='\t', index=False)
    dfTest.to_csv('./dataset/batches_cleaned/stance/test_serp.tsv', sep='\t', index=False)

    test_nums = count_class_num(dfTest)
    print('Number of test sentences: {:,}'.format(dfTest.shape[0]))

    k_fold = 5
    model_list = ['xlnet-base-cased', 'xlnet-base-cased']
    num_base_models = len(model_list)

    #--------------BASE MODELS (CROSS VALIDATION)-------------#

    for idx in range(0, k_fold):
        print("******************")
        print("Training is starting for cross validation dataset " + str(idx))
        print("******************")

        # NOTE(review): this replaces the caller-supplied seed, so the CV rounds
        # (and everything seeded afterwards) differ from run to run.
        seedVal = get_random_seed()
        print(seedVal)

        # NOTE(review): each round is an independent random hold-out rather than
        # one part of a k-way partition, so validation rows can repeat across rounds.
        X_fold, X_fold_val, y_fold, y_fold_val = train_test_split(X_train, y_train, test_size=1.0/k_fold, shuffle=True, random_state=seedVal)

        df = X_fold.copy(deep=True)
        dfVal = X_fold_val.copy(deep=True)

        df.insert(2, "stance", y_fold.values)
        dfVal.insert(2, "stance", y_fold_val.values)

        df.to_csv('./dataset/batches_cleaned/stance/train_serp' + "_" + str(idx) + ".tsv", sep='\t', index=False)
        dfVal.to_csv('./dataset/batches_cleaned/stance/val_serp' + "_" + str(idx) + ".tsv", sep='\t', index=False)

        train_nums = count_class_num(df)
        val_nums = count_class_num(dfVal)

        sentencesQueryTitle_Train, sentencesQueryTitleCont_Train, labelsTrain, labelsTrain_ideology = generate_datasets(df)
        sentencesQueryTitle_Val, sentencesQueryTitleCont_Val, labelsVal, labelsVal_ideology = generate_datasets(dfVal)

        encoded_current_val_targets = preprocess_stance_ideology_meta(labelsVal)

        # Report the number of sentences.
        print('Number of training sentences: ', train_nums)
        print('Number of val sentences: ', val_nums)

        current_val_fold_all_predictions = None

        for idxModel in range(0, num_base_models):
            model_current = model_list[idxModel]
            tokenizer = load_tokenizer(model_current)
            model_save_path = './models/BERT_SERP/model_finetuned' + "_" + str(idxModel) + "_" + str(idx)

            # Tokenize the input representation that belongs to this base model.
            train_sentences = _serp2_sentences_for_model(idxModel, sentencesQueryTitle_Train, sentencesQueryTitleCont_Train)
            val_sentences = _serp2_sentences_for_model(idxModel, sentencesQueryTitle_Val, sentencesQueryTitleCont_Val)
            all_input_ids_Train, all_input_masks_Train = preprocessing_for_bert(tokenizer, train_sentences, max_len, doc_stride)
            all_input_ids_Val, all_input_masks_Val = preprocessing_for_bert(tokenizer, val_sentences, max_len, doc_stride)

            #--------------TRAINING-------------#

            model, train_dataloader, validation_dataloader, optimizer, scheduler = prepare_for_training_stance_ideology_paper(
                all_input_ids_Train, all_input_masks_Train, labelsTrain, labelsTrain_ideology,
                all_input_ids_Val, all_input_masks_Val, labelsVal, labelsVal_ideology,
                model_current, batch_size, epochs, num_warmup_steps, learning_rate)
            training_stats, last_epoch, min_val_loss, max_val_acc = train_stance_ideology_paper(
                train_nums, val_nums, train_nums_ideology, val_nums_ideology, model_save_path, model,
                train_dataloader, validation_dataloader, epochs, batch_size, optimizer,
                scheduler, patience, verbose, delta, seedVal, False)

            val_loss, val_acc, avg_agree_val_acc, avg_disagree_val_acc, avg_discuss_val_acc, avg_unrelated_val_acc, val_predictions = run_test_stance_ideology(
                val_nums, val_nums_ideology, model_current, model_save_path,
                all_input_ids_Val, all_input_masks_Val, labelsVal, labelsVal_ideology)

            # Stack the predictions of the base models side by side (one column group per model).
            print(len(val_predictions))
            if idxModel == 0:
                current_val_fold_all_predictions = val_predictions
            else:
                current_val_fold_all_predictions = np.concatenate((current_val_fold_all_predictions, val_predictions), axis=1)

            print(current_val_fold_all_predictions.shape)

        print("**************")
        print(current_val_fold_all_predictions)

        # Persist this round's meta-features and targets (no header row, no index column).
        filename_fold_predictions = './dataset/batches_cleaned/stance/cv_set_val_predictions_' + str(idx) + '.tsv'
        pd.DataFrame(current_val_fold_all_predictions).to_csv(filename_fold_predictions, sep='\t', header=None, index=None)

        filename_fold_targets = './dataset/batches_cleaned/stance/cv_set_val_targets_' + str(idx) + '.tsv'
        pd.DataFrame(encoded_current_val_targets).to_csv(filename_fold_targets, sep='\t', header=None, index=None)

    #--------------META LEARNER-------------#

    # Read the per-round files back. They were written without a header row, so
    # they must be read with header=None; header=0 would consume the first data
    # row of every round as column names and silently drop it.
    fold_prediction_frames = []
    fold_target_frames = []
    for idx in range(0, k_fold):
        filename_curr = './dataset/batches_cleaned/stance/cv_set_val_predictions_' + str(idx) + '.tsv'
        fold_prediction_frames.append(pd.read_csv(filename_curr, sep='\t', header=None, dtype='float64'))

        filename_fold_targets = './dataset/batches_cleaned/stance/cv_set_val_targets_' + str(idx) + '.tsv'
        fold_target_frames.append(pd.read_csv(filename_fold_targets, sep='\t', header=None, dtype='int64'))

    all_val_folds_all_predictions = pd.concat(fold_prediction_frames, ignore_index=True)
    all_val_encoded_targets = pd.concat(fold_target_frames, ignore_index=True)

    print("---------------")
    print(all_val_folds_all_predictions.shape)
    print(all_val_encoded_targets.shape)
    print("Meta Learning Training is Starting...")

    new_df = pd.DataFrame(all_val_folds_all_predictions)
    new_y = pd.DataFrame(all_val_encoded_targets)

    # Keep at most the last 16 feature columns (all of them for the current model_list).
    new_df = new_df.iloc[:, -16:]

    X, X_val, y, y_val = train_test_split(new_df, new_y, test_size=0.25, shuffle=True, random_state=seedVal, stratify=new_y)

    print(y)
    y = y.replace(-1, 1)
    y_val = y_val.replace(-1, 1)

    print(y)

    # NOTE(review): the original author flagged "There is a problem here!".
    # `count_class_num` is otherwise called on frames with a 'stance' column,
    # whereas `y` / `y_val` hold the encoded targets - confirm it handles them.
    train_nums = count_class_num(y)
    val_nums = count_class_num(y_val)

    print(train_nums)
    print(val_nums)

    model_save_path_metalearner = './models/BERT_SERP/metalearner'

    model, train_dataloader, validation_dataloader, optimizer, scheduler = prepare_for_training_stance_ideology_metalearner(
        X, y, X_val, y_val, batch_size, epochs, num_warmup_steps, learning_rate)
    training_stats, last_epoch, min_val_loss, max_val_acc = train_stance_ideology_metalearner(
        train_nums, val_nums, train_nums_ideology, val_nums_ideology, model_save_path_metalearner, model,
        train_dataloader, validation_dataloader, epochs, batch_size, optimizer,
        scheduler, patience, verbose, delta, seedVal, False)

    #--------------TESTING-------------#

    print("Testing phase is starting...")
    # Apply the same process to the test set.
    sentencesQueryTitle_Test, sentencesQueryTitleCont_Test, labelsTest, labelsTest_ideology = generate_datasets(dfTest)

    current_test_fold_all_predictions = None
    for idxModel in range(0, num_base_models):
        model_current = model_list[idxModel]
        tokenizer = load_tokenizer(model_current)

        # Feed each base model the same input representation it was trained on.
        test_sentences = _serp2_sentences_for_model(idxModel, sentencesQueryTitle_Test, sentencesQueryTitleCont_Test)
        all_input_ids_Test, all_input_masks_Test = preprocessing_for_bert(tokenizer, test_sentences, max_len, doc_stride)

        # Use the checkpoints saved in the last CV round.
        model_save_path = './models/BERT_SERP/model_finetuned' + "_" + str(idxModel) + "_" + str(k_fold - 1)

        print("Current best model: " + str(model_save_path))

        test_loss, test_acc, avg_agree_test_acc, avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc, test_predictions = run_test_stance_ideology(
            test_nums, test_nums_ideology, model_current, model_save_path,
            all_input_ids_Test, all_input_masks_Test, labelsTest, labelsTest_ideology)

        if idxModel == 0:
            current_test_fold_all_predictions = test_predictions
        else:
            current_test_fold_all_predictions = np.concatenate((current_test_fold_all_predictions, test_predictions), axis=1)

    print(current_test_fold_all_predictions.shape)
    print(labelsTest.shape)

    print("Testing Meta phase is starting...")

    test_loss, test_acc, avg_agree_test_acc, avg_disagree_test_acc, avg_discuss_test_acc, avg_unrelated_test_acc = run_test_stance_ideology_meta(
        test_nums, test_nums_ideology, model_current, model_save_path_metalearner,
        current_test_fold_all_predictions, labelsTest, labelsTest_ideology)

    print("****************")
    print('Test Loss: ' + str(test_loss))
    print('Test Stance Acc: ' + str(test_acc))

    print('Agree Class Acc: ' + str(avg_agree_test_acc))
    print('Disagree Class Acc: ' + str(avg_disagree_test_acc))
    print('Discuss Class Acc: ' + str(avg_discuss_test_acc))
    print('Unrelated Class Acc: ' + str(avg_unrelated_test_acc))

In [None]:
def create_determinism(seedVal):
    """Seed every random number generator used by the notebook and request deterministic cuDNN.

    Args:
        seedVal: Integer seed applied to PyTorch (CPU and all CUDA devices),
            NumPy's legacy global RNG and Python's ``random`` module.

    Returns:
        None. Only global RNG / backend state is modified.
    """
    torch.manual_seed(seedVal)
    # Seed every visible GPU, not only the current one; this is a no-op without CUDA.
    torch.cuda.manual_seed_all(seedVal)
    np.random.seed(seedVal)
    random.seed(seedVal)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so it has
    # to be switched off as well for repeatable results.
    torch.backends.cudnn.benchmark = False

In [None]:
#%rm -rf "./runs/"

In [None]:
#import zipfile
#with zipfile.ZipFile('./dataset/file.zip', 'r') as zip_ref:
    #zip_ref.extractall('./dataset/myFolder/')

In [None]:
#!pip install import_ipynb

In [None]:
import os
import string
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import random
from random import randint
import time
import datetime
from transformers import AutoModel
from transformers import DistilBertModel
from torch.utils.data import TensorDataset, random_split
from sklearn.model_selection import train_test_split

import import_ipynb
import module_mix
from module_mix import MixLinear

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#model = "bert-base-uncased"
# Dataset locations handed positionally to the pipeline entry point below.
# NOTE(review): run_wholeprocess_stance_serp and run_wholeprocess_stance_serp2
# overwrite these two parameters internally; whether
# run_wholeprocess_stance_serp_DT honours them is not visible from this cell.
train_path = './dataset/fnc/train'
val_path = './dataset/fnc/test'

# Module-level default; run_wholeprocess_stance_serp / _serp2 assign their own
# local `model_save_path` and do not read this one.
model_save_path = './model_save'


# Tokenization settings (forwarded to preprocessing_for_bert by the pipelines).
max_len = 512
doc_stride = 0

# Optimisation hyper-parameters passed to the pipeline entry point.
batch_size = 16
epochs = 100
num_warmup_steps = 10
learning_rate = 1e-5
##-----Early Stopping
# These three are NOT passed as arguments: run_wholeprocess_stance_serp and
# run_wholeprocess_stance_serp2 read them as globals when calling the train_*
# helpers, so they must be defined before any of those functions run.
patience = 10
verbose = True
delta = 0.000001
#seedVal = 42

# A fresh seed is drawn from os.urandom on every execution of this cell and
# printed to the log.
seedVal = get_random_seed()
print(seedVal)

#value = randint(0, 100)


#0 - tinybert
#1 - distilbert
#2 - bert

#bert_type = 2

# create_determinism is disabled, so the torch / numpy / random generators are
# not seeded with `seedVal` from this cell.
#create_determinism(seedVal)

#folder_preparations()
# Entry point of this run; the alternative pipeline below is commented out.
run_wholeprocess_stance_serp_DT(train_path, val_path, max_len, doc_stride, batch_size, num_warmup_steps, learning_rate, epochs, seedVal)
#run_wholeprocess_stance_serp(train_path, val_path, max_len, doc_stride, batch_size, num_warmup_steps, learning_rate, epochs, seedVal)
#create_more_notrel_docs()