This is a Transformer Tool used to calculate the word-level surprisal. Code is based on
@article{michaelov_2022_AnaphoricZeroPronouns,
  title={Do language models make human-like predictions about the coreferents of Italian anaphoric zero pronouns?},
  author={Michaelov, James A. and Bergen, Benjamin K.},
  journal={arXiv preprint arXiv:2208.14554},
  year={2022}
}

psychformers general from github


In [2]:
import os
import argparse
from transformers import AutoTokenizer,AutoModelForCausalLM,AutoModelForMaskedLM
from torch.nn import functional as F
import torch
import numpy as np
import copy



def parse_args():
    parser = argparse.ArgumentParser(description='Calculates surprisal and other metrics...')

    parser.add_argument('--stimuli', '-i', type=str, default="/content/drive/MyDrive/PsychFormers/Stimuli.txt",
                        help='stimuli to test')
    parser.add_argument('--stimuli_list', '-ii', type=str, default=None,
                        help='path to file containing list of stimulus files to test')
    parser.add_argument('--output_directory', '-o', type=str, default='/content/drive/MyDrive/PsychFormers/output',
                        help='output directory')
    parser.add_argument('--primary_decoder', '-d', type=str, default='causal',
                        help='for models with both masked and causal versions, determine which to use (default is masked)')
    parser.add_argument('--model', '-m', type=str, default="gpt2-large",
                        help='select a model to use')
    parser.add_argument('--model_list', '-mm', type=str, default=None,
                        help='path to file with a list of models to run')
    parser.add_argument('--task', '-t', type=str, default="surprisal",
                        help='metric to caclulate')
    parser.add_argument('--task_list', '-tt', type=str, default=None,
                        help='path to file with list of metrics to caclulate')
    parser.add_argument('--following_context', '-f', action="store_true", default=False,
                        help='whether or not consider the following context with masked language models (default is False)')
    parser.add_argument('--use_cpu', '-cpu', action="store_true", default=False,
                        help='use CPU for models even if CUDA is available')

    # Create an argparse.Namespace object with the desired arguments
    args = argparse.Namespace(
        stimuli="/content/drive/MyDrive/PsychFormers/Stimuli.txt",
        stimuli_list=None,  # Or provide a path if needed
        output_directory='/content/drive/MyDrive/PsychFormers/output',
        primary_decoder="causal",
        model="gpt2-large",
        model_list=None,  # Or provide a path if needed
        task="surprisal",
        task_list=None,  # Or provide a path if needed
        following_context=False,
        use_cpu=False
    )

    return args

def process_args(args):
    try:
        output_directory = args.output_directory
    except:
        print("Error: Please specify a valid output directory.")

    if not os.path.exists(output_directory):
        try:
            os.makedirs(output_directory)
        except:
            print("Error: Cannot create output directory (Note: output directory does not already exist).")

    try:
        primary_decoder = args.primary_decoder
        assert primary_decoder=="causal" or primary_decoder=="masked"
    except:
        print("Error: Please select either 'causal' or 'masked' for primary decoder argument.")

    try:
        include_following_context = args.following_context
        assert type(include_following_context)==bool
    except:
        print("Error: 'following_context' argument must be Boolean.")

    try:
        cpu = args.use_cpu
        assert type(cpu)==bool
    except:
        print("Error: 'use_cpu' argument must be Boolean.")

    if args.model_list:
        try:
            assert os.path.exists(args.model_list)
            with open(args.model_list, "r") as f:
                model_list = f.read().splitlines()
        except:
            print("Error: 'model_list' argument does not have a valid path. Trying to use individual specified model.")
            try:
                assert args.model
                model_list = [args.model]
            except:
                print("Error: No model specified")
    else:
        try:
            assert args.model
            model_list = [args.model]
        except:
            print("Error: No model specified")



    if args.task_list:
        try:
            assert os.path.exists(args.task_list)
            with open(args.task_list, "r") as f:
                metric_list = f.read().splitlines()
        except:
            print("Error: 'metric_list' argument does not have a valid path. Trying to use individual specified metric.")
            try:
                assert args.task
                metric_list = [args.task]
            except:
                print("Error: No metric specified")
    else:
        try:
            assert args.task
            metric_list = [args.task]
        except:
            print("Error: No metric specified")


    if args.stimuli_list:
        try:
            assert os.path.exists(args.stimuli_list)
            with open(args.stimuli_list, "r") as f:
                stimulus_file_list = f.read().splitlines()
        except:
            print("Error: 'stimuli_list' argument does not have a valid path. Trying to use individual stimulus set.")
            try:
                assert args.stimuli
                stimulus_file_list = [args.stimuli]
            except:
                print("Error: No stimuli specified")
    else:
        try:
            assert args.stimuli
            stimulus_file_list = [args.stimuli]
        except:
            print("Error: No stimuli specified")

    return(output_directory,primary_decoder,include_following_context,model_list,metric_list,stimulus_file_list,cpu)

def create_and_run_models(model_list,stimulus_file_list,metric_list,primary_decoder,output_directory,include_following_context,cpu):
    if primary_decoder == "masked":
        for model_name in model_list:

            model_name_cleaned = model_name.replace("/","-")

            if 'tokenizer' in locals():
                del(tokenizer)

            if 'model' in locals():
                del(model)

            try:
                tokenizer = AutoTokenizer.from_pretrained(model_name)

                if (not tokenizer.bos_token) and (tokenizer.cls_token):
                    tokenizer.bos_token = tokenizer.cls_token
                if (not tokenizer.eos_token) and (tokenizer.sep_token):
                    tokenizer.eos_token = tokenizer.sep_token

                tokenizer.add_tokens(["[!StimulusMarker!]"," [!StimulusMarker!]"])

            except:
                print("Cannot create a tokenizer for model {0}".format(model_name))

            try:
                model = AutoModelForMaskedLM.from_pretrained(model_name)
                model_type = "masked"
            except:
                try:
                    model = AutoModelForCausalLM.from_pretrained(model_name,is_decoder=True)
                    model_type = "causal"
                except:
                    print("Model {0} is not a masked or causal language model. This is not supported".format(model_name))
            try:
                assert model and tokenizer
                if model and tokenizer:
                    try:
                        if model_type=="causal":
                            process_stims_causal(model.to("cuda" if (torch.cuda.is_available() and not cpu) else "cpu"),tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context)
                        elif model_type=="masked":
                            process_stims_masked(model.to("cuda" if (torch.cuda.is_available() and not cpu) else "cpu"),tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context)
                    except:
                        print("Cannot run either a masked or causal form of {0}".format(model_name))
            except:
                print("Cannot run experiment without both a tokenizer for and a causal or masked form of {0}".format(model_name))

    elif primary_decoder == "causal":
        for model_name in model_list:

            model_name_cleaned = model_name.replace("/","-")

            if 'tokenizer' in locals():
                del(tokenizer)

            if 'model' in locals():
                del(model)

            try:
                tokenizer = AutoTokenizer.from_pretrained(model_name)

                if (not tokenizer.bos_token) and (tokenizer.cls_token):
                    tokenizer.bos_token = tokenizer.cls_token
                if (not tokenizer.eos_token) and (tokenizer.sep_token):
                    tokenizer.eos_token = tokenizer.sep_token

                tokenizer.add_tokens(["[!StimulusMarker!]"," [!StimulusMarker!]"])

            except:
                print("Cannot create a tokenizer for model {0}".format(model_name))

            try:
                model = AutoModelForCausalLM.from_pretrained(model_name,is_decoder=True)
                model_type = "causal"
                if "Masked" in model.config.architectures[0]:
                    model_type = "causal_mask"
            except:
                try:
                    model = AutoModelForMaskedLM.from_pretrained(model_name)
                    model_type = "masked"
                except:
                    print("Model {0} is not a causal or masked language model. This is not supported".format(model_name))
            try:
                assert model and tokenizer
                if model and tokenizer:
                    try:
                        if model_type=="causal":
                            process_stims_causal(model.to("cuda" if (torch.cuda.is_available() and not cpu) else "cpu"),tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context)
                        elif model_type=="masked":
                            process_stims_masked(model.to("cuda" if (torch.cuda.is_available() and not cpu) else "cpu"),tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context)
                        elif model_type=="causal_mask":
                            process_stims_causal_mask(model.to("cuda" if (torch.cuda.is_available() and not cpu) else "cpu"),tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context)

                    except:
                        print("Cannot run either a causal or masked form of {0}".format(model_name))
            except:
                print("Cannot run experiment without both a tokenizer for and a causal or masked form of {0}".format(model_name))


def process_stims_causal(model,tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context):
    for i in range(len(stimulus_file_list)):
        stimuli_name = stimulus_file_list[i].split('/')[-1].split('.')[0]

        if "surprisal" in metric_list:
            filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".causal.output"
            with open(filename,"w") as f:
                f.write("FullSentence\tSentence\tTargetWords\tSurprisal\tNumTokens\n")

        with open(stimulus_file_list[i],'r') as f:
            stimulus_list = f.read().splitlines()
        for j in range(len(stimulus_list)):
            try:
                stimulus = stimulus_list[j]
                stimulus_spaces = stimulus.replace("*", "[!StimulusMarker!]")
                stimulus_spaces = stimulus_spaces.replace(" [!StimulusMarker!]", "[!StimulusMarker!] ")
                encoded_stimulus = tokenizer.encode(stimulus_spaces)

                if (len(tokenizer.tokenize("a[!StimulusMarker!]"))==2):
                    dummy_var_idxs = np.where((np.array(encoded_stimulus)==tokenizer.encode("[!StimulusMarker!]")[-1]) | (np.array(encoded_stimulus)==tokenizer.encode("a[!StimulusMarker!]")[-1]))[0]
                    preceding_context = encoded_stimulus[:dummy_var_idxs[0]]
                    if (len(preceding_context)==0) or (not ((preceding_context[0]==tokenizer.bos_token_id) or (preceding_context[0]==tokenizer.eos_token_id))):
                        preceding_context = [tokenizer.bos_token_id] + preceding_context
                    target_words = encoded_stimulus[dummy_var_idxs[0]+1:dummy_var_idxs[1]]
                    following_words = encoded_stimulus[dummy_var_idxs[1]+1:]

                    if "[!StimulusMarker!] " in stimulus_spaces and tokenizer.decode(target_words)[0]!=" ":
                        target_words_decoded = " " +tokenizer.decode(target_words)
                        target_words = tokenizer.encode(target_words_decoded)
                        if tokenizer.bos_token_id  in target_words:
                            target_words.remove(tokenizer.bos_token_id)
                        if tokenizer.eos_token_id  in target_words:
                            target_words.remove(tokenizer.eos_token_id)

                    if "surprisal" in metric_list:
                        get_surprisal_causal(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,stimulus)
            except:
                print("Problem with stimulus on line {0}: {1}\n".format(str(j+1),stimulus_list[j]))


def get_surprisal_causal(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,stimulus):
    filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".causal.output"
    current_context = copy.deepcopy(preceding_context)
    all_probabilities = []
    for i in range(len(target_words)):
        current_target = target_words[i]
        input = torch.LongTensor([current_context]).to(model.device)
        with torch.no_grad():
            next_token_logits = model(input, return_dict=True).logits[:, -1, :]
        probs = F.softmax(next_token_logits,dim=-1)
        probability = probs[0,current_target]
        current_context.append(current_target)
        all_probabilities.append(probability.item())
    all_probabilities = np.array(all_probabilities)
    num_tokens = len(all_probabilities)
    sum_surprisal = np.sum(-np.log2(all_probabilities))
    sentence = tokenizer.decode(preceding_context[1:]+target_words)
    full_sentence = tokenizer.decode(preceding_context[1:]+target_words+following_words)
    target_string = tokenizer.decode(target_words)
    with open(filename,"a") as f:
        f.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
            stimulus.replace("*",""),
            sentence,
            target_string,
            sum_surprisal,
            num_tokens
        ))


def process_stims_masked(model,tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context):
    for i in range(len(stimulus_file_list)):
        stimuli_name = stimulus_file_list[i].split('/')[-1].split('.')[0]

        if "surprisal" in metric_list:
            filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".masked.output"
            with open(filename,"w") as f:
                f.write("FullSentence\tSentence\tTargetWords\tSurprisal\tNumTokens\n")

        with open(stimulus_file_list[i],'r') as f:
            stimulus_list = f.read().splitlines()
        for j in range(len(stimulus_list)):
            try:
                stimulus = stimulus_list[j]
                stimulus_spaces = stimulus.replace("*", "[!StimulusMarker!]")
                if (tokenizer.tokenize(" a")[0][0]==tokenizer.tokenize(" b")[0][0]) and (tokenizer.tokenize("a")[0][0]!=tokenizer.tokenize("b")[0][0]):
                    stimulus_spaces = stimulus_spaces.replace(" [!StimulusMarker!]", "[!StimulusMarker!] ")
                else:
                    stimulus_spaces = stimulus_spaces.replace("[!StimulusMarker!]", "[!StimulusMarker!] ")
                    stimulus_spaces = stimulus_spaces.replace(" [!StimulusMarker!]", "[!StimulusMarker!]")
                encoded_stimulus = tokenizer.encode(stimulus_spaces)[1:-1]

                if (len(tokenizer.tokenize("a[!StimulusMarker!]"))==2):
                    dummy_var_idxs = np.where((np.array(encoded_stimulus)==tokenizer.encode("[!StimulusMarker!]")[-2]) | (np.array(encoded_stimulus)==tokenizer.encode("a[!StimulusMarker!]")[-2]))[0]
                    preceding_context = encoded_stimulus[:dummy_var_idxs[0]]
                    if (len(preceding_context)==0) or (not preceding_context[0]==tokenizer.bos_token_id):
                        preceding_context = [tokenizer.bos_token_id] + preceding_context
                    target_words = encoded_stimulus[dummy_var_idxs[0]+1:dummy_var_idxs[1]]
                    following_words = encoded_stimulus[dummy_var_idxs[1]+1:]
                    if "surprisal" in metric_list:
                        get_surprisal_masked(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,include_following_context,stimulus)
            except:
                print("Problem with stimulus on line {0}: {1}\n".format(str(j+1),stimulus_list[j]))

def get_surprisal_masked(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,include_following_context,stimulus):
    filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".masked.output"
    current_context = copy.deepcopy(preceding_context)
    all_probabilities = []
    for i in range(len(target_words)):
        current_target = target_words[i]
        context_plus_mask = current_context + [tokenizer.mask_token_id]
        if include_following_context==True:
            context_plus_mask = context_plus_mask + following_words
        model_input_list = context_plus_mask+[tokenizer.eos_token_id]
        mask_idx = model_input_list.index(tokenizer.mask_token_id)
        input = torch.LongTensor([model_input_list]).to(model.device)
        with torch.no_grad():
            next_token_logits = model(input, return_dict=True).logits[:, mask_idx, :]
        probs = F.softmax(next_token_logits,dim=-1)
        probability = probs[0,current_target]
        current_context.append(current_target)
        all_probabilities.append(probability.item())
    all_probabilities = np.array(all_probabilities)
    num_tokens = len(all_probabilities)
    sum_surprisal = np.sum(-np.log2(all_probabilities))
    sentence = tokenizer.decode(preceding_context[1:]+target_words)
    full_sentence = tokenizer.decode(preceding_context[1:]+target_words+following_words)
    target_string = tokenizer.decode(target_words)
    with open(filename,"a") as f:
        f.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
            stimulus.replace("*",""),
            sentence,
            target_string,
            sum_surprisal,
            num_tokens
        ))

def process_stims_causal_mask(model,tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context):
    for i in range(len(stimulus_file_list)):
        stimuli_name = stimulus_file_list[i].split('/')[-1].split('.')[0]

        if "surprisal" in metric_list:
            filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".causal_mask.output"
            with open(filename,"w") as f:
                f.write("FullSentence\tSentence\tTargetWords\tSurprisal\tNumTokens\n")

        with open(stimulus_file_list[i],'r') as f:
            stimulus_list = f.read().splitlines()
        for j in range(len(stimulus_list)):
            try:
                stimulus = stimulus_list[j]
                stimulus_spaces = stimulus.replace("*", "[!StimulusMarker!]")
                if (tokenizer.tokenize(" a")[0][0]==tokenizer.tokenize(" b")[0][0]) and (tokenizer.tokenize("a")[0][0]!=tokenizer.tokenize("b")[0][0]):
                    stimulus_spaces = stimulus_spaces.replace(" [!StimulusMarker!]", "[!StimulusMarker!] ")
                else:
                    stimulus_spaces = stimulus_spaces.replace("[!StimulusMarker!]", "[!StimulusMarker!] ")
                    stimulus_spaces = stimulus_spaces.replace(" [!StimulusMarker!]", "[!StimulusMarker!]")
                encoded_stimulus = tokenizer.encode(stimulus_spaces)[1:-1]

                if (len(tokenizer.tokenize("a[!StimulusMarker!]"))==2):
                    dummy_var_idxs = np.where((np.array(encoded_stimulus)==tokenizer.encode("[!StimulusMarker!]")[-2]) | (np.array(encoded_stimulus)==tokenizer.encode("a[!StimulusMarker!]")[-2]))[0]
                    preceding_context = encoded_stimulus[:dummy_var_idxs[0]]
                    if (len(preceding_context)==0) or (not preceding_context[0]==tokenizer.bos_token_id):
                        preceding_context = [tokenizer.bos_token_id] + preceding_context
                    target_words = encoded_stimulus[dummy_var_idxs[0]+1:dummy_var_idxs[1]]
                    following_words = encoded_stimulus[dummy_var_idxs[1]+1:]

                    if "surprisal" in metric_list:
                        get_surprisal_causal_mask(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,include_following_context,stimulus)
            except:
                print("Problem with stimulus on line {0}: {1}\n".format(str(j+1),stimulus_list[j]))

def get_surprisal_causal_mask(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,include_following_context,stimulus):
    filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".causal_mask.output"
    current_context = copy.deepcopy(preceding_context)
    all_probabilities = []
    for i in range(len(target_words)):
        current_target = target_words[i]
        context_plus_mask = current_context + [tokenizer.mask_token_id]
        if include_following_context==True:
            context_plus_mask = context_plus_mask + following_words
        model_input_list = context_plus_mask+[tokenizer.eos_token_id]
        mask_idx = model_input_list.index(tokenizer.mask_token_id)
        input = torch.LongTensor([model_input_list]).to(model.device)
        with torch.no_grad():
            next_token_logits = model(input, return_dict=True).logits[:, mask_idx, :]
        probs = F.softmax(next_token_logits,dim=-1)
        probability = probs[0,current_target]
        current_context.append(current_target)
        all_probabilities.append(probability.item())
    all_probabilities = np.array(all_probabilities)
    num_tokens = len(all_probabilities)
    sum_surprisal = np.sum(-np.log2(all_probabilities))
    sentence = tokenizer.decode(preceding_context[1:]+target_words)
    full_sentence = tokenizer.decode(preceding_context[1:]+target_words+following_words)
    target_string = tokenizer.decode(target_words)
    with open(filename,"a") as f:
        f.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
            stimulus.replace("*",""),
            sentence,
            target_string,
            sum_surprisal,
            num_tokens
        ))


def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parse_args()
    output_directory,primary_decoder,include_following_context,model_list,metric_list,stimulus_file_list,cpu = process_args(args)
    create_and_run_models(model_list,stimulus_file_list,metric_list,primary_decoder,output_directory,include_following_context,cpu)

if __name__ == "__main__":
    main()

Deepseek R1

In [4]:
import os
import argparse
from transformers import AutoTokenizer,AutoModelForCausalLM,AutoModelForMaskedLM
from torch.nn import functional as F
import torch
import numpy as np
import copy



def parse_args():
    parser = argparse.ArgumentParser(description='Calculates surprisal and other metrics...')

    parser.add_argument('--stimuli', '-i', type=str, default="/content/drive/MyDrive/PsychFormers/interpreted_stimuli.txt",
                        help='stimuli to test')
    parser.add_argument('--stimuli_list', '-ii', type=str, default=None,
                        help='path to file containing list of stimulus files to test')
    parser.add_argument('--output_directory', '-o', type=str, default='/content/drive/MyDrive/PsychFormers/interpreted stimuli output',
                        help='output directory')
    parser.add_argument('--primary_decoder', '-d', type=str, default='causal',
                        help='for models with both masked and causal versions, determine which to use (default is masked)')
    parser.add_argument('--model', '-m', type=str, default="IDEA-CCNL/Wenzhong-GPT2-3.5B",
                        help='select a model to use')
    parser.add_argument('--model_list', '-mm', type=str, default=None,
                        help='path to file with a list of models to run')
    parser.add_argument('--task', '-t', type=str, default="surprisal",
                        help='metric to caclulate')
    parser.add_argument('--task_list', '-tt', type=str, default=None,
                        help='path to file with list of metrics to caclulate')
    parser.add_argument('--following_context', '-f', action="store_true", default=False,
                        help='whether or not consider the following context with masked language models (default is False)')
    parser.add_argument('--use_cpu', '-cpu', action="store_true", default=True,
                        help='use CPU for models even if CUDA is available')

    # Create an argparse.Namespace object with the desired arguments
    args = argparse.Namespace(
        stimuli="/content/drive/MyDrive/PsychFormers/interpreted_stimuli.txt",
        stimuli_list=None,  # Or provide a path if needed
        output_directory='/content/drive/MyDrive/PsychFormers/interpreted stimuli output',
        primary_decoder="causal",
        model="IDEA-CCNL/Wenzhong-GPT2-3.5B",
        model_list=None,  # Or provide a path if needed
        task="surprisal",
        task_list=None,  # Or provide a path if needed
        following_context=False,
        use_cpu=True
    )

    return args

def process_args(args):
    try:
        output_directory = args.output_directory
    except:
        print("Error: Please specify a valid output directory.")

    if not os.path.exists(output_directory):
        try:
            os.makedirs(output_directory)
        except:
            print("Error: Cannot create output directory (Note: output directory does not already exist).")

    try:
        primary_decoder = args.primary_decoder
        assert primary_decoder=="causal" or primary_decoder=="masked"
    except:
        print("Error: Please select either 'causal' or 'masked' for primary decoder argument.")

    try:
        include_following_context = args.following_context
        assert type(include_following_context)==bool
    except:
        print("Error: 'following_context' argument must be Boolean.")

    try:
        cpu = args.use_cpu
        assert type(cpu)==bool
    except:
        print("Error: 'use_cpu' argument must be Boolean.")

    if args.model_list:
        try:
            assert os.path.exists(args.model_list)
            with open(args.model_list, "r") as f:
                model_list = f.read().splitlines()
        except:
            print("Error: 'model_list' argument does not have a valid path. Trying to use individual specified model.")
            try:
                assert args.model
                model_list = [args.model]
            except:
                print("Error: No model specified")
    else:
        try:
            assert args.model
            model_list = [args.model]
        except:
            print("Error: No model specified")



    if args.task_list:
        try:
            assert os.path.exists(args.task_list)
            with open(args.task_list, "r") as f:
                metric_list = f.read().splitlines()
        except:
            print("Error: 'metric_list' argument does not have a valid path. Trying to use individual specified metric.")
            try:
                assert args.task
                metric_list = [args.task]
            except:
                print("Error: No metric specified")
    else:
        try:
            assert args.task
            metric_list = [args.task]
        except:
            print("Error: No metric specified")


    if args.stimuli_list:
        try:
            assert os.path.exists(args.stimuli_list)
            with open(args.stimuli_list, "r") as f:
                stimulus_file_list = f.read().splitlines()
        except:
            print("Error: 'stimuli_list' argument does not have a valid path. Trying to use individual stimulus set.")
            try:
                assert args.stimuli
                stimulus_file_list = [args.stimuli]
            except:
                print("Error: No stimuli specified")
    else:
        try:
            assert args.stimuli
            stimulus_file_list = [args.stimuli]
        except:
            print("Error: No stimuli specified")

    return(output_directory,primary_decoder,include_following_context,model_list,metric_list,stimulus_file_list,cpu)

def create_and_run_models(model_list,stimulus_file_list,metric_list,primary_decoder,output_directory,include_following_context,cpu):
    if primary_decoder == "masked":
        for model_name in model_list:

            model_name_cleaned = model_name.replace("/","-")

            if 'tokenizer' in locals():
                del(tokenizer)

            if 'model' in locals():
                del(model)

            try:
                tokenizer = AutoTokenizer.from_pretrained(model_name)

                if (not tokenizer.bos_token) and (tokenizer.cls_token):
                    tokenizer.bos_token = tokenizer.cls_token
                if (not tokenizer.eos_token) and (tokenizer.sep_token):
                    tokenizer.eos_token = tokenizer.sep_token

                tokenizer.add_tokens(["[!StimulusMarker!]"," [!StimulusMarker!]"])

            except:
                print("Cannot create a tokenizer for model {0}".format(model_name))

            try:
                model = AutoModelForMaskedLM.from_pretrained(model_name)
                model_type = "masked"
            except:
                try:
                    model = AutoModelForCausalLM.from_pretrained(model_name,is_decoder=True)
                    model_type = "causal"
                except:
                    print("Model {0} is not a masked or causal language model. This is not supported".format(model_name))
            try:
                assert model and tokenizer
                if model and tokenizer:
                    try:
                        if model_type=="causal":
                            process_stims_causal(model.to("cuda" if (torch.cuda.is_available() and not cpu) else "cpu"),tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context)
                        elif model_type=="masked":
                            process_stims_masked(model.to("cuda" if (torch.cuda.is_available() and not cpu) else "cpu"),tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context)
                    except:
                        print("Cannot run either a masked or causal form of {0}".format(model_name))
            except:
                print("Cannot run experiment without both a tokenizer for and a causal or masked form of {0}".format(model_name))

    elif primary_decoder == "causal":
        for model_name in model_list:

            model_name_cleaned = model_name.replace("/","-")

            if 'tokenizer' in locals():
                del(tokenizer)

            if 'model' in locals():
                del(model)

            try:
                tokenizer = AutoTokenizer.from_pretrained(model_name)

                if (not tokenizer.bos_token) and (tokenizer.cls_token):
                    tokenizer.bos_token = tokenizer.cls_token
                if (not tokenizer.eos_token) and (tokenizer.sep_token):
                    tokenizer.eos_token = tokenizer.sep_token

                tokenizer.add_tokens(["[!StimulusMarker!]"," [!StimulusMarker!]"])

            except:
                print("Cannot create a tokenizer for model {0}".format(model_name))

            try:
                model = AutoModelForCausalLM.from_pretrained(model_name,is_decoder=True)
                model_type = "causal"
                if "Masked" in model.config.architectures[0]:
                    model_type = "causal_mask"
            except:
                try:
                    model = AutoModelForMaskedLM.from_pretrained(model_name)
                    model_type = "masked"
                except:
                    print("Model {0} is not a causal or masked language model. This is not supported".format(model_name))
            try:
                assert model and tokenizer
                if model and tokenizer:
                    try:
                        if model_type=="causal":
                            process_stims_causal(model.to("cuda" if (torch.cuda.is_available() and not cpu) else "cpu"),tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context)
                        elif model_type=="masked":
                            process_stims_masked(model.to("cuda" if (torch.cuda.is_available() and not cpu) else "cpu"),tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context)
                        elif model_type=="causal_mask":
                            process_stims_causal_mask(model.to("cuda" if (torch.cuda.is_available() and not cpu) else "cpu"),tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context)

                    except:
                        print("Cannot run either a causal or masked form of {0}".format(model_name))
            except:
                print("Cannot run experiment without both a tokenizer for and a causal or masked form of {0}".format(model_name))


def process_stims_causal(model,tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context):
    for i in range(len(stimulus_file_list)):
        stimuli_name = stimulus_file_list[i].split('/')[-1].split('.')[0]

        if "surprisal" in metric_list:
            filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".causal.output"
            with open(filename,"w") as f:
                f.write("FullSentence\tSentence\tTargetWords\tSurprisal\tNumTokens\n")

        with open(stimulus_file_list[i],'r') as f:
            stimulus_list = f.read().splitlines()
        for j in range(len(stimulus_list)):
            try:
                stimulus = stimulus_list[j]
                stimulus_spaces = stimulus.replace("*", "[!StimulusMarker!]")
                stimulus_spaces = stimulus_spaces.replace(" [!StimulusMarker!]", "[!StimulusMarker!] ")
                encoded_stimulus = tokenizer.encode(stimulus_spaces)

                if (len(tokenizer.tokenize("a[!StimulusMarker!]"))==2):
                    dummy_var_idxs = np.where((np.array(encoded_stimulus)==tokenizer.encode("[!StimulusMarker!]")[-1]) | (np.array(encoded_stimulus)==tokenizer.encode("a[!StimulusMarker!]")[-1]))[0]
                    preceding_context = encoded_stimulus[:dummy_var_idxs[0]]
                    if (len(preceding_context)==0) or (not ((preceding_context[0]==tokenizer.bos_token_id) or (preceding_context[0]==tokenizer.eos_token_id))):
                        preceding_context = [tokenizer.bos_token_id] + preceding_context
                    target_words = encoded_stimulus[dummy_var_idxs[0]+1:dummy_var_idxs[1]]
                    following_words = encoded_stimulus[dummy_var_idxs[1]+1:]

                    if "[!StimulusMarker!] " in stimulus_spaces and tokenizer.decode(target_words)[0]!=" ":
                        target_words_decoded = " " +tokenizer.decode(target_words)
                        target_words = tokenizer.encode(target_words_decoded)
                        if tokenizer.bos_token_id  in target_words:
                            target_words.remove(tokenizer.bos_token_id)
                        if tokenizer.eos_token_id  in target_words:
                            target_words.remove(tokenizer.eos_token_id)

                    if "surprisal" in metric_list:
                        get_surprisal_causal(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,stimulus)
            except:
                print("Problem with stimulus on line {0}: {1}\n".format(str(j+1),stimulus_list[j]))


def get_surprisal_causal(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,stimulus):
    filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".causal.output"
    current_context = copy.deepcopy(preceding_context)
    all_probabilities = []
    for i in range(len(target_words)):
        current_target = target_words[i]
        input = torch.LongTensor([current_context]).to(model.device)
        with torch.no_grad():
            next_token_logits = model(input, return_dict=True).logits[:, -1, :]
        probs = F.softmax(next_token_logits,dim=-1)
        probability = probs[0,current_target]
        current_context.append(current_target)
        all_probabilities.append(probability.item())
    all_probabilities = np.array(all_probabilities)
    num_tokens = len(all_probabilities)
    sum_surprisal = np.sum(-np.log2(all_probabilities))
    sentence = tokenizer.decode(preceding_context[1:]+target_words)
    full_sentence = tokenizer.decode(preceding_context[1:]+target_words+following_words)
    target_string = tokenizer.decode(target_words)
    with open(filename,"a") as f:
        f.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
            stimulus.replace("*",""),
            sentence,
            target_string,
            sum_surprisal,
            num_tokens
        ))


def process_stims_masked(model,tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context):
    for i in range(len(stimulus_file_list)):
        stimuli_name = stimulus_file_list[i].split('/')[-1].split('.')[0]

        if "surprisal" in metric_list:
            filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".masked.output"
            with open(filename,"w") as f:
                f.write("FullSentence\tSentence\tTargetWords\tSurprisal\tNumTokens\n")

        with open(stimulus_file_list[i],'r') as f:
            stimulus_list = f.read().splitlines()
        for j in range(len(stimulus_list)):
            try:
                stimulus = stimulus_list[j]
                stimulus_spaces = stimulus.replace("*", "[!StimulusMarker!]")
                if (tokenizer.tokenize(" a")[0][0]==tokenizer.tokenize(" b")[0][0]) and (tokenizer.tokenize("a")[0][0]!=tokenizer.tokenize("b")[0][0]):
                    stimulus_spaces = stimulus_spaces.replace(" [!StimulusMarker!]", "[!StimulusMarker!] ")
                else:
                    stimulus_spaces = stimulus_spaces.replace("[!StimulusMarker!]", "[!StimulusMarker!] ")
                    stimulus_spaces = stimulus_spaces.replace(" [!StimulusMarker!]", "[!StimulusMarker!]")
                encoded_stimulus = tokenizer.encode(stimulus_spaces)[1:-1]

                if (len(tokenizer.tokenize("a[!StimulusMarker!]"))==2):
                    dummy_var_idxs = np.where((np.array(encoded_stimulus)==tokenizer.encode("[!StimulusMarker!]")[-2]) | (np.array(encoded_stimulus)==tokenizer.encode("a[!StimulusMarker!]")[-2]))[0]
                    preceding_context = encoded_stimulus[:dummy_var_idxs[0]]
                    if (len(preceding_context)==0) or (not preceding_context[0]==tokenizer.bos_token_id):
                        preceding_context = [tokenizer.bos_token_id] + preceding_context
                    target_words = encoded_stimulus[dummy_var_idxs[0]+1:dummy_var_idxs[1]]
                    following_words = encoded_stimulus[dummy_var_idxs[1]+1:]
                    if "surprisal" in metric_list:
                        get_surprisal_masked(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,include_following_context,stimulus)
            except:
                print("Problem with stimulus on line {0}: {1}\n".format(str(j+1),stimulus_list[j]))

def get_surprisal_masked(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,include_following_context,stimulus):
    filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".masked.output"
    current_context = copy.deepcopy(preceding_context)
    all_probabilities = []
    for i in range(len(target_words)):
        current_target = target_words[i]
        context_plus_mask = current_context + [tokenizer.mask_token_id]
        if include_following_context==True:
            context_plus_mask = context_plus_mask + following_words
        model_input_list = context_plus_mask+[tokenizer.eos_token_id]
        mask_idx = model_input_list.index(tokenizer.mask_token_id)
        input = torch.LongTensor([model_input_list]).to(model.device)
        with torch.no_grad():
            next_token_logits = model(input, return_dict=True).logits[:, mask_idx, :]
        probs = F.softmax(next_token_logits,dim=-1)
        probability = probs[0,current_target]
        current_context.append(current_target)
        all_probabilities.append(probability.item())
    all_probabilities = np.array(all_probabilities)
    num_tokens = len(all_probabilities)
    sum_surprisal = np.sum(-np.log2(all_probabilities))
    sentence = tokenizer.decode(preceding_context[1:]+target_words)
    full_sentence = tokenizer.decode(preceding_context[1:]+target_words+following_words)
    target_string = tokenizer.decode(target_words)
    with open(filename,"a") as f:
        f.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
            stimulus.replace("*",""),
            sentence,
            target_string,
            sum_surprisal,
            num_tokens
        ))

def process_stims_causal_mask(model,tokenizer,stimulus_file_list,metric_list,model_name_cleaned,output_directory,include_following_context):
    for i in range(len(stimulus_file_list)):
        stimuli_name = stimulus_file_list[i].split('/')[-1].split('.')[0]

        if "surprisal" in metric_list:
            filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".causal_mask.output"
            with open(filename,"w") as f:
                f.write("FullSentence\tSentence\tTargetWords\tSurprisal\tNumTokens\n")

        with open(stimulus_file_list[i],'r') as f:
            stimulus_list = f.read().splitlines()
        for j in range(len(stimulus_list)):
            try:
                stimulus = stimulus_list[j]
                stimulus_spaces = stimulus.replace("*", "[!StimulusMarker!]")
                if (tokenizer.tokenize(" a")[0][0]==tokenizer.tokenize(" b")[0][0]) and (tokenizer.tokenize("a")[0][0]!=tokenizer.tokenize("b")[0][0]):
                    stimulus_spaces = stimulus_spaces.replace(" [!StimulusMarker!]", "[!StimulusMarker!] ")
                else:
                    stimulus_spaces = stimulus_spaces.replace("[!StimulusMarker!]", "[!StimulusMarker!] ")
                    stimulus_spaces = stimulus_spaces.replace(" [!StimulusMarker!]", "[!StimulusMarker!]")
                encoded_stimulus = tokenizer.encode(stimulus_spaces)[1:-1]

                if (len(tokenizer.tokenize("a[!StimulusMarker!]"))==2):
                    dummy_var_idxs = np.where((np.array(encoded_stimulus)==tokenizer.encode("[!StimulusMarker!]")[-2]) | (np.array(encoded_stimulus)==tokenizer.encode("a[!StimulusMarker!]")[-2]))[0]
                    preceding_context = encoded_stimulus[:dummy_var_idxs[0]]
                    if (len(preceding_context)==0) or (not preceding_context[0]==tokenizer.bos_token_id):
                        preceding_context = [tokenizer.bos_token_id] + preceding_context
                    target_words = encoded_stimulus[dummy_var_idxs[0]+1:dummy_var_idxs[1]]
                    following_words = encoded_stimulus[dummy_var_idxs[1]+1:]

                    if "surprisal" in metric_list:
                        get_surprisal_causal_mask(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,include_following_context,stimulus)
            except:
                print("Problem with stimulus on line {0}: {1}\n".format(str(j+1),stimulus_list[j]))

def get_surprisal_causal_mask(model,tokenizer,preceding_context,following_words,target_words,stimuli_name,model_name_cleaned,output_directory,include_following_context,stimulus):
    filename = output_directory + "/" + stimuli_name + "." + "surprisal" + "." + model_name_cleaned + ".causal_mask.output"
    current_context = copy.deepcopy(preceding_context)
    all_probabilities = []
    for i in range(len(target_words)):
        current_target = target_words[i]
        context_plus_mask = current_context + [tokenizer.mask_token_id]
        if include_following_context==True:
            context_plus_mask = context_plus_mask + following_words
        model_input_list = context_plus_mask+[tokenizer.eos_token_id]
        mask_idx = model_input_list.index(tokenizer.mask_token_id)
        input = torch.LongTensor([model_input_list]).to(model.device)
        with torch.no_grad():
            next_token_logits = model(input, return_dict=True).logits[:, mask_idx, :]
        probs = F.softmax(next_token_logits,dim=-1)
        probability = probs[0,current_target]
        current_context.append(current_target)
        all_probabilities.append(probability.item())
    all_probabilities = np.array(all_probabilities)
    num_tokens = len(all_probabilities)
    sum_surprisal = np.sum(-np.log2(all_probabilities))
    sentence = tokenizer.decode(preceding_context[1:]+target_words)
    full_sentence = tokenizer.decode(preceding_context[1:]+target_words+following_words)
    target_string = tokenizer.decode(target_words)
    with open(filename,"a") as f:
        f.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
            stimulus.replace("*",""),
            sentence,
            target_string,
            sum_surprisal,
            num_tokens
        ))


def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parse_args()
    output_directory,primary_decoder,include_following_context,model_list,metric_list,stimulus_file_list,cpu = process_args(args)
    create_and_run_models(model_list,stimulus_file_list,metric_list,primary_decoder,output_directory,include_following_context,cpu)

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/739 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

English Version:
text marking with **

In [3]:
import re

def mark_and_split_sentences_in_paragraph(paragraph, output_filename="marked_sentences.txt"):
    """Marks each word in each sentence of a paragraph with "**" and writes to a file.

    Args:
        paragraph: The input paragraph (string).
        output_filename: The name of the output text file.
    """

    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s\。\；\;', paragraph) # Robust sentence splitting
    all_marked_sentences = []

    for sentence in sentences:
        words = re.findall(r'\b\w+\b', sentence)
        modified_sentences = []

        for i in range(len(words)):
            new_sentence = ""
            word_index = 0
            for word in re.split(r'\b(\w+)\b', sentence):
                if word.isalnum():
                    if word_index == i:
                        new_sentence += f"*{word}*"
                    else:
                        new_sentence += word
                    word_index += 1
                else:
                    new_sentence += word
            modified_sentences.append(new_sentence)

        all_marked_sentences.extend(modified_sentences) # Add all variations of the current sentence

    output_text = "\n".join(all_marked_sentences)

    try:
        with open(output_filename, "w", encoding="utf-8") as f:
            f.write(output_text)
        print(f"Marked sentences written to {output_filename}")
    except Exception as e:
        print(f"An error occurred: {e}")




# Example usage:
paragraph = """大会主席先生，各位阁下，女士们、先生们，我们的世界正处于旋涡之中。我们正处在一个史诗般的变革时代，面临着我们从未见过的挑战，这些挑战需要全球性的解决方案。然而，地缘政治分歧不断加深。 地球持续升温。战争愈演愈烈，却不知如何收场。核姿态和新武器投下了阴影。我们正逐步走向难以想象的境地，一个有可能吞噬世界的火药桶。与此同时，2024年全球一半人会参加投票，而全人类都会受到影响。身临这一旋涡，面对在座诸位，我坚信两个最重要的事实。第一，当今世界的现状是不可持续的。我们不能再这样下去了。第二，我们面临的挑战是可以解决的。但这需要我们确保国际问题解决机制能够真正解决问题。未来峰会是第一步，但我们还有很长的路要走。要实现目标，必须正视造成不可持续性的三大因素。
一个有罪不罚的世界--违法和侵权行为威胁着国际法和《联合国宪章》的根基；一个不平等的世界--不公正和不满情绪有可能削弱国家，甚至将其推向边缘；一个充满不确定性的世界—不加管理的全球风险以不可知的方式威胁着我们的未来。"""

mark_and_split_sentences_in_paragraph(paragraph, "test_output.txt")


Marked sentences written to test_output.txt


Chinese Version of marking **

In [3]:
import jieba
import re


def segment_and_mark_chinese_text(text, output_file):
    """
    Segments Chinese text, marks each word with "**", and writes each marked word
    on a new line with the original sentence surrounding it.  Takes text as input.

    Args:
        text: The Chinese text to process (string).
        output_file: Path to the output file.
    """

    try:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            # Split the input text into sentences or paragraphs based on common Chinese punctuation.
            # You can customize this splitting if needed.
           sentences = re.split(r"[。？！；：.?!;]+", text)
           for sentence in sentences:
                sentence = sentence.strip()  # Remove extra whitespace
                if sentence:
                    words = jieba.lcut(sentence)
                    for word in words:
                        marked_word = f"*{word}*"
                        outfile.write(f"{sentence.replace(word, marked_word)}\n")


    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    chinese_text = """
    大会主席,诸位阁下,女士们先生们.我们的世界正处于旋风之中.我们现在处于巨大的变革的时代,面对的是前所未有的挑战,这些挑战要求有全球的解决方案.但是地缘政治的分歧不断加深.我们的星球不断升温.战争肆虐,没有人知道战争会如何终结.而核态势和新武器投下了黑暗的阴影.我们逐渐走向不可想象的结局，也就是会吞噬整个世界的火药桶与此同时,在2024年，全球一半的人类会进行大选,而大选的结果会影响所有的人.我今天站在这里，在这个旋风之中，相信有两个高于一切的真像.首先,我们世界的状态是不可持续的.我们不能够再一直这样下去.第二点,我们面对的挑战是可以解决的.这就要求我们要保证确保国际解决问题的机制能够实实在在地解决问题.未来峰会就是第一步,当然我们还有很长的路要走.而到达我们的终点,要求面对三个主要的，不可持续的驱动因素。首先有一个有罪不罚的世界，就是各种违法和侵权的行为威胁了国际法和《联合国宪章》的基石;还有一个不平等的世界--也就是不正义和布满威胁，伤害各国,会让他们陷入绝境。另外还有一个不确定的世界，也就是未经管理的全球威胁会威胁我们的未来，以不可想象的方式来威胁着我们的未来.
    """  # Paste your Chinese text here

    output_filename = "output.txt"

    segment_and_mark_chinese_text(chinese_text, output_filename)
    print(f"Segmentation and marking complete. Output written to '{output_filename}'")




Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.579 seconds.
DEBUG:jieba:Loading model cost 0.579 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


Segmentation and marking complete. Output written to 'output.txt'


ANOVA

Prompt: allow me to elaborate. i have two sets of data. dataset 1 is on column A-D. column A is the group number, B is the data for the group called "original", C is the data for "tsl", and D is for the group "int". Similarly, the dataset 2 is on column F-I, with F being the group number, G the data for "original", H for "tsl", I for "int". What i want to do is conduct an ANOVA using python to compare B-C-D individually, and compare G-H-I individually.

In [6]:
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd


def perform_anova_and_posthoc(data, dependent_variable, group_variable):
    """Performs ANOVA and post-hoc test using scipy."""
    try:
        # Perform ANOVA using scipy.stats.f_oneway
        groups = data[group_variable].unique()
        args = [data[data[group_variable] == g][dependent_variable] for g in groups]
        fvalue, pvalue = stats.f_oneway(*args)

        # Create ANOVA table (simplified)
        anova_table = pd.DataFrame({
            'Source': [group_variable, 'Residual'],
            'F': [fvalue, np.nan],  # F-value for the group effect
            'PR(>F)': [pvalue, np.nan]  # p-value for the group effect
        }).set_index('Source')

        group_means = data.groupby(group_variable)[dependent_variable].mean()

        alpha = 0.05
        if pvalue < alpha:  # Check p-value for significance
            tukey = pairwise_tukeyhsd(data[dependent_variable], data[group_variable], alpha=alpha)
            return anova_table, tukey, group_means
        else:
            return anova_table, None, group_means

    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None, None


# Load your Excel data (replace 'your_file.xlsx' with your file name)
excel_file = pd.ExcelFile('/content/drive/MyDrive/PsychFormers/ANOVA/20250202ANOVA分析.xlsx')

# Load Dataset 1
df1 = excel_file.parse('Sheet1')  # Replace 'Sheet1' with your sheet name if needed
df1 = df1.rename(columns={'A': 'Group', 'B': 'Original', 'C': 'TSL', 'D': 'INT'})  # Rename columns for clarity

# Load Dataset 2
df2 = excel_file.parse('Sheet1')  # Replace 'Sheet1' with your sheet name if needed
df2 = df2.rename(columns={'F': 'Group', 'G': 'Original', 'H': 'TSL', 'I': 'INT'})  # Rename columns

# Melt DataFrames to long format for ANOVA
df1_long = pd.melt(df1, id_vars=['Group'], value_vars=['Original', 'TSL', 'INT'], var_name='Treatment', value_name='Value')
df2_long = pd.melt(df2, id_vars=['Group'], value_vars=['Original', 'TSL', 'INT'], var_name='Treatment', value_name='Value')

# Perform ANOVA and post-hoc tests for Dataset 1
print("Dataset 1 Analysis:")
anova1, tukey1, means1 = perform_anova_and_posthoc(df1_long, 'Value', 'Treatment')
if anova1 is not None:
    print("ANOVA Table:")
    print(anova1)
    print("\nGroup Means:")
    print(means1)
    if tukey1 is not None:
        print("\nTukey's HSD Post-Hoc Test:")
        print(tukey1)
    else:
        print("\nANOVA was not significant, so Tukey's test was not performed.")

# Perform ANOVA and post-hoc tests for Dataset 2
print("\nDataset 2 Analysis:")
anova2, tukey2, means2 = perform_anova_and_posthoc(df2_long, 'Value', 'Treatment')
if anova2 is not None:
    print("ANOVA Table:")
    print(anova2)
    print("\nGroup Means:")
    print(means2)

    if tukey2 is not None:
        print("\nTukey's HSD Post-Hoc Test:")
        print(tukey2)
    else:
        print("\nANOVA was not significant, so Tukey's test was not performed.")

Dataset 1 Analysis:
ANOVA Table:
                  F   PR(>F)
Source                      
Treatment  1.105376  0.33846
Residual        NaN      NaN

Group Means:
Treatment
INT         101.589083
Original     86.930012
TSL          80.691654
Name: Value, dtype: float64

ANOVA was not significant, so Tukey's test was not performed.

Dataset 2 Analysis:
ANOVA Table:
                  F   PR(>F)
Source                      
Treatment  1.105376  0.33846
Residual        NaN      NaN

Group Means:
Treatment
INT         101.589083
Original     86.930012
TSL          80.691654
Name: Value, dtype: float64

ANOVA was not significant, so Tukey's test was not performed.
