In [17]:
import os

import numpy as np
import torch
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Record which device type PyTorch can use: "cuda" when CUDA is available,
# otherwise "cpu". The choice is printed so it shows up in the cell output.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Hugging Face Hub identifier of the checkpoint used throughout the notebook.
model_name = "Qwen/Qwen2.5-7B-Instruct-1M"

# Read the access token from the HF_TOKEN environment variable rather than
# pasting it into the notebook: a token typed here would be saved with the file.
# An unset or empty variable gives None; the loaders are then called with
# token=None and use whatever default credential lookup the library provides.
hf_token = os.environ.get("HF_TOKEN") or None

In [None]:
# Load the tokenizer and the causal-LM weights for `model_name`; both calls
# receive the same access token.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # let the loader decide where the weights are placed
    torch_dtype=torch.bfloat16,  # bfloat16 weights for memory efficiency
    token=hf_token,
)

In [24]:
def format_qwen_instruct_prompt(user_message):
    """
    Wrap a user message in the plain-text prompt layout used by this notebook.

    The returned string is "Instruction: <user_message>", a newline, and a
    trailing "Answer:" cue.

    NOTE(review): this is a hand-written layout, not the output of the
    tokenizer's chat template, and several prompt templates in this notebook
    already end in "Answer:", so the cue appears twice in those prompts --
    confirm both are intended.
    """
    wrapped_prompt = f"Instruction: {user_message}\nAnswer:"
    return wrapped_prompt

In [25]:
# Last-token activations of every layer, keyed by (id(model), prompt).
# Re-running this cell resets the cache; do that after loading a different
# model or tokenizer.
_last_token_cache = {}


def get_last_token_activations(prompt, layer_index):
    """
    Return the last-token hidden state of `prompt` at one layer.

    The model is run once per distinct prompt with output_hidden_states=True;
    the last-token vector of every entry in `outputs.hidden_states` is kept in
    a module-level cache, so asking for another layer of the same prompt does
    not trigger another forward pass. The cache holds one float32 vector per
    hidden-state entry per prompt, so its memory use grows with the number of
    distinct prompts.

    Args:
        prompt: Text passed to the module-level `tokenizer`.
        layer_index: Index into the model's `hidden_states` tuple; negative
            indices and out-of-range errors behave as for any sequence.

    Returns:
        A float32 NumPy array with the selected layer's last-token
        activations. A copy is returned, so callers may modify it freely.
    """
    cache_key = (id(model), prompt)
    per_layer = _last_token_cache.get(cache_key)
    if per_layer is None:
        with torch.no_grad():
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            outputs = model(**inputs, output_hidden_states=True)
        # Move each tensor to the CPU first, then cast to float32: the model
        # runs in bfloat16, which cannot be converted to NumPy directly.
        per_layer = [
            layer_states[:, -1, :].squeeze().cpu().float().numpy()
            for layer_states in outputs.hidden_states
        ]
        _last_token_cache[cache_key] = per_layer
    return per_layer[layer_index].copy()


In [26]:
def analyze_specific_layer(prompt_template, statement_pairs, layer_index):
    """
    Derive a one-dimensional "reading vector" for a concept at one layer.

    For every (statement_1, statement_2) pair, both statements are inserted
    into `prompt_template`, wrapped by format_qwen_instruct_prompt, and passed
    to get_last_token_activations at `layer_index`. A one-component PCA is
    fitted on the stacked differences (statement_1 minus statement_2) and its
    first component is returned.

    The sign of a principal component is arbitrary, so the component is
    flipped when needed to have a non-negative dot product with the mean
    difference vector. The returned direction therefore points from the
    statement_2 side towards the statement_1 side on average, which makes the
    sign of a cosine similarity between two reading vectors interpretable.

    Args:
        prompt_template: Format string with one `{}` placeholder.
        statement_pairs: Iterable of (statement_1, statement_2) tuples.
        layer_index: Layer index forwarded to get_last_token_activations.

    Returns:
        1-D NumPy array: the oriented first principal component.

    Raises:
        ValueError: If `statement_pairs` yields no pairs.
    """
    difference_vectors = []
    for statement_1, statement_2 in statement_pairs:
        prompt_1 = format_qwen_instruct_prompt(prompt_template.format(statement_1))
        prompt_2 = format_qwen_instruct_prompt(prompt_template.format(statement_2))

        activations_1 = get_last_token_activations(prompt_1, layer_index)
        activations_2 = get_last_token_activations(prompt_2, layer_index)
        difference_vectors.append(activations_1 - activations_2)

    if not difference_vectors:
        # np.vstack would also raise ValueError here, but with a message that
        # does not point at the real cause.
        raise ValueError("statement_pairs must contain at least one pair")

    difference_matrix = np.vstack(difference_vectors)

    pca = PCA(n_components=1)
    pca.fit(difference_matrix)
    reading_vector = pca.components_[0]

    # Resolve the sign ambiguity of the component (see docstring).
    mean_difference = difference_matrix.mean(axis=0)
    if np.dot(reading_vector, mean_difference) < 0:
        reading_vector = -reading_vector

    return reading_vector

In [27]:
# 1. Truthfulness vs. Falsity
# Each tuple is (true statement, false statement). The order matters: the
# analysis subtracts the second statement's activations from the first's.
# NOTE(review): a few pairs depend on convention or date (primary colours under
# RYB vs. RGB, the most populous country) -- confirm they are acceptable.
truth_pairs = [
    ("The capital of France is Paris", "The capital of France is Berlin"),
    ("The sun rises in the east", "The sun rises in the west"),
    ("Water is composed of H2O", "Water is composed of CO2"),
    ("The author of Hamlet is Shakespeare", "The author of Hamlet is Dickens"),
    ("The earth revolves around the sun", "The sun revolves around the earth"),
    ("A square has four equal sides", "A square has five equal sides"),
    ("The Pacific Ocean is the largest ocean", "The Atlantic Ocean is the largest ocean"),
    ("Mount Everest is the tallest mountain", "Mount Fuji is the tallest mountain"),
    ("The human body has 206 bones", "The human body has 306 bones"),
    ("The speed of light is approximately 299,792 km/s", "The speed of light is approximately 150,000 km/s"),
    # True claim first: the wall is not visible to the naked eye from space.
    ("The Great Wall of China is not visible from space with the naked eye", "The Great Wall of China is visible from space with the naked eye"),
    ("Penguins are birds that cannot fly", "Penguins are mammals that cannot fly"),
    ("The currency of Japan is the Yen", "The currency of Japan is the Yuan"),
    # True claim first: the Nile is the longest river by the conventional ranking.
    ("The Nile River is the longest river in the world", "The Amazon River is the longest river in the world"),
    ("The first person on the moon was Neil Armstrong", "The first person on the moon was Buzz Aldrin"),
    ("Spiders have eight legs", "Spiders have six legs"),
    ("The boiling point of water at sea level is 100°C", "The boiling point of water at sea level is 90°C"),
    ("The chemical symbol for gold is Au", "The chemical symbol for gold is Ag"),
    ("The planet Mars is known as the Red Planet", "The planet Venus is known as the Red Planet"),
    ("The mitochondria is the powerhouse of the cell", "The nucleus is the powerhouse of the cell"),
    ("The United States has 50 states", "The United States has 51 states"),
    ("A decade is a period of 10 years", "A decade is a period of 100 years"),
    ("The Statue of Liberty was a gift from France", "The Statue of Liberty was a gift from Spain"),
    ("The primary colors are red, yellow, and blue", "The primary colors are red, green, and blue"),
    ("The Cold War ended in 1991", "The Cold War ended in 1981"),
    ("The human heart has four chambers", "The human heart has three chambers"),
    ("The Sahara is the largest hot desert in the world", "The Gobi is the largest hot desert in the world"),
    ("The element with the atomic number 1 is Hydrogen", "The element with the atomic number 1 is Helium"),
    ("The Renaissance was a period of cultural rebirth in Europe", "The Renaissance was a period of cultural rebirth in Asia"),
    ("The inventor of the telephone was Alexander Graham Bell", "The inventor of the telephone was Thomas Edison"),
    ("The main component of Earth's atmosphere is nitrogen", "The main component of Earth's atmosphere is oxygen"),
    ("The country with the largest population is India", "The country with the largest population is China"),
    ("The novel '1984' was written by George Orwell", "The novel '1984' was written by Aldous Huxley"),
    ("A leap year occurs every 4 years", "A leap year occurs every 5 years"),
    ("The capital of Australia is Canberra", "The capital of Australia is Sydney"),
    ("The freezing point of water is 0°C", "The freezing point of water is 32°C"),
    ("The Mona Lisa was painted by Leonardo da Vinci", "The Mona Lisa was painted by Michelangelo"),
    ("The human skeleton is made of bone", "The human skeleton is made of cartilage"),
    ("The Roman Empire fell in the 5th century AD", "The Roman Empire fell in the 10th century AD"),
    ("The official language of Brazil is Portuguese", "The official language of Brazil is Spanish"),
    ("The planet Jupiter is the largest in our solar system", "The planet Saturn is the largest in our solar system"),
    ("The formula for calculating the area of a circle is πr²", "The formula for calculating the area of a circle is 2πr"),
    ("The first World War began in 1914", "The first World War began in 1918"),
    ("The currency of the United Kingdom is the Pound Sterling", "The currency of the United Kingdom is the Euro"),
    ("The human body's largest organ is the skin", "The human body's largest organ is the liver"),
    ("The capital of Canada is Ottawa", "The capital of Canada is Toronto"),
    ("The Pacific Ocean contains the Mariana Trench", "The Atlantic Ocean contains the Mariana Trench"),
    ("The composer of 'The Four Seasons' is Vivaldi", "The composer of 'The Four Seasons' is Bach"),
]
# Each statement is substituted for the `{}` placeholder.
truth_prompt_template = "Is the following statement true or false? '{}'. Answer:"

# 2. Sentiment: Positive vs. Negative
# Each tuple is (positive text, negative text), one pair per line.
sentiment_pairs = [
    ("The movie was a masterpiece of cinema.", "The movie was a total disaster."),
    ("I felt incredibly happy after hearing the news.", "I felt incredibly sad after hearing the news."),
    ("The food at this restaurant is delicious.", "The food at this restaurant is disgusting."),
    ("She is a kind and wonderful person.", "She is a mean and awful person."),
    ("The book was an inspiring and uplifting read.", "The book was a depressing and pointless read."),
    ("That was a brilliant and insightful comment.", "That was a foolish and ignorant comment."),
    ("The weather today is beautiful and sunny.", "The weather today is gloomy and miserable."),
    ("He gave a truly excellent performance.", "He gave a truly terrible performance."),
    ("This is a fantastic opportunity.", "This is a dreadful situation."),
    ("I'm optimistic about our chances.", "I'm pessimistic about our chances."),
    ("The service was exceptionally good.", "The service was exceptionally poor."),
    ("A truly heartwarming story.", "A truly heartbreaking story."),
    ("I am deeply grateful for your help.", "I am deeply disappointed by your actions."),
    ("The concert was absolutely electrifying.", "The concert was painfully boring."),
    ("This design is elegant and modern.", "This design is clumsy and outdated."),
    ("I feel refreshed and energized.", "I feel drained and exhausted."),
    ("It was a joyful celebration.", "It was a somber occasion."),
    ("The new policy is a huge improvement.", "The new policy is a huge step backward."),
    ("His speech was powerful and persuasive.", "His speech was weak and unconvincing."),
    ("I have complete confidence in the team.", "I have no confidence in the team."),
    ("The view from the top was breathtaking.", "The view from the top was underwhelming."),
    ("A delightful aroma filled the room.", "An unpleasant odor filled the room."),
    ("The project was a resounding success.", "The project was a complete failure."),
    ("I admire her courage and integrity.", "I despise her cowardice and dishonesty."),
    ("The software is intuitive and user-friendly.", "The software is confusing and frustrating."),
    ("It was a peaceful and relaxing vacation.", "It was a stressful and chaotic vacation."),
    ("The team's morale is very high.", "The team's morale is very low."),
    ("This is a safe and welcoming community.", "This is a dangerous and hostile community."),
    ("The plot was clever and engaging.", "The plot was predictable and dull."),
    ("I'm thrilled with the results.", "I'm horrified by the results."),
    ("The garden is vibrant and full of life.", "The garden is barren and lifeless."),
    ("Her contribution was invaluable.", "Her contribution was worthless."),
    ("This is a very hopeful sign.", "This is a very discouraging sign."),
    ("The puppy was adorable and playful.", "The puppy was aggressive and destructive."),
    ("I feel secure and protected.", "I feel threatened and vulnerable."),
    ("The negotiations were productive and amicable.", "The negotiations were pointless and hostile."),
    ("This is a high-quality, durable product.", "This is a low-quality, flimsy product."),
    ("I'm excited for what's next.", "I'm dreading what's next."),
    ("The ending of the book was satisfying.", "The ending of the book was disappointing."),
    ("The instructions were clear and helpful.", "The instructions were vague and unhelpful."),
    ("It was a generous and thoughtful gift.", "It was a cheap and thoughtless gift."),
    ("The new update is a fantastic improvement.", "The new update is a terrible downgrade."),
    ("I'm proud of my accomplishments.", "I'm ashamed of my failures."),
    ("The atmosphere was lively and fun.", "The atmosphere was dull and depressing."),
    ("This is a reliable and trustworthy source.", "This is an unreliable and untrustworthy source."),
    ("The experience was truly magical.", "The experience was truly nightmarish."),
    ("I feel a great sense of belonging.", "I feel a great sense of alienation."),
]
# Each text is substituted for the `{}` placeholder.
sentiment_prompt_template = "Is the sentiment of the following text positive or negative? '{}'. Answer:"

# 3. Honesty vs. Dishonesty
# Each tuple is (honest action, dishonest action), one pair per line.
honesty_pairs = [
    ("admit you made a mistake at work", "blame a coworker for a mistake you made"),
    ("give truthful, direct feedback on a friend's project", "give false, flattering feedback on a friend's project"),
    ("confess to breaking a valuable item", "lie and say you don't know how a valuable item broke"),
    ("be honest about why you are late", "make up a fake excuse for why you are late"),
    ("tell a friend a difficult but important truth", "tell a friend a comforting lie"),
    ("accurately represent your skills on a resume", "exaggerate your skills on a resume"),
    ("disclose a conflict of interest", "hide a conflict of interest"),
    ("return a lost wallet you found", "keep a lost wallet you found"),
    ("give a customer the correct change", "give a customer incorrect change on purpose"),
    ("tell the truth under oath", "commit perjury under oath"),
    ("be transparent about your company's finances", "be opaque about your company's finances"),
    ("admit you don't know the answer", "pretend you know the answer"),
    ("take responsibility for your actions", "deflect responsibility for your actions"),
    ("report a crime you witnessed", "ignore a crime you witnessed"),
    ("give an accurate eyewitness account", "give a false eyewitness account"),
    ("uphold a promise you made", "break a promise you made"),
    ("be upfront about your intentions in a relationship", "be deceptive about your intentions in a relationship"),
    ("correct a misconception someone has, even if it's awkward", "let someone continue believing a misconception"),
    ("reveal that a product has a known defect", "conceal that a product has a known defect"),
    ("pay your fair share of taxes", "evade paying your taxes"),
    ("give credit to a colleague for their idea", "take credit for a colleague's idea"),
    ("be faithful to your partner", "be unfaithful to your partner"),
    ("present research findings accurately", "falsify research findings"),
    ("tell a child the truth about Santa Claus", "let a child continue believing in Santa Claus"),
    ("be open about your political beliefs", "be secretive about your political beliefs"),
    ("admit you forgot an important event", "pretend you remembered an important event"),
    ("provide a genuine apology", "provide a fake, insincere apology"),
    ("tell a patient their true diagnosis", "give a patient a false, more optimistic diagnosis"),
    ("accurately report your work hours", "falsify your timesheet"),
    ("acknowledge the valid points in an opponent's argument", "misrepresent an opponent's argument"),
    ("be candid about the risks of an investment", "downplay the risks of an investment"),
    ("speak truthfully about your past failures", "hide your past failures"),
    ("confront a friend about their harmful behavior", "enable a friend's harmful behavior by staying silent"),
    ("give a sincere compliment", "give a manipulative, flattering compliment"),
    ("be straightforward about your feelings", "be passive-aggressive about your feelings"),
    ("honor the terms of a contract", "violate the terms of a contract"),
    ("admit when you are wrong in an argument", "refuse to admit you are wrong in an argument"),
    ("provide a real reason for ending a relationship", "ghost someone to end a relationship"),
    ("be open about your sources as a journalist", "use anonymous, unverified sources as a journalist"),
    ("tell your boss about a problem honestly", "tell your boss everything is fine when there's a problem"),
    ("give an accurate job reference", "give an overly positive, inaccurate job reference"),
    ("be direct about your needs", "be indirect and hope others guess your needs"),
    ("tell someone they have food in their teeth", "not tell someone they have food in their teeth"),
    ("admit to your child that you made a mistake", "pretend to your child that you are always right"),
    ("reveal the full cost of a service upfront", "hide fees until the last minute"),
    ("be honest about your limitations", "pretend you have no limitations"),
    ("give a realistic assessment of a project timeline", "give an unrealistic, overly optimistic project timeline"),
    ("tell the truth even when it's unpopular", "tell people what they want to hear"),
    ("be accountable for a public statement", "deny having made a public statement"),
]
# Each action is substituted for the `{}` placeholder.
honesty_prompt_template = "Pretend you are a person who needs to {}."

# 4. Politeness vs. Impoliteness
# Each tuple is (polite phrasing, blunt or impolite phrasing), one pair per line.
politeness_pairs = [
    ("Could you please share your thoughts on this?", "What do you think?"),
    ("I'm not sure I completely agree with that point.", "That's wrong."),
    ("Would you mind if I borrowed your pen for a moment?", "Give me your pen."),
    ("I was wondering if you could help me with this.", "Help me."),
    ("Excuse me, I think you might be in my seat.", "You're in my seat."),
    ("Thank you for your input, I'll take it into consideration.", "I heard you."),
    ("I believe there might be a misunderstanding.", "You're not listening."),
    ("Could you please elaborate on that?", "That makes no sense."),
    ("I appreciate your perspective.", "I don't care what you think."),
    ("Let's agree to disagree.", "You're just being stubborn."),
    ("I'm sorry to hear you're having a difficult time.", "That sounds like a you problem."),
    ("Would it be possible to get an extension?", "I need more time."),
    ("I'm afraid I have a different opinion.", "Your opinion is stupid."),
    ("Thank you for your time.", "You can go now."),
    ("I'd be happy to help with that.", "That's not my job."),
    ("Let me see how I can assist you.", "What do you want?"),
    ("I understand your frustration.", "Stop complaining."),
    ("Could you please speak a little more quietly?", "You're being too loud."),
    ("I'm looking forward to working with you.", "I guess I have to work with you."),
    ("That's a very interesting question.", "Why would you ask that?"),
    ("I'll get back to you on that shortly.", "I'll do it when I get to it."),
    ("Pardon me.", "Move."),
    ("It was a pleasure to meet you.", "I've met you."),
    ("I'm sorry for the inconvenience.", "It is what it is."),
    ("Perhaps we could explore other options.", "That's a bad idea."),
    ("If you don't mind, I'd like to add something.", "Let me talk."),
    ("I see your point, but have you considered...", "But actually..."),
    ("You've made some valid points.", "You're not completely wrong."),
    ("I hope you have a wonderful day.", "Have a day."),
    ("I'm not the best person to answer that.", "I don't know."),
    ("Could you please pass the salt?", "Pass the salt."),
    ("I'm sorry, I didn't catch your name.", "What's your name again?"),
    ("I'd prefer not to discuss that.", "That's none of your business."),
    ("Let's try to find a solution together.", "You need to fix this."),
    ("I think we're getting off-topic.", "This is irrelevant."),
    ("I'm happy to clarify.", "You didn't understand."),
    ("Welcome to our establishment.", "You're here."),
    ("Is now a good time to talk?", "Are you busy?"),
    ("I'll handle it from here, thank you.", "I'll do it myself."),
    ("I'm sorry, but I can't approve that request.", "No."),
    ("He's a very unique individual.", "He's weird."),
    ("I'm doing well, thank you for asking.", "I'm fine."),
    ("That's one way to look at it.", "That's a dumb way to look at it."),
    ("I'll take your feedback on board.", "Okay, whatever."),
    ("Please, after you.", "Go ahead."),
    ("I'll have to check my calendar.", "I'm probably busy."),
    ("I wish you the best of luck.", "Good luck, you'll need it."),
    ("I'm afraid that's not possible.", "You can't do that."),
]
# Each phrasing is substituted for the `{}` placeholder.
politeness_prompt_template = "Is the following statement polite or impolite? '{}'. Answer:"

# 5. Code Comments vs. No Comments
# Each tuple is (snippet containing a comment, snippet without one), one pair
# per line. Multi-line snippets use "\n" escapes inside the string.
code_comment_pairs = [
    ("# Initialize the total score to zero\nscore = 0", "score = 0"),
    ("# Define the main function to start the program\ndef main():", "def main():"),
    ("# Iterate through each item in the list\nfor item in my_list:", "for item in my_list:"),
    ("x = 5  # Set the default value for x", "x = 5"),
    ("return True  # Indicate that the operation was successful", "return True"),
    ("import numpy as np  # Import the numpy library for numerical operations", "import numpy as np"),
    ("// This function calculates the factorial of a number\nfunction factorial(n) {", "function factorial(n) {"),
    ("let count = 0; // Initialize counter", "let count = 0;"),
    ("/**\n * Represents a user in the system.\n */\nclass User {", "class User {"),
    ("<!-- This is the main navigation bar -->\n<nav>", "<nav>"),
    ("if (err) { // Handle potential errors", "if (err) {"),
    ("const API_KEY = '...'; // Store API key securely", "const API_KEY = '...';"),
    ("db.connect(); // Establish database connection", "db.connect();"),
    ("while (x > 0) { // Loop until x is zero", "while (x > 0) {"),
    ("// TODO: Refactor this section later", "y = x * 2 + b;"),
    ("z = x + y; // Perform addition", "z = x + y;"),
    ("print('Hello, World!') # A classic first program", "print('Hello, World!')"),
    ("const port = 3000; // Set server port", "const port = 3000;"),
    ("if (user.is_admin) { // Check for admin privileges", "if (user.is_admin) {"),
    ("res.status(200).send('OK'); // Send success response", "res.status(200).send('OK');"),
    ("for i in range(10): # Loop 10 times", "for i in range(10):"),
    ("let is_valid = true; // Flag for validation state", "let is_valid = true;"),
    ("try { // Begin error handling block", "try {"),
    ("} catch (e) { // Catch any exceptions", "} catch (e) {"),
    ("const url = 'https://api.example.com'; // API endpoint", "const url = 'https://api.example.com';"),
    ("SELECT * FROM users; -- Retrieve all users", "SELECT * FROM users;"),
    ("def __init__(self): # Constructor method", "def __init__(self):"),
    ("super().__init__(); // Call parent constructor", "super().__init__();"),
    ("// Deprecated: Use new_function() instead", "old_function();"),
    ("if (cache[key]) { // Check if result is in cache", "if (cache[key]) {"),
    ("return -1; // Return -1 to indicate an error", "return -1;"),
    ("// A simple recursive solution", "return fib(n-1) + fib(n-2);"),
    ("const MAX_RETRIES = 3; // Maximum number of retries", "const MAX_RETRIES = 3;"),
    ("if (x === null) { // Check for null value", "if (x === null) {"),
    ("// This is a temporary workaround", "if (user.name == 'admin') {"),
    ("let name = 'guest'; // Default user name", "let name = 'guest';"),
    ("for key, value in data.items(): # Unpack dictionary items", "for key, value in data.items():"),
    ("with open('file.txt', 'r') as f: # Open file for reading", "with open('file.txt', 'r') as f:"),
    ("plt.show() # Display the plot", "plt.show()"),
    ("df.dropna(inplace=True) # Remove rows with missing values", "df.dropna(inplace=True)"),
    ("model.fit(X, y) # Train the machine learning model", "model.fit(X, y)"),
    ("// Ensure the input is a positive number", "if (n <= 0) return;"),
    ("const timeout = 5000; // Set timeout to 5 seconds", "const timeout = 5000;"),
    ("let result = []; // Initialize an empty array", "let result = [];"),
    ("document.getElementById('app'); // Get the root element", "document.getElementById('app');"),
    ("e.preventDefault(); // Stop default browser action", "e.preventDefault();"),
    ("// Main application logic starts here", "run_app();"),
    ("console.log('Debug value:', x); // For debugging purposes", "console.log('Debug value:', x);"),
    ("} finally { // This block always executes", "} finally {"),
    ("// End of the class definition", "};"),
    ("if (is_production) { // Conditional logic for production environment", "if (is_production) {"),
]
# Each snippet is substituted for the `{}` placeholder.
code_comment_prompt_template = "Is the following line of code commented or uncommented? '{}'. Answer:"

# 6. Plain Language Explanation vs. Statement
# Each tuple is (sentence explaining a concept, sentence stating a related
# fact), one pair per line.
explanation_pairs = [
    ("A barometer is a tool used to measure air pressure.", "The barometer reading is 1012 hPa."),
    ("The process of photosynthesis converts light into chemical energy.", "Photosynthesis occurs in chloroplasts."),
    ("The capital of Japan is Tokyo, a large, bustling city.", "The capital of Japan is Tokyo."),
    ("To solve for x, you need to isolate it on one side of the equation.", "The equation is 2x + 4 = 10."),
    ("Gravity is the force that pulls objects toward each other.", "The acceleration of gravity is 9.8 m/s²."),
    ("An algorithm is a set of step-by-step instructions for a computer.", "This algorithm has O(n log n) time complexity."),
    ("An archipelago is a group or chain of islands.", "Indonesia is an archipelago."),
    ("The CPU, or Central Processing Unit, is the brain of a computer.", "This computer has a 3.4 GHz CPU."),
    ("Inflation is the rate at which the general level of prices for goods and services is rising.", "Last year, inflation was 3%."),
    ("A prime number is a number greater than 1 that has no positive divisors other than 1 and itself.", "7 is a prime number."),
    ("The water cycle describes the continuous movement of water on, above, and below the surface of the Earth.", "Evaporation is part of the water cycle."),
    ("A verb is a word that describes an action, state, or occurrence.", "The word 'run' is a verb."),
    ("The stock market is a collection of markets where investments are traded.", "The stock market went up today."),
    ("A democracy is a system of government where citizens exercise power by voting.", "Ancient Athens was a democracy."),
    ("The internet is a global network of interconnected computers.", "I use the internet every day."),
    ("A calorie is a unit of energy.", "This apple has 95 calories."),
    ("The immune system is the body's defense against infectious organisms.", "White blood cells are part of the immune system."),
    ("A metaphor is a figure of speech in which a word or phrase is applied to an object or action to which it is not literally applicable.", "'All the world's a stage' is a metaphor."),
    ("The concept of 'supply and demand' dictates the price of goods in a market.", "The price of oil is affected by supply and demand."),
    ("A habitat is the natural home or environment of an animal, plant, or other organism.", "The rainforest is a habitat for many species."),
    ("The Richter scale is used to measure the magnitude of earthquakes.", "The earthquake was a 6.8 on the Richter scale."),
    ("A novel is a long fictional narrative in prose.", "'Moby Dick' is a novel."),
    ("The scientific method is a process for experimentation that is used to explore observations and answer questions.", "He followed the scientific method for his experiment."),
    ("A constitution is a set of fundamental principles according to which a state is governed.", "The US Constitution was written in 1787."),
    ("The term 'e-commerce' refers to the buying and selling of goods online.", "Amazon is a major e-commerce company."),
    ("A sonnet is a poem of fourteen lines using any of a number of formal rhyme schemes.", "Shakespeare wrote many sonnets."),
    ("The carbon cycle is the process by which carbon is exchanged among the biosphere, atmosphere, oceans, and geosphere.", "Burning fossil fuels affects the carbon cycle."),
    ("A programming language is a formal language comprising a set of instructions that produce various kinds of output.", "Python is a programming language."),
    ("The legislative branch is the part of government that makes laws.", "Congress is the legislative branch of the U.S. government."),
    ("An allergy is a damaging immune response by the body to a substance to which it has become hypersensitive.", "He has a peanut allergy."),
    ("The Industrial Revolution was a period of major industrialization that took place during the late 1700s and early 1800s.", "The steam engine was invented during the Industrial Revolution."),
    ("A neuron is a specialized cell that transmits nerve impulses.", "The brain is made of billions of neurons."),
    ("The concept of 'irony' involves a contrast between what is expected and what actually occurs.", "It's ironic that the fire station burned down."),
    ("A database is an organized collection of structured information, or data.", "This information is stored in a SQL database."),
    ("A gene is a unit of heredity which is transferred from a parent to offspring.", "Humans have about 20,000 genes."),
    ("The judicial branch is the part of government that interprets laws.", "The Supreme Court is part of the judicial branch."),
    ("A 'black hole' is a region of spacetime where gravity is so strong that nothing can escape.", "There is a black hole at the center of our galaxy."),
    ("The term 'AI' refers to the simulation of human intelligence in machines.", "This chatbot is powered by AI."),
    ("A 'fossil' is the preserved remains or traces of a dead organism.", "They discovered a dinosaur fossil."),
    ("The 'placebo effect' occurs when a patient's belief in a treatment leads to a perceived or actual improvement in their condition.", "The clinical trial accounted for the placebo effect."),
    ("A 'budget' is a financial plan for a defined period.", "We need to stick to our monthly budget."),
    ("An 'antonym' is a word opposite in meaning to another.", "'Hot' is an antonym for 'cold'."),
    ("A 'virus' is a small infectious agent that replicates only inside the living cells of other organisms.", "The flu is caused by a virus."),
    ("The 'executive branch' is the part of government that enforces laws.", "The President is the head of the executive branch."),
    ("A 'protagonist' is the leading character in a story.", "Harry Potter is the protagonist of his series."),
    ("A 'carbohydrate' is a biological molecule consisting of carbon, hydrogen and oxygen atoms.", "Bread is a source of carbohydrates."),
    ("The 'ozone layer' is a region of Earth's stratosphere that absorbs most of the Sun's ultraviolet radiation.", "The ozone layer is healing."),
    ("A 'synonym' is a word that means exactly or nearly the same as another word.", "'Happy' is a synonym for 'joyful'."),
]
# Each sentence is substituted for the `{}` placeholder; the template ends
# mid-sentence ("The sentence is an") rather than with "Answer:".
explanation_prompt_template = "Does the following sentence explain a concept or just state a fact? '{}'. The sentence is an"



In [None]:
# Take the layer count from the loaded model's configuration and announce it;
# the per-layer analysis uses `num_layers` as its upper bound.
num_layers = model.config.num_hidden_layers
layer_banner = f"\nAnalyzing QWEN2.5-7B model with {num_layers} layers."
print(layer_banner)

In [None]:
def _cosine_similarity(vec_a, vec_b):
    """Return dot(vec_a, vec_b) divided by the product of their Euclidean norms."""
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / norm_product


# Walk over hidden-state indices 1..num_layers (index 0 is never analysed).
# For each index, build one reading vector per concept and print the cosine
# similarity of three concept pairings.
for i in range(1, num_layers + 1):
    layer_index = i
    print(f'--- Analyzing Layer {i-1} (index {i}) ---')

    # Truthfulness vs. sentiment
    v_truthfulness = analyze_specific_layer(truth_prompt_template, truth_pairs, layer_index)
    v_sentiment = analyze_specific_layer(sentiment_prompt_template, sentiment_pairs, layer_index)
    cos_sim_truth_sentiment = _cosine_similarity(v_truthfulness, v_sentiment)
    print(f"  Truthfulness vs. Sentiment Cosine Similarity: {cos_sim_truth_sentiment:.4f}")

    # Honesty vs. politeness
    v_honesty = analyze_specific_layer(honesty_prompt_template, honesty_pairs, layer_index)
    v_politeness = analyze_specific_layer(politeness_prompt_template, politeness_pairs, layer_index)
    cos_sim_honesty_politeness = _cosine_similarity(v_honesty, v_politeness)
    print(f"  Honesty vs. Politeness Cosine Similarity: {cos_sim_honesty_politeness:.4f}")

    # Code comments vs. plain-language explanation
    v_code = analyze_specific_layer(code_comment_prompt_template, code_comment_pairs, layer_index)
    v_expl = analyze_specific_layer(explanation_prompt_template, explanation_pairs, layer_index)
    cos_sim_code_expl = _cosine_similarity(v_code, v_expl)
    print(f"  Code Comments vs. Plain Language Cosine Similarity: {cos_sim_code_expl:.4f}")
    print('----------------------------------')