<a href="https://colab.research.google.com/github/danyuchn/GMAT-score-report-analysis/blob/main/irt-simulation-tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import expit
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pytz  # Requires installation of pytz; if not installed, use `pip install pytz`
import json

def get_subject_order():
    """Prompt for the order in which the exam sections are taken.

    The user types section abbreviations separated by '/' (``V``, ``Q`` or
    ``DI``, case-insensitive). The prompt repeats until the entry is valid.

    Returns:
        list[str]: Full section names in the order entered, each appearing
        at most once, e.g. ``["Verbal", "Quantitative", "Data Insights"]``.
    """
    valid_subjects = {
        "V": "Verbal",
        "Q": "Quantitative",
        "DI": "Data Insights"
    }
    while True:
        order_input = input("Enter the order of subjects (separated by '/' e.g., V/Q/DI): ").strip().upper()
        # Drop empty tokens so a blank entry or a stray '/' is reported by the
        # "at least one subject" check instead of as an invalid abbreviation.
        subjects_abbr = [token.strip() for token in order_input.split('/') if token.strip()]
        subjects = []
        try:
            for abbr in subjects_abbr:
                if abbr not in valid_subjects:
                    raise ValueError(f"Invalid subject abbreviation: {abbr}")
                subject = valid_subjects[abbr]
                # Per-subject answers are stored in a dict keyed by subject
                # name, so a repeated section would silently overwrite the
                # earlier entry.
                if subject in subjects:
                    raise ValueError(f"Subject entered more than once: {abbr}")
                subjects.append(subject)
            if not subjects:
                raise ValueError("Please enter at least one valid subject.")
            return subjects
        except ValueError as ve:
            print(f"Input error: {ve}")
            print("Please re-enter valid subject abbreviations, separated by '/', e.g., V/Q/DI.")

def get_theta_est(subject):
    """Prompt until the user supplies a finite starting θ for a section.

    Args:
        subject: Section name shown in the prompt.

    Returns:
        float: The finite number entered by the user.
    """
    while True:
        try:
            theta = float(input(f"Enter the initial θ value for {subject}: ").strip())
        except ValueError:
            print("Invalid input format, please enter a valid number.")
            continue
        # float() also parses "nan" and "inf"; neither can serve as a starting
        # point for question selection or for the θ optimiser, so ask again.
        if not np.isfinite(theta):
            print("Invalid input format, please enter a valid number.")
            continue
        return theta

def get_wrong_questions(subject):
    """Prompt for the question numbers answered incorrectly in a section.

    Numbers may be separated by commas, spaces, or both. The prompt repeats
    until every token is an integer of at least 1 (questions are numbered
    from 1 in the simulation).

    Args:
        subject: Section name shown in the prompt.

    Returns:
        list[int]: The entered question numbers in first-seen order with
        repeats removed; an empty list when nothing is entered.
    """
    while True:
        input_str = input(f"Enter the question numbers answered incorrectly for {subject} (comma-separated): ").strip()
        if not input_str:
            return []
        try:
            # Support both comma and space separation; split() already
            # discards surrounding whitespace.
            numbers = [int(token) for token in input_str.replace(',', ' ').split()]
        except ValueError:
            print("Invalid input format, please ensure you enter integers separated by commas or spaces.")
            continue
        # A number below 1 can never match a question, so accepting it would
        # silently score that intended miss as correct.
        if any(number < 1 for number in numbers):
            print("Question numbers start at 1, please enter positive integers only.")
            continue
        # dict.fromkeys removes repeats while keeping the entry order.
        return list(dict.fromkeys(numbers))

def main():
    """Interactively collect the starting θ and missed questions per section.

    Asks for the section order first, then, for each section in that order,
    the starting θ followed by the incorrectly answered question numbers.
    A summary of what was collected is printed before returning.

    Returns:
        dict: Maps each section name to a dict with keys ``"theta_est"``
        and ``"wrong_questions"``.
    """
    subjects = get_subject_order()

    # The inner dict literal evaluates θ before the wrong-question list, so
    # the prompts appear in that order for every section.
    data = {
        name: {
            "theta_est": get_theta_est(name),
            "wrong_questions": get_wrong_questions(name)
        }
        for name in subjects
    }

    print("\nCollected wrong question numbers and θ values are as follows:")
    for name in subjects:
        entry = data[name]
        print(f"{name}: θ value = {entry['theta_est']}, Wrong questions = {entry['wrong_questions']}")

    return data

# Functions for answer selection and negative log-likelihood
def select_next_question(theta, remaining_questions):
    """Return the index label of the highest-information unused question.

    The information value of every row is computed in one vectorised pass
    from its ``a``, ``b`` and ``c`` columns at the given ``theta``. The input
    frame is neither copied nor modified.

    Args:
        theta: Current ability estimate (scalar).
        remaining_questions: DataFrame of unused questions with numeric
            columns ``a``, ``b`` and ``c``.

    Returns:
        The index label of the row with the largest information value
        (the first such row when several tie).

    Raises:
        ValueError: If ``remaining_questions`` has no rows.
    """
    if remaining_questions.empty:
        raise ValueError("No questions remain to select from.")

    a = remaining_questions['a'].to_numpy(dtype=float)
    b = remaining_questions['b'].to_numpy(dtype=float)
    c = remaining_questions['c'].to_numpy(dtype=float)

    P = c + (1 - c) * expit(a * (theta - b))
    Q = 1 - P
    # NOTE(review): the textbook 3PL item information is
    # a^2 * (Q / P) * ((P - c) / (1 - c))^2; the expression below differs
    # from it whenever c > 0. Kept as-is so selections are unchanged --
    # confirm which form is intended.
    information = (a ** 2) * (P * Q) / ((1 - c) ** 2)

    # Wrap in a Series so idxmax returns the original index label.
    return pd.Series(information, index=remaining_questions.index).idxmax()

def neg_log_likelihood(theta, history):
    """Negative log-likelihood of the recorded responses at ability ``theta``.

    Each response contributes ``log(P)`` when answered correctly and
    ``log(1 - P)`` otherwise, where
    ``P = c + (1 - c) * expit(a * (theta - b))``.

    Args:
        theta: Ability value at which to evaluate. A scalar, or the
            one-element array that ``scipy.optimize.minimize`` passes in.
        history: Iterable of dicts with keys ``'a'``, ``'b'``, ``'c'`` and
            ``'answered_correctly'``.

    Returns:
        The negated sum of the per-response log-probabilities; ``0.0`` for
        an empty history.
    """
    # Keep probabilities strictly inside (0, 1): when expit saturates,
    # log(0) would yield -inf and make the objective unusable for the
    # optimiser.
    eps = np.finfo(float).eps
    nll = 0.0
    for resp in history:
        a = resp['a']
        b = resp['b']
        c = resp['c']
        P = np.clip(c + (1 - c) * expit(a * (theta - b)), eps, 1 - eps)
        nll -= np.log(P if resp['answered_correctly'] else 1 - P)
    return nll

# Simulate the exam process for each subject (same logic for all subjects)
def simulate_exam(total_questions, wrong_questions, question_bank, theta_est, history, label):
    """Simulate one section, re-estimating θ after every question.

    For each question position (1-based) the highest-information unused
    question is chosen at the current θ, the response is marked incorrect if
    the position is listed in ``wrong_questions``, and θ is re-fitted by
    minimising ``neg_log_likelihood`` over all of ``history`` within
    [-4, 4]. One progress line is printed per question.

    Args:
        total_questions: Number of questions to administer.
        wrong_questions: Iterable of 1-based positions answered incorrectly.
        question_bank: DataFrame with columns ``a``, ``b``, ``c`` and ``id``.
            The caller's frame is not modified; used rows are dropped from
            a new frame on each step.
        theta_est: Starting ability estimate.
        history: List that receives one dict per administered question. It
            is appended to in place, and any entries already present also
            take part in the θ fit and in the returned frame.
        label: Section name used as the prefix of each progress line.

    Returns:
        pandas.DataFrame: One row per entry in ``history``.

    Raises:
        ValueError: If ``question_bank`` has fewer rows than
            ``total_questions``.
    """
    # Fail before touching `history` rather than partway through the exam
    # when the bank runs dry.
    if total_questions > len(question_bank):
        raise ValueError(
            f"{label}: question bank has {len(question_bank)} questions, "
            f"but {total_questions} are required."
        )

    # Built once so the per-question membership test does not rescan a list.
    missed_positions = set(wrong_questions)

    for question_number in range(1, total_questions + 1):
        next_q_idx = select_next_question(theta_est, question_bank)
        question = question_bank.loc[next_q_idx]
        answer_correct = question_number not in missed_positions

        history.append({
            'question_id': question['id'],
            'a': question['a'],
            'b': question['b'],
            'c': question['c'],
            'answered_correctly': answer_correct
        })

        res = minimize(neg_log_likelihood, theta_est, args=(history,), bounds=[(-4, 4)])
        theta_est = res.x[0]
        question_bank = question_bank.drop(next_q_idx)
        # Record the post-response estimate on the entry just appended.
        history[-1]['theta_est'] = theta_est
        print(f"{label} Question {question_number}: ID={question['id']}, b={question['b']:.2f}, Answer={'Correct' if answer_correct else 'Incorrect'}, θ Estimate={theta_est:.2f}")

    return pd.DataFrame(history)

# Initialize parameters for the three subjects
def initialize_subjects():
    """Build the per-section settings, random question banks and histories.

    Seeds NumPy's global random generator with 1000, then, for each section
    in the order Quantitative, Verbal, Data Insights, draws uniform ``a``
    (0.2 to 1.5), ``b`` (-2 to 2) and ``c`` (0.1 to 0.25) parameters for
    1000 questions with ids 1..1000.

    Returns:
        tuple: ``(subjects_params, question_banks, histories)`` where
        ``subjects_params`` maps section name to its ``total_questions`` and
        ``num_questions``, ``question_banks`` maps section name to a
        DataFrame with columns ``a``, ``b``, ``c``, ``id``, and
        ``histories`` maps section name to an empty list.
    """
    np.random.seed(1000)

    bank_size = 1000
    exam_lengths = (("Quantitative", 21), ("Verbal", 23), ("Data Insights", 20))
    subjects_params = {
        name: {"total_questions": length, "num_questions": bank_size}
        for name, length in exam_lengths
    }

    question_banks = {}
    histories = {}
    for subject, params in subjects_params.items():
        n_items = params['num_questions']
        # The a -> b -> c draw order is part of the seeded sequence; keep it.
        discrimination = np.random.uniform(0.2, 1.5, n_items)
        difficulty = np.random.uniform(-2, 2, n_items)
        guessing = np.random.uniform(0.1, 0.25, n_items)
        question_banks[subject] = pd.DataFrame({
            'a': discrimination,
            'b': difficulty,
            'c': guessing,
            'id': np.arange(1, n_items + 1)
        })
        histories[subject] = []

    return subjects_params, question_banks, histories

def visualize_theta_and_b(subject_label, df_history, initial_theta, total_questions):
    """Show two line charts for one section: θ estimates and question difficulty.

    The first chart plots ``initial_theta`` at position 0 followed by the
    ``theta_est`` column; the second plots the ``b`` column rounded to two
    decimals against positions 1..``total_questions``.

    Args:
        subject_label: Section name inserted into both chart titles.
        df_history: DataFrame with ``theta_est`` and ``b`` columns; expected
            to hold ``total_questions`` rows so the x and y series line up.
        initial_theta: θ value plotted at position 0.
        total_questions: Number of questions on the x-axis.
    """
    def draw_line_chart(x_values, y_values, colour, series_label, title, y_label, y_limits):
        # Shared figure boilerplate for both charts.
        plt.figure(figsize=(14, 7), dpi=150)
        plt.plot(x_values, y_values, marker='o', linestyle='-', color=colour, label=series_label)
        plt.title(title)
        plt.xlabel('Question Number')
        plt.ylabel(y_label)
        plt.ylim(*y_limits)
        plt.grid(True)
        plt.legend()
        plt.show()

    # Theta Estimate plot, with the starting value prepended.
    theta_series = [initial_theta] + df_history['theta_est'].tolist()
    draw_line_chart(
        range(0, total_questions + 1),
        theta_series,
        'green',
        'Theta Estimate',
        f'Theta Estimate Over Questions ({subject_label})',
        'Theta Estimate',
        (-4.0, 4.0),
    )

    # Difficulty Parameter b plot
    rounded_b = [round(value, 2) for value in df_history['b'].tolist()]
    draw_line_chart(
        list(range(1, total_questions + 1)),
        rounded_b,
        'orange',
        'Difficulty Parameter b',
        f'Difficulty Parameter (b) of Selected Questions Over Questions ({subject_label})',
        'Difficulty Parameter b',
        (-2.0, 2.0),
    )

def save_all_difficulty_parameters(q_df, v_df, di_df):
    """Write the selected questions' ``b`` values for all sections to Excel.

    The ``b`` column of each frame is rounded to two decimals and placed
    side by side as ``Q_b``, ``V_b`` and ``DI_b``, indexed by
    ``Question_Number`` starting at 1. Shorter sections are padded with
    missing values up to the longest one. The workbook name carries the
    current Asia/Bangkok time, and the table is also printed.

    Args:
        q_df: Frame with a ``b`` column, written as ``Q_b``.
        v_df: Frame with a ``b`` column, written as ``V_b``.
        di_df: Frame with a ``b`` column, written as ``DI_b``.
    """
    rounded_columns = {
        'Q_b': q_df['b'].round(2).reset_index(drop=True),
        'V_b': v_df['b'].round(2).reset_index(drop=True),
        'DI_b': di_df['b'].round(2).reset_index(drop=True),
    }
    max_questions = max(len(column) for column in rounded_columns.values())

    # Reindexing onto 0..max_questions-1 pads the shorter columns.
    table = {'Question_Number': range(1, max_questions + 1)}
    for column_name, column in rounded_columns.items():
        table[column_name] = column.reindex(range(max_questions))
    combined_df = pd.DataFrame(table).set_index('Question_Number')

    timestamp = datetime.now(pytz.timezone('Asia/Bangkok')).strftime("%Y%m%d_%H%M%S")
    filename = f"Difficulty_Parameters_{timestamp}.xlsx"
    combined_df.to_excel(filename)

    print(f"Difficulty parameters saved to {filename}")
    print("\n---Combined Difficulty Parameters---")
    print(combined_df)
    print("\n---End---\n")

def visualize_all(subjects_params, question_banks, histories, data):
    """Run the exam simulation for every section present in ``data``.

    Sections are processed in the iteration order of ``data``. Each one is
    simulated on a copy of its question bank, using the starting θ and
    wrong-question list from ``data`` and the history list from
    ``histories``.

    Args:
        subjects_params: Maps section name to a dict holding
            ``total_questions``.
        question_banks: Maps section name to its question-bank DataFrame.
        histories: Maps section name to the list passed on as that
            section's history.
        data: Maps section name to a dict with ``wrong_questions`` and
            ``theta_est``.

    Returns:
        dict: Maps each section name to the DataFrame returned by
        ``simulate_exam``.
    """
    results = {}
    for section, answers in data.items():
        print(f"\n---Simulating {section}---")
        section_params = subjects_params[section]
        results[section] = simulate_exam(
            question_bank=question_banks[section].copy(),
            history=histories[section],
            wrong_questions=answers['wrong_questions'],
            theta_est=answers['theta_est'],
            total_questions=section_params['total_questions'],
            label=section,
        )
    return results

if __name__ == "__main__":
    # Gather the section order, starting θ values and missed questions.
    user_data = main()
    # Build the settings, question banks and empty histories.
    subjects_params, question_banks, histories = initialize_subjects()
    # Simulate every section the user entered.
    df_histories = visualize_all(subjects_params, question_banks, histories, user_data)

    section_names = ("Quantitative", "Verbal", "Data Insights")
    default_totals = (21, 23, 20)

    # A section the user skipped has no history (None) and falls back to
    # θ = 0.0 and the default question count.
    q_df_history, v_df_history, di_df_history = [
        df_histories.get(name) for name in section_names
    ]
    q_theta_est, v_theta_est, di_theta_est = [
        user_data.get(name, {}).get("theta_est", 0.0) for name in section_names
    ]
    q_total_questions, v_total_questions, di_total_questions = [
        subjects_params.get(name, {}).get("total_questions", fallback)
        for name, fallback in zip(section_names, default_totals)
    ]

    # Visualize theta and b for each simulated section.
    plot_jobs = zip(
        section_names,
        (q_df_history, v_df_history, di_df_history),
        (q_theta_est, v_theta_est, di_theta_est),
        (q_total_questions, v_total_questions, di_total_questions),
    )
    for section_name, section_history, section_theta, section_total in plot_jobs:
        if section_history is not None:
            visualize_theta_and_b(section_name, section_history, section_theta, section_total)

    # Note: Theta volatility visualization has been removed as requested
    # The combined workbook is written only when all three sections ran.
    # Identity checks are used because comparing a DataFrame with == is
    # element-wise.
    if all(frame is not None for frame in (q_df_history, v_df_history, di_df_history)):
        save_all_difficulty_parameters(q_df_history, v_df_history, di_df_history)

IntSlider(value=75, continuous_update=False, description='Quantitative (Q):', max=90, min=60, style=SliderStyl…

IntSlider(value=80, continuous_update=False, description='Verbal (V):', max=90, min=60, style=SliderStyle(desc…

IntSlider(value=85, continuous_update=False, description='Data Insights (DI):', max=90, min=60, style=SliderSt…

Output()