# IMPORT LIBRARIES

In [1]:
import tkinter as tk
from tkinter import ttk, messagebox, filedialog
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import statsmodels.api as sm

# EXTRACT DATA AND PERFORM TEST

In [2]:
# ---------------------------------------------------------------------------
# Start-up banner: describes the tool's purpose, inputs, outputs and workflow.
# Each section is emitted by a single print call; the pieces are joined with
# newlines, and the trailing "\n" argument plus print's own line ending
# produce the blank lines that separate the sections.
# ---------------------------------------------------------------------------

# Title framed by two horizontal rules.
print(
    "=" * 80,
    "ONE-WAY ANOVA ANALYSIS USING EXCEL DATA".center(80),
    "=" * 80,
    "\n",
    sep="\n",
)

# What the analysis is for.
print(
    "PURPOSE:",
    "=" * 80,
    "~ Conduct a one-way analysis of variance (ANOVA) to determine if there are any statistically\n"
    "~ significant differences between the means of three or more independent (unrelated) groups.\n"
    "~ This method helps in understanding if the group factor has an effect on the variable of interest.",
    "\n",
    sep="\n",
)

# What the user has to provide.
print(
    "INPUTS:",
    "=" * 80,
    "~ Excel File: An Excel file containing the dataset with observations. Each column can represent\n"
    "  a different group or a different variable.\n"
    "  NOTE: This script may only be used for continuous/numerical data\n"
    "~ Number of groups: The number of distinct groups or categories to include in the analysis.\n"
    "~ Column selection: User selects which columns from the loaded dataset to include in the ANOVA.",
    "\n",
    sep="\n",
)

# What the tool produces.
print(
    "OUTPUTS:",
    "=" * 80,
    "~ Boxplots and histograms for visual analysis of the distribution across selected groups.\n"
    "~ Summary statistics including mean, standard deviation, variance, and confidence intervals for\n"
    "  each selected group.\n"
    "~ ANOVA table summarizing the F-statistic, p-value, and other relevant metrics indicating whether\n"
    "  significant differences exist between group means.",
    "\n",
    sep="\n",
)

# Step-by-step workflow, closed by a rule and a larger gap before the results.
print(
    "OPERATIONAL FLOW:",
    "=" * 80,
    "1. User loads an Excel file through the GUI, which populates the entry field with the file path.\n"
    "2. User specifies the number of groups for the analysis using a spinbox that updates the GUI dynamically.\n"
    "3. User selects columns from the loaded dataset for inclusion in the analysis.\n"
    "4. Upon pressing the 'Perform Analysis' button, the app processes these selections to:\n"
    "   - Generate boxplots and histograms for selected data.\n"
    "   - Calculate summary statistics.\n"
    "   - Conduct and display the results of the one-way ANOVA.\n"
    "5. Results are printed in the terminal and status updates are displayed in the GUI.",
    "=" * 80,
    "\n\n\n",
    sep="\n",
)


class one_way_ANOVA_App(tk.Tk):
    """Tkinter window that runs a one-way ANOVA on columns of an Excel file.

    The user loads a workbook, chooses how many groups to compare, picks one
    column per group and presses "Perform Analysis". The app then shows a box
    plot and a histogram, and prints summary statistics and the ANOVA table to
    the terminal.

    Attributes:
        df: The loaded DataFrame, or None until a file has been read.
        column_comboboxes: One read-only combobox per group, in grid order.
    """

    # Bounds of the "Number of groups" spinbox. The upper bound also keeps the
    # column selectors (grid rows 3..12) clear of the button and status rows
    # (15 and 16).
    MIN_GROUPS = 3
    MAX_GROUPS = 10

    def __init__(self):
        """Create the main window and build its widgets."""
        super().__init__()
        self.title("One-Way ANOVA")
        self.geometry("300x250")
        self.df = None
        self.column_comboboxes = []

        self.create_widgets()

    def create_widgets(self):
        """Lay out the file picker, group-count spinbox, selectors and button."""
        ttk.Label(self, text="Select Excel File:").grid(row=0, column=0, sticky='w', padx=10, pady=5)
        self.file_path_entry = ttk.Entry(self, width=30)
        self.file_path_entry.grid(row=1, column=0, padx=10, pady=5)
        load_button = ttk.Button(self, text="Load File", command=self.load_file)
        load_button.grid(row=1, column=1, padx=10, pady=5)

        ttk.Label(self, text="Number of groups:\n(Hit Enter key to update number)").grid(row=2, column=0, sticky='w', padx=10, pady=5)
        self.num_groups_var = tk.IntVar(value=self.MIN_GROUPS)
        num_groups_spinbox = ttk.Spinbox(
            self,
            from_=self.MIN_GROUPS,
            to=self.MAX_GROUPS,
            textvariable=self.num_groups_var,
            wrap=True,
            width=10,
        )
        num_groups_spinbox.grid(row=2, column=1, padx=10, pady=5)
        # Widget-level bindings run before the spinbox's class binding that
        # actually applies the step, so reading the variable immediately would
        # return the previous value. Deferring to idle time reads the new one.
        for sequence in ('<Return>', '<<Increment>>', '<<Decrement>>'):
            num_groups_spinbox.bind(
                sequence,
                lambda _event: self.after_idle(self.update_column_comboboxes),
            )

        self.update_column_comboboxes()

        anova_button = ttk.Button(self, text="Perform Analysis", command=self.perform_analysis)
        anova_button.grid(row=15, column=0, padx=10, pady=5, columnspan=2, sticky='ew')

        # Status label to notify the user
        self.status_label = ttk.Label(self, text="")
        self.status_label.grid(row=16, column=0, columnspan=2, sticky='ew', padx=10, pady=0)

    def load_file(self):
        """Ask for an Excel file and, if it loads, refresh the selectors.

        A cancelled dialog or a failed read leaves the previously loaded data
        and the displayed path untouched.
        """
        file_path = filedialog.askopenfilename(title="Select Excel File", filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")])
        if not file_path:
            return

        df = self.read_excel_file(file_path)
        if df is None:
            return

        self.df = df
        self.file_path_entry.delete(0, tk.END)
        self.file_path_entry.insert(0, file_path)
        self.update_group_selection(self.df.columns)

    def read_excel_file(self, file_path):
        """Read the first sheet of ``file_path`` into a DataFrame.

        Columns that are entirely empty are dropped, and column labels are
        converted to strings because a combobox always returns its selection
        as a string (a numeric header would otherwise never match).

        Returns:
            The DataFrame, or None after showing an error dialog if the file
            could not be read.
        """
        try:
            df = pd.read_excel(file_path)
            df.dropna(axis=1, how='all', inplace=True)
            df.columns = df.columns.map(str)
            return df
        except Exception as e:
            messagebox.showerror("Error", f"Failed to read the file due to: {e}")
            return None

    def update_column_comboboxes(self):
        """Rebuild the column selectors to match the requested group count.

        The requested count is clamped to ``MIN_GROUPS``..``MAX_GROUPS``; an
        empty or non-numeric spinbox entry falls back to the current count.
        Selections made before the rebuild are restored where possible.
        """
        try:
            requested = int(self.num_groups_var.get())
        except (tk.TclError, ValueError):
            # The spinbox text is not an integer (e.g. it was cleared).
            requested = len(self.column_comboboxes) or self.MIN_GROUPS
        num_groups = max(self.MIN_GROUPS, min(self.MAX_GROUPS, requested))
        # Reflect the value actually used back into the spinbox.
        self.num_groups_var.set(num_groups)

        previous_choices = [combobox.get() for combobox in self.column_comboboxes]
        for combobox in self.column_comboboxes:
            # destroy() releases the widget; grid_forget() would only hide it.
            combobox.destroy()

        self.column_comboboxes = []
        for i in range(num_groups):
            combobox = ttk.Combobox(self, values=[], state="readonly")
            combobox.grid(row=3 + i, column=0, padx=10, pady=2, sticky='ew')
            self.column_comboboxes.append(combobox)

        if self.df is not None:
            self.update_group_selection(self.df.columns)
            for combobox, choice in zip(self.column_comboboxes, previous_choices):
                if choice in self.df.columns:
                    combobox.set(choice)

    def update_group_selection(self, columns):
        """Offer ``columns`` in every selector and pick distinct defaults.

        The i-th selector defaults to the i-th column so that the initial
        selection compares different groups; selectors beyond the number of
        available columns are left blank.
        """
        names = list(columns)
        for index, combobox in enumerate(self.column_comboboxes):
            combobox['values'] = names
            if index < len(names):
                combobox.current(index)
            else:
                combobox.set('')

    def _validate_selection(self, columns):
        """Return an error message for an unusable selection, else None."""
        if self.df is None:
            return "Please load an Excel file before running the analysis."
        if not columns:
            return "Please select valid columns for analysis."

        missing = [col for col in columns if col not in self.df.columns]
        if missing:
            return f"Selected columns not found in the loaded data: {', '.join(missing)}"

        # A column chosen twice would be pooled into a single, inflated group.
        duplicates = sorted({col for col in columns if columns.count(col) > 1})
        if duplicates:
            return f"Each group must use a different column. Selected more than once: {', '.join(duplicates)}"

        if len(columns) < 2:
            return "Please select at least two different columns for analysis."

        non_numeric = [col for col in columns if not pd.api.types.is_numeric_dtype(self.df[col])]
        if non_numeric:
            return f"Only numerical columns can be analysed. Non-numerical: {', '.join(non_numeric)}"

        return None

    def perform_analysis(self):
        """Validate the selection, then plot, summarise and run the ANOVA."""
        selected_columns = [combobox.get() for combobox in self.column_comboboxes if combobox.get()]
        problem = self._validate_selection(selected_columns)
        if problem is not None:
            messagebox.showerror("Error", problem)
            return

        self.plot_boxplot(selected_columns)
        self.plot_histogram(selected_columns)
        self.calculate_summary_stats(selected_columns)
        self.perform_one_way_anova(selected_columns)
        self.status_label.config(text="Results printed in terminal.")

    def _build_summary_table(self, columns):
        """Return one row of descriptive statistics per column.

        Missing values are excluded before counting, so groups of unequal
        length (padded with empty cells in the sheet) get the correct sample
        size and standard error. The 95% confidence interval uses the normal
        approximation (z is about 1.96), not Student's t.
        """
        # Z-score for 95% confidence; it approximates to 1.96
        z_score = norm.ppf(0.975)

        rows = []
        for column in columns:
            values = self.df[column].dropna()
            sample_size = len(values)
            mean = values.mean()
            std = values.std()

            # Standard error of the mean; undefined when there are no observations.
            sem = std / np.sqrt(sample_size) if sample_size else np.nan
            margin = z_score * sem

            rows.append({
                'Column': column,
                'Sample Size': sample_size,
                'Sum': values.sum(),
                'Mean': mean,
                'Standard Deviation': std,
                'Variance': values.var(),
                '95% CI Lower': mean - margin,
                '95% CI Upper': mean + margin,
            })

        return pd.DataFrame(rows)

    def calculate_summary_stats(self, columns):
        """Print the summary-statistics table for ``columns`` to the terminal."""
        stats_df = self._build_summary_table(columns)

        print("=" * 80)
        print("SUMMARY STATISTICS:")
        print("=" * 80)
        print(stats_df.to_string(index=False))
        print("=" * 80)
        print("\n\n\n")

    def plot_boxplot(self, columns):
        """Show a box plot of the selected columns in a new figure."""
        try:
            plt.figure(figsize=(10, 6))
            self.df.boxplot(column=columns)
            plt.title('Box Plot of Selected Columns')
            plt.ylabel('Values')
            plt.grid(True)
            plt.show()
        except KeyError:
            messagebox.showerror("Error", "One or more selected columns not found for boxplot.")

    def plot_histogram(self, columns):
        """Show overlaid step histograms of the selected columns."""
        try:
            plt.figure(figsize=(10, 6))
            for column in columns:
                # Using histtype='step' to create line histograms; missing
                # values are dropped so only real observations are binned.
                plt.hist(self.df[column].dropna(), bins=12, alpha=0.7, label=f'{column}', histtype='step', linewidth=2)
            plt.title('Histogram of Selected Columns')
            plt.xlabel('Values')
            plt.ylabel('Frequency')
            plt.legend()
            plt.grid(True)
            plt.show()
        except KeyError as e:
            messagebox.showerror("Error", f"One or more selected columns not found for histogram: {e}")

    def perform_one_way_anova(self, columns):
        """Fit ``data ~ group`` by OLS and print the type-1 ANOVA table.

        Each selected column is one group; its non-missing values are stacked
        into a long table labelled with the column name. Any failure is
        reported in an error dialog instead of being raised.
        """
        try:
            groups = [self.df[col].dropna() for col in columns]
            df_anova = pd.DataFrame({
                'data': np.concatenate(groups),
                'group': np.repeat(columns, [len(group) for group in groups]),
            })
            anova = sm.stats.anova_lm(sm.formula.ols('data ~ group', data=df_anova).fit(), typ=1)

            print("=" * 80)
            print("ANOVA RESULTS:")
            print("=" * 80)
            print(anova)
            print("=" * 80)
        except Exception as e:
            messagebox.showerror("Error", f"Failed to perform ANOVA due to: {e}")

if __name__ == "__main__":
    # Build the main window, then hand control to Tk's event loop, which
    # returns once the window is closed.
    app = one_way_ANOVA_App()
    app.mainloop()


                    ONE-WAY ANOVA ANALYSIS USING EXCEL DATA                     


PURPOSE:
~ Conduct a one-way analysis of variance (ANOVA) to determine if there are any statistically
~ significant differences between the means of three or more independent (unrelated) groups.
~ This method helps in understanding if the group factor has an effect on the variable of interest.


INPUTS:
~ Excel File: An Excel file containing the dataset with observations. Each column can represent
  a different group or a different variable.
  NOTE: This script may only be used for continuous/numerical data
~ Number of groups: The number of distinct groups or categories to include in the analysis.
~ Column selection: User selects which columns from the loaded dataset to include in the ANOVA.


OUTPUTS:
~ Boxplots and histograms for visual analysis of the distribution across selected groups.
~ Summary statistics including mean, standard deviation, variance, and confidence intervals for
  each selected gro