In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Location of the Excel workbook to analyse. A raw string is used so the
# backslashes in the Windows path are not interpreted as escape sequences.
file_path = r'C:\Users\year3\Downloads\Lab Session Data.xlsx'

# Function to handle data preprocessing
def preprocess_data(df):
    """Return a cleaned, standardised copy of ``df``.

    Steps, in order:

    1. Every column that can be parsed as numbers is converted to a numeric
       dtype; columns that cannot be parsed are left unchanged.
    2. Missing values in numeric columns are replaced by the column mean.
    3. Object-dtype columns are converted to strings and their missing
       entries (including the literal text ``'nan'``) are replaced by the
       most frequent non-missing value. A column with no non-missing value
       is left without imputation.
    4. Numeric columns are z-score standardised. Columns whose standard
       deviation is zero or undefined (constant columns, single-row input)
       are only centred, so they become 0 instead of NaN.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.

    Returns:
        A new ``pandas.DataFrame`` with the same columns and index.
    """
    # Work on a copy: callers often pass slices of a larger frame, and
    # assigning columns on such a slice triggers SettingWithCopyWarning.
    df = df.copy()

    # ``errors='ignore'`` is deprecated in pandas; catching the conversion
    # failure explicitly keeps the same "leave the column as is" behaviour.
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            pass

    # Separate numeric and categorical columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(include=[object]).columns

    for col in categorical_cols:
        # Record what is missing *before* stringifying, otherwise NaN turns
        # into the text 'nan' and can be picked as the most frequent value.
        as_text = df[col].astype(str)
        missing = df[col].isna() | (as_text == 'nan')
        mode_value = as_text[~missing].mode()
        if not mode_value.empty:
            as_text = as_text.where(~missing, mode_value[0])
        df[col] = as_text

    if not numeric_cols.empty:
        numeric = df[numeric_cols]
        numeric = numeric.fillna(numeric.mean())  # Mean for numeric columns

        spread = numeric.std()
        # A zero/NaN std would turn the whole column into NaN; divide by 1
        # instead so such columns are centred only.
        spread = spread.where(spread > 0, 1.0)
        df[numeric_cols] = (numeric - numeric.mean()) / spread

    return df

# Preprocess a matrix and print a preview of the cleaned data
def analyze_matrix(matrix):
    """Run ``preprocess_data`` on ``matrix`` and print the first rows.

    Prints a "Cleaned Data:" heading followed by ``head()`` of the
    preprocessed frame. Nothing is returned.
    """
    cleaned = preprocess_data(matrix)
    preview = cleaned.head()

    print("\nCleaned Data:")
    print(preview)

    # Proceed with your analysis as needed

# Driver: load one sheet of the workbook, cut two blocks out of it and run
# each through analyze_matrix. Any failure is reported as a printed message.
try:
    # Load the dataset
    df = pd.read_excel(file_path, sheet_name='thyroid0387_UCI')

    # Take two blocks of up to 10 rows x 10 columns (for example purposes).
    # They are only square if the sheet has at least 20 rows and 20 columns.
    # .copy() makes each block an independent frame rather than a slice of
    # df, so later column assignments cannot warn about, or write through
    # to, the loaded data.
    matrix1 = df.iloc[:10, :10].copy()
    matrix2 = df.iloc[10:20, 10:20].copy()

    # Preprocess and analyze matrices
    print("Matrix 1:")
    analyze_matrix(matrix1)
    print("\nMatrix 2:")
    analyze_matrix(matrix2)

except FileNotFoundError:
    print(f"Error: The file at path '{file_path}' was not found. Please check the file path and try again.")
except Exception as e:
    # Top-level boundary of the script: report the problem instead of crashing.
    print(f"An error occurred: {e}")


Matrix 1:

Cleaned Data:
   Record ID       age sex on thyroxine query on thyroxine  \
0  -0.999708 -0.586874   F            f                  f   
1  -0.999355 -0.586874   F            f                  f   
2  -0.989464  0.131747   F            f                  f   
3  -0.281543 -0.167678   F            f                  f   
4  -0.281190 -0.407219   F            f                  f   

  on antithyroid medication sick pregnant thyroid surgery I131 treatment  
0                         f    f        f               f              f  
1                         f    f        f               f              f  
2                         f    f        f               f              f  
3                         f    f        f               f              f  
4                         f    f        f               f              f  

Matrix 2:

Cleaned Data:
   query hypothyroid query hyperthyroid lithium goitre tumor hypopituitary  \
10                 f                  f       f 