# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.decomposition import PCA
from scipy import stats

# Load the Iris dataset
def load_iris_data():
    """Return the scikit-learn Iris dataset as a pandas DataFrame.

    The frame has one column per entry in the dataset's ``feature_names``
    and a final ``'species'`` column holding the target name of each row.
    """
    bunch = datasets.load_iris()
    # Map every integer target code to its human-readable species name.
    species_names = bunch.target_names[bunch.target]
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['species'] = species_names
    return frame

# Output summary statistics of each variable to a text file
def output_summary(df, file_name):
    """Write summary statistics and a correlation matrix to a text file.

    Args:
        df: DataFrame to summarise. Non-numeric columns (such as
            ``'species'``) are left out of the correlation matrix.
        file_name: Path of the text file to create or overwrite.
    """
    # Correlation is only defined for numeric data. pandas >= 2.0 raises a
    # ValueError when corr() is called on a frame that still contains a
    # string column, so restrict the frame to numeric columns first.
    numeric_df = df.select_dtypes(include='number')

    # Render both tables before opening the file so that a failure here does
    # not leave a truncated summary on disk.
    summary_text = df.describe().to_string()
    correlation_text = numeric_df.corr().to_string()

    with open(file_name, 'w', encoding='utf-8') as f:
        f.write("Summary of Iris Dataset\n")
        f.write("=========================\n")
        f.write(summary_text)
        f.write("\n\n")
        f.write("Correlation Matrix:\n")
        f.write(correlation_text)
# Create histograms for each variable and save as PNG files
def plot_histograms(df):
    """Save one histogram (with a KDE overlay) per feature column as a PNG.

    Every column except the last one is plotted; the last column is expected
    to be the ``'species'`` label. Files are written to the current working
    directory as ``'<column>_histogram.png'``.
    """
    feature_columns = df.columns[:-1]  # the trailing 'species' column is not plotted
    for column in feature_columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(df[column], kde=True, ax=ax)
        ax.set_title(f'Histogram of {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Frequency')
        fig.savefig(f'{column}_histogram.png')
        # Close the figure explicitly so figures do not accumulate in memory.
        plt.close(fig)

# Generate scatter plots for each pair of variables
def plot_scatter_plots(df):
    """Save a species-coloured scatter plot for every unordered feature pair.

    Every column except the last one is treated as a feature; the last column
    is expected to be the ``'species'`` label. Each plot is written to the
    current working directory as ``'<col1>_<col2>_scatterplot.png'``.
    """
    feature_columns = df.columns[:-1]  # leave out the trailing 'species' column
    n_features = len(feature_columns)
    # Each pair appears once, with the earlier column on the x-axis.
    column_pairs = [
        (feature_columns[first], feature_columns[second])
        for first in range(n_features)
        for second in range(first + 1, n_features)
    ]

    for col1, col2 in column_pairs:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.scatterplot(x=df[col1], y=df[col2], hue=df['species'], ax=ax)
        ax.set_title(f'Scatter Plot of {col1} vs {col2}')
        ax.set_xlabel(col1)
        ax.set_ylabel(col2)
        ax.legend(title='Species')
        fig.savefig(f'{col1}_{col2}_scatterplot.png')
        plt.close(fig)

# Perform additional analysis: Correlation matrix, missing values, and boxplots
def additional_analysis(df):
    """Count missing values and save a correlation heatmap and boxplots.

    Side effects: writes ``'correlation_matrix.png'`` and one
    ``'<column>_boxplot.png'`` per feature column to the current working
    directory.

    Args:
        df: DataFrame whose last column is the ``'species'`` label; every
            other column is treated as a feature.

    Returns:
        A Series with the number of null entries in each column of ``df``.
    """
    # Check for missing values
    missing_values = df.isnull().sum()

    # Correlation matrix visualization. Only numeric columns are correlated:
    # pandas >= 2.0 raises a ValueError if corr() is called while the string
    # 'species' column is still present.
    correlation = df.select_dtypes(include='number').corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix')
    plt.savefig('correlation_matrix.png')
    plt.close()

    # Boxplots for each feature, split by species
    for column in df.columns[:-1]:  # Skip the 'species' column
        plt.figure(figsize=(8, 6))
        sns.boxplot(x='species', y=column, data=df)
        plt.title(f'Boxplot of {column} by Species')
        plt.xlabel('Species')
        plt.ylabel(column)
        plt.savefig(f'{column}_boxplot.png')
        plt.close()

    return missing_values

# Perform PCA for dimensionality reduction and visualization
def perform_pca(df):
    """Project the standardized features onto two principal components and plot them.

    Side effect: writes ``'pca_2d_plot.png'`` to the current working
    directory.

    Args:
        df: DataFrame whose last column is the ``'species'`` label; every
            other column is used as a PCA input feature.
    """
    # Features to apply PCA on (exclude species)
    features = df.columns[:-1]
    X = df[features].values

    # Standardize the features before applying PCA
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # Create a DataFrame with the PCA results
    pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
    # Assign the labels by position, not by index: pca_df has a fresh
    # RangeIndex, so assigning the Series directly would align on index and
    # yield NaN species whenever df has been filtered, shuffled or re-indexed.
    pca_df['species'] = df['species'].values

    # Plot PCA results
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x='PC1', y='PC2', hue='species', data=pca_df, palette='Set2')
    plt.title('PCA of Iris Dataset')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(title='Species')
    plt.savefig('pca_2d_plot.png')
    plt.close()

# Perform ANOVA test for each feature across species
def perform_anova(df):
    """Run a one-way ANOVA across species for every feature column.

    Every column except the last one is treated as a feature; rows are
    grouped by the value of the ``'species'`` column.

    Returns:
        A dict mapping each feature column name to a dict with the keys
        ``'f-statistic'`` and ``'p-value'`` from ``scipy.stats.f_oneway``.
    """
    species_labels = df['species']
    anova_results = {}
    for column in df.columns[:-1]:  # the trailing 'species' column is not tested
        # One sample array per species, in order of first appearance.
        samples = []
        for species in species_labels.unique():
            samples.append(df.loc[species_labels == species, column].values)
        f_stat, p_val = stats.f_oneway(*samples)
        anova_results[column] = {'f-statistic': f_stat, 'p-value': p_val}

    return anova_results

# Main function
def main():
    """Run the whole Iris analysis pipeline.

    Loads the data, writes the text summary and all plots to the current
    working directory, and prints the missing-value counts and the ANOVA
    results to standard output.
    """
    iris_df = load_iris_data()

    # Text report and exploratory plots.
    output_summary(iris_df, 'iris_summary.txt')
    plot_histograms(iris_df)
    plot_scatter_plots(iris_df)

    # Correlation heatmap and boxplots; also returns the per-column null counts.
    null_counts = additional_analysis(iris_df)
    print("Missing values in each column:")
    print(null_counts)

    # 2-D PCA projection plot.
    perform_pca(iris_df)

    # One-way ANOVA of every feature across species.
    anova_by_feature = perform_anova(iris_df)
    print("\nANOVA Results for each feature:")
    for feature in anova_by_feature:
        result = anova_by_feature[feature]
        print(f"{feature}: F-statistic = {result['f-statistic']:.2f}, p-value = {result['p-value']:.4f}")
    
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
