In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [1]:


# Define column names based on the UCI Mushroom dataset attributes
column_names = [
    'class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape',
    'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color',
    'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat'
]

# Load the dataset; header=None because the names are supplied explicitly above
df = pd.read_csv('agaricus-lepiota.data', header=None, names=column_names)

# Display basic information about the dataset
print("Dataset shape:", df.shape)
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Check data types and missing values
print("\nData types:")
print(df.dtypes)

print("\nMissing values per column:")
print(df.isnull().sum())

# In this dataset, missing values are often represented as '?', which
# isnull() does not count, so tally the placeholder explicitly.
print("\nCount of '?' in each column:")
question_mark_counts = (df == '?').sum()
for column, question_marks in question_mark_counts[question_mark_counts > 0].items():
    print(f"{column}: {question_marks}")

# Check the distribution of the target variable
print("\nClass distribution:")
print(df['class'].value_counts())
print(df['class'].value_counts(normalize=True).map(lambda x: f"{x:.2%}"))


def plot_value_counts_grid(frame, columns, filename, n_cols=3):
    """Save a grid of bar charts, one per column, showing value counts.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to summarise.
    columns : sequence of str
        Columns of ``frame`` to chart, one subplot each, filled row by row.
    filename : str
        Path the finished figure is written to.
    n_cols : int, default 3
        Number of subplots per row; the row count is derived from
        ``len(columns)`` so the grid always has room for every column.
    """
    n_rows = (len(columns) + n_cols - 1) // n_cols
    fig = plt.figure(figsize=(20, 15))
    for position, column in enumerate(columns, 1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        frame[column].value_counts().plot(kind='bar', ax=ax)
        ax.set_title(f'Distribution of {column}')
    # Lay out once, after every subplot exists, instead of once per subplot.
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)


# Let's examine the distribution of categorical features.
# Column 0 is the target, so the features are split into two figures:
# columns 1-12 and columns 13 onward.
plot_value_counts_grid(df, df.columns[1:13], 'feature_distribution_1.png')
plot_value_counts_grid(df, df.columns[13:], 'feature_distribution_2.png')

# For categorical data, association with the target can be measured with Cramér's V or a chi-square test.
# Here we rank features by the p-value of a chi-square test of independence against the target.
def categorical_correlation(df, target_col):
    """Rank categorical features by chi-square association with the target.

    For every column other than ``target_col`` a contingency table against the
    target is built and a chi-square test of independence is run on it. The
    resulting ranking is also saved as a horizontal bar chart of
    ``-log10(p-value)`` in ``feature_importance.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    target_col : str
        Name of the target column in ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per feature (indexed by feature name) with a single
        ``p_value`` column, sorted by ascending p-value. Features whose
        p-values tie are ordered by descending chi-square statistic.
    """
    # Imported once here rather than on every loop iteration.
    from scipy.stats import chi2_contingency

    results = {}
    for column in df.columns:
        if column == target_col:
            continue
        contingency = pd.crosstab(df[column], df[target_col])
        chi2, p, _dof, _expected = chi2_contingency(contingency)
        results[column] = (p, chi2)

    ranked = pd.DataFrame.from_dict(
        results, orient='index', columns=['p_value', 'chi2']
    ).astype(float)

    # Very strong associations underflow to p == 0.0, so sorting on the p-value
    # alone leaves their relative order arbitrary. The chi-square statistic is
    # used only as a tie-breaker (larger statistic first).
    ranked = ranked.sort_values(['p_value', 'chi2'], ascending=[True, False])
    corr_df = ranked[['p_value']]

    # -log10(0.0) is inf, which cannot be drawn as a bar length. Clip p-values
    # to the smallest positive normal float so those bars get a finite length.
    smallest_positive = np.finfo(float).tiny
    scores = -np.log10(corr_df['p_value'].clip(lower=smallest_positive))

    # Visualize
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(corr_df.index, scores)
    ax.set_title('Feature importance (-log10(p-value) from chi-square)')
    ax.set_xlabel('-log10(p-value)')
    fig.tight_layout()
    fig.savefig('feature_importance.png')
    plt.close(fig)

    return corr_df

# Rank every feature against the 'class' target and show the resulting table
feature_importance = categorical_correlation(df, target_col='class')
print(
    "\nFeature importance based on chi-square p-value:",
    feature_importance,
    sep="\n",
)

# Let's also look at how certain features vary with the target
def plot_feature_by_target(df, feature, target='class'):
    """Save a stacked bar chart of the target mix within each feature value.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both ``feature`` and ``target`` columns.
    feature : str
        Column whose values become the bars.
    target : str, default 'class'
        Column whose values become the stacked segments of each bar.

    The chart is written to ``'<feature>_by_target.png'`` and the figure is
    closed afterwards; nothing is returned.
    """
    # normalize='index' makes each row (feature value) sum to 1, so every bar
    # shows proportions of the target rather than raw counts.
    proportions = pd.crosstab(df[feature], df[target], normalize='index')

    # Create the axes explicitly and hand them to pandas. Calling plt.figure()
    # and then DataFrame.plot() without ax= makes pandas open a second figure,
    # which leaves the 10x6 figure empty and never closed, and draws the chart
    # at the default size instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    proportions.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'{feature} by {target}')
    fig.tight_layout()
    fig.savefig(f'{feature}_by_target.png')
    plt.close(fig)

# Chart the five top-ranked features against the target
for top_feature in feature_importance.index[:5]:
    plot_feature_by_target(df, top_feature)

# Let's also do a simple label encoding to see if that helps with correlation visualization
def encode_and_correlate(df, target='class'):
    """Label-encode every column and correlate the encoded features.

    Each column is independently mapped to integer codes with
    ``LabelEncoder``; the input frame is not modified. The correlation matrix
    of the encoded frame is saved as a heatmap in ``correlation_matrix.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are all to be label-encoded.
    target : str, default 'class'
        Column whose correlations with every column are returned.

    Returns
    -------
    pandas.Series
        Correlation of each column with ``target``, sorted in descending
        order.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.

    Notes
    -----
    The integer codes impose an arbitrary order on the categories, so these
    correlations are only a rough indication of association. A column with a
    single distinct value has zero variance and yields NaN correlations.
    """
    # Fail before doing any encoding or plotting work.
    if target not in df.columns:
        raise KeyError(target)

    # fit_transform refits the encoder on each column, so one instance is
    # enough; building a new frame leaves the caller's data untouched.
    label_encoder = LabelEncoder()
    encoded_df = pd.DataFrame(
        {col: label_encoder.fit_transform(df[col]) for col in df.columns},
        index=df.index,
    )

    # Calculate correlation matrix
    corr_matrix = encoded_df.corr()

    # Visualize correlation matrix
    fig, ax = plt.subplots(figsize=(14, 12))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation Matrix After Label Encoding')
    fig.tight_layout()
    fig.savefig('correlation_matrix.png')
    plt.close(fig)

    # Return correlation with the target column
    return corr_matrix[target].sort_values(ascending=False)

# Label-encode the data, then report each column's correlation with the target
encoded_correlations = encode_and_correlate(df)
print(
    "\nFeature correlations with target after label encoding:",
    encoded_correlations,
    sep="\n",
)

Dataset shape: (8124, 23)

First 5 rows of the dataset:
  class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         x           s         n       t    p               f   
1     e         x           s         y       t    a               f   
2     e         b           s         w       t    l               f   
3     p         x           y         w       t    p               f   
4     e         x           s         g       f    n               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>