# Assignment 5 - Visualizing Data Veracity Challenges in Multi-Label Classification

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
# Data Loading
from scipy.io import arff

## Part A: Preprocessing and Initial Setup

In [6]:
# Read the yeast ARFF file; loadarff hands back the records plus their metadata
data, meta = arff.loadarff('dataset/yeast.arff')

# Wrap the loaded records in a pandas DataFrame
df = pd.DataFrame(data)

# Column names in file order
columns = list(df.columns)

# Partition the columns by their naming convention:
#   'Att...'   -> feature columns
#   'Class...' -> label columns
feature_cols = list(filter(lambda name: name.startswith('Att'), columns))
label_cols = list(filter(lambda name: name.startswith('Class'), columns))

print(f"Feature columns: {len(feature_cols)} (from {feature_cols[0]} to {feature_cols[-1]})")
print(f"Label columns: {len(label_cols)} (from {label_cols[0]} to {label_cols[-1]})")

# Feature matrix cast to float, label matrix cast to int
X = df.loc[:, feature_cols].astype(float)
Y = df.loc[:, label_cols].astype(int)

# Report the dimensions of both matrices
print(f"Features (X) shape: {X.shape}")
print(f"Labels (Y) shape: {Y.shape}")
print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of labels: {Y.shape[1]}")

Feature columns: 103 (from Att1 to Att103)
Label columns: 14 (from Class1 to Class14)
Features (X) shape: (2417, 103)
Labels (Y) shape: (2417, 14)
Number of samples: 2417
Number of features: 103
Number of labels: 14


In [None]:
# Label Selection for visualization
#
# Builds `visualization_target`, a categorical Series with one entry per sample:
#   "Single-<rank>: <combo>"  for the N_TOP_SINGLE most frequent single-label classes
#   "Multi: <combo>"          for the most frequent multi-label combination
#   "Other"                   for everything else

# How many of the most frequent single-label classes get their own category
N_TOP_SINGLE = 2

# Create label combinations as strings for easier analysis
# (each row of Y becomes one string of concatenated label values)
Y_combinations = Y.apply(lambda row: ''.join(row.astype(str)), axis=1)

# Count frequency of each label combination
combination_counts = Y_combinations.value_counts()
print("Top 10 frequent label combinations:")
print(combination_counts.head(10))
print()

# Number of active labels per sample, computed once and reused for both masks
labels_per_sample = Y.sum(axis=1)

# Find single-label classes (combinations with exactly one '1')
single_label_mask = labels_per_sample == 1
single_label_combinations = Y_combinations[single_label_mask]
single_label_counts = single_label_combinations.value_counts()

print("Most frequent single-label classes:")
print(single_label_counts.head())
print()

# Find multi-label classes (combinations with more than one '1')
multi_label_mask = labels_per_sample > 1
multi_label_combinations = Y_combinations[multi_label_mask]
multi_label_counts = multi_label_combinations.value_counts()

print("Most frequent multi-label combinations:")
print(multi_label_counts.head(5))
print()

# Get the N_TOP_SINGLE most frequent single-label classes.
# value_counts() sorts by descending frequency, so head() yields the top entries;
# the list may be shorter than N_TOP_SINGLE if fewer single-label classes exist.
top_single_labels = single_label_counts.head(N_TOP_SINGLE).index.tolist()
print(f"Top {N_TOP_SINGLE} single-label classes: {top_single_labels}")

# Get the most frequent multi-label combination (None when there is none)
top_multi_label = None if multi_label_counts.empty else multi_label_counts.index[0]
print(f"Top multi-label combination: {top_multi_label}")
print()

# Lookup table: label-combination string -> category name.
# Single-label and multi-label combinations are disjoint by construction
# (exactly one '1' vs. more than one), so the keys cannot collide.
target_names = {
    combo: f"Single-{rank}: {combo}"
    for rank, combo in enumerate(top_single_labels, start=1)
}
if top_multi_label is not None:
    target_names[top_multi_label] = f"Multi: {top_multi_label}"

# Create visualization target variable; any combination without a dedicated
# category falls back to "Other". The result shares Y's index.
visualization_target = Y_combinations.map(
    lambda combo: target_names.get(combo, "Other")
)

# Display the distribution of the new target variable
print("Distribution of visualization target variable:")
print(visualization_target.value_counts())
print()
print(f"Total samples: {len(visualization_target)}")

Top 10 frequent label combinations:
00110000000110    237
00011000000110    233
11000000000110    172
11000000000000    131
01100000000110    117
00001100000110    105
00000011000110     75
01100000000000     68
00001111000110     60
10000000000110     59
Name: count, dtype: int64

Most frequent single-label classes:
10000000000000    32
Name: count, dtype: int64

Most frequent multi-label combinations:
00110000000110    237
00011000000110    233
11000000000110    172
11000000000000    131
01100000000110    117
Name: count, dtype: int64

Top 2 single-label classes: ['10000000000000']
Top multi-label combination: 00110000000110

Distribution of visualization target variable:
Other                       2148
Multi: 00110000000110        237
Single-1: 10000000000000      32
Name: count, dtype: int64

Total samples: 2417
