# Eksperimen Machine Learning - Iris Dataset
## David Dewanto

Notebook ini berisi eksperimen lengkap untuk:
1. Data Loading
2. Exploratory Data Analysis (EDA)
3. Data Preprocessing

Dataset: Iris (Classification)

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn's white grid theme and a (10, 6) default figure size
sns.set_style("whitegrid")
plt.rc("figure", figsize=(10, 6))

print("Libraries imported successfully!")

## 2. Data Loading

In [None]:
# Read the raw Iris CSV; the relative path points one directory above the working directory
df_raw = pd.read_csv("../iris_raw.csv")

print(
    "Dataset loaded successfully!",
    f"Shape: {df_raw.shape}",
    "\nFirst 5 rows:",
    sep="\n",
)
# Kept as the final expression so the notebook renders the preview as cell output
df_raw.head(5)

In [None]:
# Print the DataFrame's structural summary (columns, dtypes, non-null counts)
print("Dataset Information:", "=" * 50, sep="\n")
df_raw.info()

In [None]:
# Descriptive statistics of the dataset's columns
print("Statistical Summary:", "=" * 50, sep="\n")
# Kept as the final expression so the notebook renders the table as cell output
df_raw.describe()

## 3. Exploratory Data Analysis (EDA)

### 3.1 Check Missing Values

In [None]:
# Count missing entries per column, then report the overall total
missing_values = df_raw.isna().sum()

print("Missing Values:", "=" * 50, missing_values, sep="\n")
print(f"\nTotal missing values: {missing_values.sum()}")

### 3.2 Check Duplicates

In [None]:
# Count rows that repeat an earlier row in the raw data
duplicates = df_raw.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Only list the repeated rows when at least one exists
if duplicates:
    print("\nDuplicate rows:", df_raw.loc[df_raw.duplicated()], sep="\n")

### 3.3 Class Distribution

In [None]:
# Class balance of the target column: absolute counts and percentage shares
species_counts = df_raw['species'].value_counts()
species_share = df_raw['species'].value_counts(normalize=True) * 100

print("Species Distribution:", "=" * 50, species_counts, sep="\n")
print("\nPercentage:", species_share, sep="\n")

# Side-by-side figure: bar chart of counts (left) and pie chart of shares (right)
title_style = {'fontsize': 14, 'fontweight': 'bold'}
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: one bar per species
species_counts.plot.bar(ax=ax1, color='skyblue')
ax1.set_title('Species Distribution (Count)', **title_style)
ax1.set_xlabel('Species')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=45)

# Right panel: the same counts as a pie chart with percentage labels
species_counts.plot.pie(ax=ax2, autopct='%1.1f%%')
ax2.set_title('Species Distribution (Percentage)', **title_style)
ax2.set_ylabel('')  # blank out the y-axis label on the pie chart

plt.tight_layout()
plt.show()

### 3.4 Feature Distributions

In [None]:
# The four raw measurement columns whose distributions are plotted below
numerical_features = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]

# One histogram per feature, laid out on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()  # flatten the 2x2 array so axes pair up with features in order

for ax, col in zip(axes, numerical_features):
    ax.hist(df_raw[col], bins=20, color='steelblue', edgecolor='black')
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

### 3.5 Box Plots for Outlier Detection

In [None]:
# One vertical box plot per numerical feature on a 2x2 grid, to eyeball outliers
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()  # flatten the 2x2 array so axes pair up with features in order

for ax, col in zip(axes, numerical_features):
    sns.boxplot(data=df_raw, y=col, ax=ax, color='lightblue')
    ax.set_title(f'Box Plot: {col}', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

### 3.6 Correlation Analysis

In [None]:
# Pairwise correlation between the numerical features
correlation_matrix = df_raw[numerical_features].corr()

# Annotated heatmap; center=0 anchors the diverging colormap at zero correlation
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Also print the raw numbers behind the heatmap
print("Correlation Matrix:", correlation_matrix, sep="\n")

### 3.7 Pair Plot by Species

In [None]:
# Scatter-plot matrix of the dataset coloured by species, with KDE curves on the diagonal
sns.pairplot(data=df_raw, hue='species', diag_kind='kde', height=2.5)
# y slightly above 1 places the figure title above the top row of subplots
plt.suptitle(
    'Pair Plot of Iris Features by Species',
    fontsize=16,
    fontweight='bold',
    y=1.02,
)
plt.show()

### 3.8 Feature Statistics by Species

In [None]:
# Per-species mean and standard deviation of every numerical feature
by_species = df_raw.groupby('species')[numerical_features]

print("Feature Statistics by Species:", "=" * 80, by_species.mean(), sep="\n")
print("\nStandard Deviation by Species:", "=" * 80, by_species.std(), sep="\n")

## 4. Data Preprocessing

### 4.1 Create a Copy for Preprocessing

In [None]:
# Preprocess an independent copy so df_raw keeps the data exactly as loaded
df = df_raw.copy(deep=True)
print(f"Working with a copy of the dataset. Shape: {df.shape}")

### 4.2 Handle Missing Values (if any)

In [None]:
# Check and handle missing values: impute numerical columns with their median
if df.isnull().sum().sum() > 0:
    print("Handling missing values...")
    for col in numerical_features:
        if df[col].isnull().any():
            # Assign the filled column back instead of calling
            # df[col].fillna(..., inplace=True): that chained in-place form acts
            # on a temporary Series and leaves df unchanged under pandas
            # Copy-on-Write (and raises a FutureWarning on pandas 2.x).
            df[col] = df[col].fillna(df[col].median())

    # Only the numerical features are imputed above, so verify the outcome
    # rather than unconditionally claiming success.
    remaining_missing = int(df.isnull().sum().sum())
    if remaining_missing == 0:
        print("Missing values handled.")
    else:
        print(f"Warning: {remaining_missing} missing values remain (not imputed).")
else:
    print("No missing values found. Proceeding...")

### 4.3 Remove Duplicates (if any)

In [None]:
# Drop repeated rows and report how many were removed
before_duplicates = len(df)
df = df.drop_duplicates()
after_duplicates = len(df)

print(
    f"Removed {before_duplicates - after_duplicates} duplicate rows.",
    f"Dataset shape after removing duplicates: {df.shape}",
    sep="\n",
)

### 4.4 Feature Engineering

In [None]:
# Derived features: area = length * width, ratio = length / width (sepal and petal)
sepal_length, sepal_width = df['sepal length (cm)'], df['sepal width (cm)']
petal_length, petal_width = df['petal length (cm)'], df['petal width (cm)']

df['sepal_area'] = sepal_length * sepal_width
df['petal_area'] = petal_length * petal_width
df['sepal_ratio'] = sepal_length / sepal_width
df['petal_ratio'] = petal_length / petal_width

# Report the added columns in the order they were created
print("New features created:")
for feature_name in ('sepal_area', 'petal_area', 'sepal_ratio', 'petal_ratio'):
    print(f"- {feature_name}")
print(f"\nNew dataset shape: {df.shape}")

### 4.5 Label Encoding for Target Variable

In [None]:
# Encode the species names as integer class ids in a new 'target_encoded' column
label_encoder = LabelEncoder()
df['target_encoded'] = label_encoder.fit_transform(df['species'])

# Print each class name next to its position in classes_
print("Label Encoding Mapping:")
for code, species_name in enumerate(label_encoder.classes_):
    print(f"{species_name}: {code}")

### 4.6 Feature Scaling

In [None]:
# Columns to standardise: the four raw measurements plus the four engineered features
feature_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
    'sepal_area',
    'petal_area',
    'sepal_ratio',
    'petal_ratio',
]

# NOTE(review): the scaler is fitted on every row of df, i.e. before any
# train/test split, so held-out rows contribute to the scaling statistics —
# confirm this is acceptable for downstream evaluation.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[feature_columns])

# Leave df unscaled; write the standardised values into a separate copy
df_scaled = df.copy()
df_scaled[feature_columns] = scaled_values

print(
    "Features scaled using StandardScaler",
    "\nScaled features (first 5 rows):",
    df_scaled[feature_columns].head(),
    sep="\n",
)

### 4.7 Train-Test Split

In [None]:
# Features are the scaled columns; the target is the integer-encoded species
X = df_scaled[feature_columns]
y = df_scaled['target_encoded']

# Hold out 20% of the rows with a fixed seed, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Train-Test Split:")
print(f"Training set: {len(X_train)} samples")
print(f"Testing set: {len(X_test)} samples")
print(f"\nFeatures: {X_train.shape[1]}")

# Per-class sample counts in each subset, ordered by class id
for subset_name, subset_target in (("training", y_train), ("testing", y_test)):
    print(f"\nClass distribution in {subset_name} set:")
    print(subset_target.value_counts().sort_index())

### 4.8 Save Preprocessed Data

In [None]:
# Persist the scaled dataset as CSV in the working directory, without the index column
output_path = 'iris_preprocessing.csv'
df_preprocessed = df_scaled.copy()
df_preprocessed.to_csv(output_path, index=False)

print(f"Preprocessed dataset saved to: {output_path}")
print(f"Shape: {df_preprocessed.shape}")
print("\nFirst few rows of preprocessed data:", df_preprocessed.head(), sep="\n")

## 5. Summary

In [None]:
# Final recap of what the notebook produced
banner = "=" * 80
print(banner)
print("PREPROCESSING SUMMARY")
print(banner)

# Dataset shapes before and after preprocessing
print(f"Original dataset shape: {df_raw.shape}")
print(f"Preprocessed dataset shape: {df_preprocessed.shape}")

# Feature counts: raw measurements vs. raw + engineered
original_count = len(numerical_features)
total_count = len(feature_columns)
print(f"\nOriginal features: {original_count}")
print(f"Total features after engineering: {total_count}")
print(f"New features created: {total_count - original_count}")

# Data-quality checks on the saved frame
print(f"\nMissing values: {df_preprocessed.isna().sum().sum()}")
print(f"Duplicates: {df_preprocessed.duplicated().sum()}")

# Split sizes and target classes
print(f"\nTraining samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"\nTarget classes: {label_encoder.classes_.tolist()}")

# Checklist of the steps performed, in notebook order
print("\nPreprocessing steps completed:")
completed_steps = (
    "Data loading",
    "Exploratory Data Analysis",
    "Missing value handling",
    "Duplicate removal",
    "Feature engineering",
    "Label encoding",
    "Feature scaling",
    "Train-test split",
    "Data saved",
)
for step in completed_steps:
    print(f"✓ {step}")
print(banner)