In [9]:
# Install required package if not already installed.
# `ucimlrepo` provides the `fetch_ucirepo` helper that a later cell imports to
# download the dataset. The `%pip` magic (rather than `!pip`) is used so the
# package is installed into the environment of the running kernel.
%pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


# A7: Multi-Class Model Selection using ROC and Precision-Recall Curves

### Importing data directly from the UCI Machine Learning Repository

In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix, 
                            roc_curve, auc, precision_recall_curve, 
                            average_precision_score)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
import warnings
warnings.filterwarnings('ignore')

In [14]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 146 in the UCI repository.
statlog_landsat_satellite = fetch_ucirepo(id=146)

# Unpack the feature table and the target table (both pandas DataFrames).
X, y = (
    statlog_landsat_satellite.data.features,
    statlog_landsat_satellite.data.targets,
)

In [7]:
# Quick overview of the loaded data: dimensions, column dtypes, class balance.
for shape_label, table in (("Features shape:", X), ("Target shape:", y)):
    print(shape_label, table.shape)

# Column-level summary (non-null counts and dtypes) of the feature table.
print("\nFeatures info:")
X.info()

# Number of samples per target class.
print("\nTarget distribution:")
class_counts = y.value_counts()
print(class_counts)

Features shape: (6435, 36)
Target shape: (6435, 1)

Features info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 36 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Attribute1   6435 non-null   int64
 1   Attribute2   6435 non-null   int64
 2   Attribute3   6435 non-null   int64
 3   Attribute4   6435 non-null   int64
 4   Attribute5   6435 non-null   int64
 5   Attribute6   6435 non-null   int64
 6   Attribute7   6435 non-null   int64
 7   Attribute8   6435 non-null   int64
 8   Attribute9   6435 non-null   int64
 9   Attribute10  6435 non-null   int64
 10  Attribute11  6435 non-null   int64
 11  Attribute12  6435 non-null   int64
 12  Attribute13  6435 non-null   int64
 13  Attribute14  6435 non-null   int64
 14  Attribute15  6435 non-null   int64
 15  Attribute16  6435 non-null   int64
 16  Attribute17  6435 non-null   int64
 17  Attribute18  6435 non-null   int64
 18  Attribute19  6435 non

## PART-A: Data Preparation and Baseline 

- Standardizing the features
- Splitting the data into training and test sets
- Training different models
    - K-Nearest Neighbors
    - Decision Tree Classifier
    - Dummy Classifier (Prior)
    - Logistic Regression
    - Naive Bayes (Gaussian)
    - Support Vector Machine (SVC)

In [17]:
# Splitting the dataset into training and testing sets.
# The split is done on the raw features BEFORE scaling: fitting the scaler on
# the full dataset would let the test rows influence the mean/std used to
# transform the training rows (data leakage), which biases every evaluation
# computed on the test set afterwards.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardizing the features: the scaler learns its statistics from the
# training fold only and the same transformation is then applied to the test fold.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-fold statistics; kept so that
# any later cell that reads `X_scaled` continues to work.
X_scaled = scaler.transform(X)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (5148, 36) (5148, 1)
Testing set shape: (1287, 36) (1287, 1)


In [20]:
# Every estimator below expects a 1-D label array, while `y_train` is a
# single-column DataFrame, so flatten it once and reuse the result.
y_train_1d = y_train.values.ravel()

# `fit` returns the fitted estimator itself, so construction and training are
# chained into a single assignment for each model.

# 1. K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train_1d)

# 2. Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train_1d)

# 3. Dummy Classifier (Prior strategy)
dummy = DummyClassifier(strategy='prior', random_state=42).fit(X_train, y_train_1d)

# 4. Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train_1d)

# 5. Naive Bayes (Gaussian)
nb = GaussianNB().fit(X_train, y_train_1d)

# 6. Support Vector Machine (SVC with probability=True)
svc = SVC(probability=True, random_state=42).fit(X_train, y_train_1d)

# Show the last fitted estimator as the cell output.
svc

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,
