In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import os

In [2]:
# Candidate values for the `C` argument of LogisticRegression; the
# cross-validation cells iterate over this list in the order given here.
C_values = [
    0.1,
    1,
    10,
    100,
    1000,
]

# One dict per dataset is added here by the "final model" cells and
# read back by the summary cell at the end of the notebook.
all_results = list()

BREAST CANCER DATASET

In [3]:
# Read the breast-cancer data set from its sparse text format
# (one sample per line: `<label> <index>:<value> <index>:<value> ...`)
# and expand it into a dense feature matrix `X` plus a label vector `y`.
data_file = 'hw4_data/breast-cancer_scale'
print(f"\nLoading: {data_file}")

y_list = []        # integer label of every non-blank line, in file order
X_list = []        # per-sample {0-based column: value} mapping
max_feature = 0    # largest 0-based column index encountered so far

with open(data_file, 'r') as f:
    for raw_line in f:
        tokens = raw_line.strip().split()
        if len(tokens) == 0:
            continue  # blank line: nothing to parse

        label_token, *feature_tokens = tokens

        # Labels may be written like "2.0"; parse as float, then truncate to int.
        y_list.append(int(float(label_token)))

        row = {}
        for token in feature_tokens:
            col_str, val_str = token.split(':')
            row[int(col_str) - 1] = float(val_str)  # file indices are shifted down by one
        if row:
            max_feature = max(max_feature, max(row))
        X_list.append(row)

# Densify: columns that a sample does not mention stay at 0.0.
n_samples = len(y_list)
n_features = max_feature + 1
X = np.zeros((n_samples, n_features))
for sample_pos, row in enumerate(X_list):
    for col, value in row.items():
        X[sample_pos, col] = value
y = np.array(y_list)


Loading: hw4_data/breast-cancer_scale


In [4]:
# Apply the predefined train/test split: each file holds comma-separated
# integer row indices into X / y.
# NOTE(review): the indices are used directly as 0-based row positions —
# confirm the index files are 0-based.
train_idx = np.loadtxt('hw4_data/breast-cancer_train_indices.txt', delimiter=',', dtype=int)
test_idx = np.loadtxt('hw4_data/breast-cancer_test_indices.txt', delimiter=',', dtype=int)

X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

In [5]:
# Cross-validation: choose C by mean 5-fold validation error on the
# training split (the test split is not touched here).
print("\n5-Fold Cross-Validation:")
print(f"{'C':<10} {'Train Error':<15} {'Val Error':<15}")
print("-" * 40)

# Evaluate every candidate first; the table is printed only after the
# winner is known so that exactly one row carries the BEST marker.
# (Marking rows while sweeping flags every running best, so the first row
# would always be marked and several rows could be marked at once.)
cv_rows = []  # (C, mean train error, mean validation error)

for C in C_values:
    # Seeded splitter, so each C is scored on the same folds.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_errors = []
    val_errors = []

    for train_fold_idx, val_fold_idx in kf.split(X_train):
        X_train_fold = X_train[train_fold_idx]
        y_train_fold = y_train[train_fold_idx]
        X_val_fold = X_train[val_fold_idx]
        y_val_fold = y_train[val_fold_idx]

        model = LogisticRegression(C=C, penalty='l2', solver='lbfgs',
                                   max_iter=1000, random_state=42)
        model.fit(X_train_fold, y_train_fold)

        # Error = 1 - accuracy, on both the fitting fold and the held-out fold.
        train_pred = model.predict(X_train_fold)
        val_pred = model.predict(X_val_fold)
        train_errors.append(1 - accuracy_score(y_train_fold, train_pred))
        val_errors.append(1 - accuracy_score(y_val_fold, val_pred))

    cv_rows.append((C, np.mean(train_errors), np.mean(val_errors)))

# Pick the winner. The comparison is strict, so on a tie the entry that
# appears earlier in C_values is kept.
best_C = None
best_val_error = float('inf')
best_row = None
for row_pos, (C, _, avg_val_error) in enumerate(cv_rows):
    if avg_val_error < best_val_error:
        best_val_error = avg_val_error
        best_C = C
        best_row = row_pos

# Report all candidates, flagging only the selected one.
for row_pos, (C, avg_train_error, avg_val_error) in enumerate(cv_rows):
    marker = " ← BEST" if row_pos == best_row else ""
    print(f"{C:<10.1f} {avg_train_error:<15.4f} {avg_val_error:<15.4f}{marker}")


5-Fold Cross-Validation:
C          Train Error     Val Error      
----------------------------------------
0.1        0.0320          0.0347          ← BEST
1.0        0.0275          0.0347         
10.0       0.0293          0.0347         
100.0      0.0284          0.0347         
1000.0     0.0284          0.0347         


In [6]:
# Train final model with best C: refit on the whole training split using
# the C selected by cross-validation, then score once on the test split.
print(f"\n✓ Training final model with best C = {best_C}")

final_model = LogisticRegression(C=best_C, penalty='l2', solver='lbfgs',
                                 max_iter=1000, random_state=42)
final_model.fit(X_train, y_train)

test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred)
test_error = 1 - test_accuracy

print(f"\n{'='*50}")
print("BREAST CANCER - FINAL RESULTS")
print(f"{'='*50}")
print(f"Best C: {best_C}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Error: {test_error:.4f}")

# Save results. Re-running this cell must not add a second
# "Breast Cancer" row to the summary, so an existing entry for this
# dataset is overwritten in place; otherwise a new entry is appended.
breast_cancer_result = {
    'dataset': 'Breast Cancer',
    'best_C': best_C,
    'test_accuracy': test_accuracy,
    'test_error': test_error
}
for saved_pos, saved in enumerate(all_results):
    if saved['dataset'] == breast_cancer_result['dataset']:
        all_results[saved_pos] = breast_cancer_result
        break
else:
    all_results.append(breast_cancer_result)


✓ Training final model with best C = 0.1

BREAST CANCER - FINAL RESULTS
Best C: 0.1
Test Accuracy: 0.9416
Test Error: 0.0584


SONAR DATASET

In [8]:
# Read the sonar data set from its sparse text format
# (one sample per line: `<label> <index>:<value> <index>:<value> ...`)
# and expand it into a dense feature matrix `X` plus a label vector `y`.
# This rebinds X and y, replacing the arrays loaded for the previous dataset.
data_file = 'hw4_data/sonar_scale'
print(f"\nLoading: {data_file}")

y_list = []        # integer label of every non-blank line, in file order
X_list = []        # per-sample {0-based column: value} mapping
max_feature = 0    # largest 0-based column index encountered so far

with open(data_file, 'r') as f:
    for raw_line in f:
        tokens = raw_line.strip().split()
        if len(tokens) == 0:
            continue  # blank line: nothing to parse

        label_token, *feature_tokens = tokens

        # Labels may be written like "1.0"; parse as float, then truncate to int.
        y_list.append(int(float(label_token)))

        row = {}
        for token in feature_tokens:
            col_str, val_str = token.split(':')
            row[int(col_str) - 1] = float(val_str)  # file indices are shifted down by one
        if row:
            max_feature = max(max_feature, max(row))
        X_list.append(row)

# Densify: columns that a sample does not mention stay at 0.0.
n_samples = len(y_list)
n_features = max_feature + 1
X = np.zeros((n_samples, n_features))
for sample_pos, row in enumerate(X_list):
    for col, value in row.items():
        X[sample_pos, col] = value
y = np.array(y_list)


Loading: hw4_data/sonar_scale


In [9]:
# Apply the predefined train/test split: each file holds comma-separated
# integer row indices into X / y.
# NOTE(review): the indices are used directly as 0-based row positions —
# confirm the index files are 0-based.
train_idx = np.loadtxt('hw4_data/sonar_train_indices.txt', delimiter=',', dtype=int)
test_idx = np.loadtxt('hw4_data/sonar_test_indices.txt', delimiter=',', dtype=int)

X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]


In [10]:
# Cross-validation: choose C by mean 5-fold validation error on the
# training split (the test split is not touched here).
print("\n5-Fold Cross-Validation:")
print(f"{'C':<10} {'Train Error':<15} {'Val Error':<15}")
print("-" * 40)

# Evaluate every candidate first; the table is printed only after the
# winner is known so that exactly one row carries the BEST marker.
# (Marking rows while sweeping flags every running best, so the first row
# would always be marked and several rows could be marked at once.)
cv_rows = []  # (C, mean train error, mean validation error)

for C in C_values:
    # Seeded splitter, so each C is scored on the same folds.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_errors = []
    val_errors = []

    for train_fold_idx, val_fold_idx in kf.split(X_train):
        X_train_fold = X_train[train_fold_idx]
        y_train_fold = y_train[train_fold_idx]
        X_val_fold = X_train[val_fold_idx]
        y_val_fold = y_train[val_fold_idx]

        model = LogisticRegression(C=C, penalty='l2', solver='lbfgs',
                                   max_iter=1000, random_state=42)
        model.fit(X_train_fold, y_train_fold)

        # Error = 1 - accuracy, on both the fitting fold and the held-out fold.
        train_pred = model.predict(X_train_fold)
        val_pred = model.predict(X_val_fold)
        train_errors.append(1 - accuracy_score(y_train_fold, train_pred))
        val_errors.append(1 - accuracy_score(y_val_fold, val_pred))

    cv_rows.append((C, np.mean(train_errors), np.mean(val_errors)))

# Pick the winner. The comparison is strict, so on a tie the entry that
# appears earlier in C_values is kept.
best_C = None
best_val_error = float('inf')
best_row = None
for row_pos, (C, _, avg_val_error) in enumerate(cv_rows):
    if avg_val_error < best_val_error:
        best_val_error = avg_val_error
        best_C = C
        best_row = row_pos

# Report all candidates, flagging only the selected one.
for row_pos, (C, avg_train_error, avg_val_error) in enumerate(cv_rows):
    marker = " ← BEST" if row_pos == best_row else ""
    print(f"{C:<10.1f} {avg_train_error:<15.4f} {avg_val_error:<15.4f}{marker}")



5-Fold Cross-Validation:
C          Train Error     Val Error      
----------------------------------------
0.1        0.1611          0.2406          ← BEST
1.0        0.0874          0.2588         
10.0       0.0271          0.2592         
100.0      0.0000          0.2889         
1000.0     0.0000          0.2711         


In [11]:
# Train final model with best C: refit on the whole training split using
# the C selected by cross-validation, then score once on the test split.
print(f"\n✓ Training final model with best C = {best_C}")

final_model = LogisticRegression(C=best_C, penalty='l2', solver='lbfgs',
                                 max_iter=1000, random_state=42)
final_model.fit(X_train, y_train)

test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred)
test_error = 1 - test_accuracy

print(f"\n{'='*50}")
print("SONAR - FINAL RESULTS")
print(f"{'='*50}")
print(f"Best C: {best_C}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Error: {test_error:.4f}")

# Save results. Re-running this cell must not add a second "Sonar" row
# to the summary, so an existing entry for this dataset is overwritten
# in place; otherwise a new entry is appended.
sonar_result = {
    'dataset': 'Sonar',
    'best_C': best_C,
    'test_accuracy': test_accuracy,
    'test_error': test_error
}
for saved_pos, saved in enumerate(all_results):
    if saved['dataset'] == sonar_result['dataset']:
        all_results[saved_pos] = sonar_result
        break
else:
    all_results.append(sonar_result)


✓ Training final model with best C = 0.1

SONAR - FINAL RESULTS
Best C: 0.1
Test Accuracy: 0.8095
Test Error: 0.1905


FINAL SUMMARY

In [12]:
# Print one short report per entry in all_results, framed by banner lines.
banner = "=" * 70

print("\n\n" + banner)
print("FINAL SUMMARY - ALL DATASETS")
print(banner)

for entry in all_results:
    report_lines = [
        f"\n{entry['dataset']}",
        f"  Best C: {entry['best_C']}",
        f"  Test Accuracy: {entry['test_accuracy']:.4f}",
        f"  Test Error: {entry['test_error']:.4f}",
    ]
    print("\n".join(report_lines))

print("\n" + banner)
print("DONE!")
print(banner)



FINAL SUMMARY - ALL DATASETS

Breast Cancer
  Best C: 0.1
  Test Accuracy: 0.9416
  Test Error: 0.0584

Sonar
  Best C: 0.1
  Test Accuracy: 0.8095
  Test Error: 0.1905

DONE!
