# Models optimization and training Notebook (s) (60 points)
Must include complete training and optimization of:
- A Penalized (Ridge, Lasso or ElasticNet) linear model (Linear Regression or Logistic Regression).
- Support Vector Machine
- Ensemble model (e.g. Random Forest or Gradient Boosting)
- Neural network implemented in PyTorch

You may use one combined notebook or separate notebooks for each model.

REMEMBER: For the optimization and training stage, you must not use the test set you put aside in the prerequisite Final Project assignment. 

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('../Data/survey-lung-cancer.csv')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.to_csv('../Data/train_lung_cancer.csv', index=False)
test_df.to_csv('../Data/test_lung_cancer.csv', index=False)

## Preprocessing

StandardScaler of X_train, X_test + LabelEncoder of y_train, y_test

In [47]:
import pandas as pd
train_df = pd.read_csv('../Data/train_lung_cancer.csv')

In [48]:
features_df = train_df.drop(columns=['LUNG_CANCER'])
features = features_df.columns.tolist()
#print(features)
# Standardization: turn 1 into 0 and 2 into 1
train_df[features] = train_df[features].replace({1:0, 2:1})
train_df['GENDER']= train_df['GENDER'].replace({'M':0, 'F':1})
train_df['LUNG_CANCER']= train_df['LUNG_CANCER'].replace({'NO':0, 'YES':1})

  train_df['GENDER']= train_df['GENDER'].replace({'M':0, 'F':1})
  train_df['LUNG_CANCER']= train_df['LUNG_CANCER'].replace({'NO':0, 'YES':1})


In [49]:
# Split X_train and y_train
X_train = train_df.drop('LUNG_CANCER', axis=1)
y_train = train_df['LUNG_CANCER']

In [50]:
from sklearn.preprocessing import StandardScaler

# Initialize Scaler
scaler = StandardScaler()

# Scaling the features
X_train = scaler.fit_transform(X_train)

## Training and optimization of the 4 models mentioned above.

### A penalized linear model (Ridge, Lasso or ElasticNet) (Linear Regression or Logistic Regression).

Lasso with Logistic Regression

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# We use 'liblinear' solver because it handles L1 penalty (Lasso) well
# class_weight='balanced' helps with your class imbalance problem
lasso_model = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced', random_state=42)

# Grid Search to find the best regularization strength (C)
# C is the inverse of regularization strength; smaller C = stronger regularization
param_grid_lasso = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

grid_search_lasso = GridSearchCV(lasso_model, param_grid_lasso, cv=5, scoring='accuracy')
grid_search_lasso.fit(X_train, y_train)

print("Best Lasso C value:", grid_search_lasso.best_params_)
print("Best Lasso Score:", grid_search_lasso.best_score_)

Best Lasso C value: {'C': 1}
Best Lasso Score: 0.8949387755102041


### Support Vector Machine.

SVM do not need data preprocessing with StandardScaler.

In [52]:
# Support Vector Machine (SVM)
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Define and train the model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = svm_model.predict(X_train)

# Evaluate the model
print("Confusion Matrix (Train):")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report (Train):")
print(classification_report(y_train, y_train_pred))

Confusion Matrix (Train):
[[ 26  11]
 [  7 203]]

Classification Report (Train):
              precision    recall  f1-score   support

           0       0.79      0.70      0.74        37
           1       0.95      0.97      0.96       210

    accuracy                           0.93       247
   macro avg       0.87      0.83      0.85       247
weighted avg       0.92      0.93      0.93       247



### Ensemble model (e.g. Random Forest or Gradient Boosting).

In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the model
rf = RandomForestClassifier(random_state=42)
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_rf = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}


### Neural network implemented in PyTorch.

In [56]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# 1. Prepare Data for PyTorch
# Convert arrays to FloatTensors
X_tensor = torch.FloatTensor(X_train.values)
y_tensor = torch.FloatTensor(y_train.values).view(-1, 1) # Reshape to column vector

# Create Dataset and DataLoader
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# 2. Define the Neural Network
class LungCancerNet(nn.Module):
    def __init__(self, input_dim):
        super(LungCancerNet, self).__init__()
        # Simple architecture: Input -> Hidden (16 neurons) -> Output (1 neuron)
        self.layer1 = nn.Linear(input_dim, 16)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(16, 1) 
        # We will use BCEWithLogitsLoss, which includes Sigmoid, so no final activation here

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

# Initialize Model
input_dim = X_train.shape[1]
model = LungCancerNet(input_dim)

# 3. Define Loss and Optimizer
# pos_weight handles class imbalance (Ratio of Negatives / Positives)
# Based on your confusion matrix (~37 Neg / ~210 Pos), imbalance is actually low quantity of negatives.
# Actually, since you want to catch Cancer (1), usually we weight the positive class. 
# But here Cancer(1) is the MAJORITY. We need to weight the MINORITY (0) higher if we want to catch it.
# For simplicity, let's use standard loss first, but you can add pos_weight if needed.
criterion = nn.BCEWithLogitsLoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. Training Loop
epochs = 50
print("Starting Training...")

for epoch in range(epochs):
    epoch_loss = 0
    model.train()
    for batch_X, batch_y in dataloader:
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(batch_X)
        
        # Calculate loss
        loss = criterion(outputs, batch_y)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(dataloader):.4f}')

print("Training Complete.")

# 5. Evaluate on Training Data (Quick Check)
model.eval()
with torch.no_grad():
    y_pred_logits = model(X_tensor)
    y_pred_probs = torch.sigmoid(y_pred_logits)
    y_pred_cls = (y_pred_probs > 0.5).float()
    
    accuracy = (y_pred_cls.eq(y_tensor).sum() / float(y_tensor.shape[0])).item()
    print(f"\nNeural Network Training Accuracy: {accuracy:.4f}")

AttributeError: 'numpy.ndarray' object has no attribute 'values'